mirror of
https://github.com/Farama-Foundation/Gymnasium.git
synced 2025-08-01 22:11:25 +00:00
* Ditch most of the seeding.py and replace np_random with the numpy default_rng. Let's see if tests pass * Updated a bunch of RNG calls from the RandomState API to Generator API * black; didn't expect that, did ya? * Undo a typo * blaaack * More typo fixes * Fixed setting/getting state in multidiscrete spaces * Fix typo, fix a test to work with the new sampling * Correctly (?) pass the randomly generated seed if np_random is called with None as seed * Convert the Discrete sample to a python int (as opposed to np.int64) * Remove some redundant imports * First version of the compatibility layer for old-style RNG. Mainly to trigger tests. * Removed redundant f-strings * Style fixes, removing unused imports * Try to make tests pass by removing atari from the dockerfile * Try to make tests pass by removing atari from the setup * Try to make tests pass by removing atari from the setup * Try to make tests pass by removing atari from the setup * First attempt at deprecating `env.seed` and supporting `env.reset(seed=seed)` instead. Tests should hopefully pass but throw up a million warnings. * black; didn't expect that, didya? * Rename the reset parameter in VecEnvs back to `seed` * Updated tests to use the new seeding method * Removed a bunch of old `seed` calls. Fixed a bug in AsyncVectorEnv * Stop Discrete envs from doing part of the setup (and using the randomness) in init (as opposed to reset) * Add explicit seed to wrappers reset * Remove an accidental return * Re-add some legacy functions with a warning. * Use deprecation instead of regular warnings for the newly deprecated methods/functions
321 lines
10 KiB
Python
321 lines
10 KiB
Python
"""classic Acrobot task"""
|
|
from typing import Optional
|
|
|
|
import numpy as np
|
|
from numpy import sin, cos, pi
|
|
|
|
from gym import core, spaces
|
|
from gym.utils import seeding
|
|
|
|
__copyright__ = "Copyright 2013, RLPy http://acl.mit.edu/RLPy"
|
|
__credits__ = [
|
|
"Alborz Geramifard",
|
|
"Robert H. Klein",
|
|
"Christoph Dann",
|
|
"William Dabney",
|
|
"Jonathan P. How",
|
|
]
|
|
__license__ = "BSD 3-Clause"
|
|
__author__ = "Christoph Dann <cdann@cdann.de>"
|
|
|
|
# SOURCE:
|
|
# https://github.com/rlpy/rlpy/blob/master/rlpy/Domains/Acrobot.py
|
|
|
|
|
|
class AcrobotEnv(core.Env):
|
|
|
|
"""
|
|
Acrobot is a 2-link pendulum with only the second joint actuated.
|
|
Initially, both links point downwards. The goal is to swing the
|
|
end-effector at a height at least the length of one link above the base.
|
|
Both links can swing freely and can pass by each other, i.e., they don't
|
|
collide when they have the same angle.
|
|
**STATE:**
|
|
The state consists of the sin() and cos() of the two rotational joint
|
|
angles and the joint angular velocities :
|
|
[cos(theta1) sin(theta1) cos(theta2) sin(theta2) thetaDot1 thetaDot2].
|
|
For the first link, an angle of 0 corresponds to the link pointing downwards.
|
|
The angle of the second link is relative to the angle of the first link.
|
|
An angle of 0 corresponds to having the same angle between the two links.
|
|
A state of [1, 0, 1, 0, ..., ...] means that both links point downwards.
|
|
**ACTIONS:**
|
|
The action is either applying +1, 0 or -1 torque on the joint between
|
|
the two pendulum links.
|
|
.. note::
|
|
The dynamics equations were missing some terms in the NIPS paper which
|
|
are present in the book. R. Sutton confirmed in personal correspondence
|
|
that the experimental results shown in the paper and the book were
|
|
generated with the equations shown in the book.
|
|
However, there is the option to run the domain with the paper equations
|
|
by setting book_or_nips = 'nips'
|
|
**REFERENCE:**
|
|
.. seealso::
|
|
R. Sutton: Generalization in Reinforcement Learning:
|
|
Successful Examples Using Sparse Coarse Coding (NIPS 1996)
|
|
.. seealso::
|
|
R. Sutton and A. G. Barto:
|
|
Reinforcement learning: An introduction.
|
|
Cambridge: MIT press, 1998.
|
|
.. warning::
|
|
This version of the domain uses the Runge-Kutta method for integrating
|
|
the system dynamics and is more realistic, but also considerably harder
|
|
than the original version which employs Euler integration.
|
|
"""
|
|
|
|
metadata = {"render.modes": ["human", "rgb_array"], "video.frames_per_second": 15}
|
|
|
|
dt = 0.2
|
|
|
|
LINK_LENGTH_1 = 1.0 # [m]
|
|
LINK_LENGTH_2 = 1.0 # [m]
|
|
LINK_MASS_1 = 1.0 #: [kg] mass of link 1
|
|
LINK_MASS_2 = 1.0 #: [kg] mass of link 2
|
|
LINK_COM_POS_1 = 0.5 #: [m] position of the center of mass of link 1
|
|
LINK_COM_POS_2 = 0.5 #: [m] position of the center of mass of link 2
|
|
LINK_MOI = 1.0 #: moments of inertia for both links
|
|
|
|
MAX_VEL_1 = 4 * pi
|
|
MAX_VEL_2 = 9 * pi
|
|
|
|
AVAIL_TORQUE = [-1.0, 0.0, +1]
|
|
|
|
torque_noise_max = 0.0
|
|
|
|
#: use dynamics equations from the nips paper or the book
|
|
book_or_nips = "book"
|
|
action_arrow = None
|
|
domain_fig = None
|
|
actions_num = 3
|
|
|
|
def __init__(self):
|
|
self.viewer = None
|
|
high = np.array(
|
|
[1.0, 1.0, 1.0, 1.0, self.MAX_VEL_1, self.MAX_VEL_2], dtype=np.float32
|
|
)
|
|
low = -high
|
|
self.observation_space = spaces.Box(low=low, high=high, dtype=np.float32)
|
|
self.action_space = spaces.Discrete(3)
|
|
self.state = None
|
|
|
|
def reset(self, seed: Optional[int] = None):
|
|
super().reset(seed=seed)
|
|
self.state = self.np_random.uniform(low=-0.1, high=0.1, size=(4,)).astype(
|
|
np.float32
|
|
)
|
|
return self._get_ob()
|
|
|
|
def step(self, a):
|
|
s = self.state
|
|
torque = self.AVAIL_TORQUE[a]
|
|
|
|
# Add noise to the force action
|
|
if self.torque_noise_max > 0:
|
|
torque += self.np_random.uniform(
|
|
-self.torque_noise_max, self.torque_noise_max
|
|
)
|
|
|
|
# Now, augment the state with our force action so it can be passed to
|
|
# _dsdt
|
|
s_augmented = np.append(s, torque)
|
|
|
|
ns = rk4(self._dsdt, s_augmented, [0, self.dt])
|
|
|
|
ns[0] = wrap(ns[0], -pi, pi)
|
|
ns[1] = wrap(ns[1], -pi, pi)
|
|
ns[2] = bound(ns[2], -self.MAX_VEL_1, self.MAX_VEL_1)
|
|
ns[3] = bound(ns[3], -self.MAX_VEL_2, self.MAX_VEL_2)
|
|
self.state = ns
|
|
terminal = self._terminal()
|
|
reward = -1.0 if not terminal else 0.0
|
|
return (self._get_ob(), reward, terminal, {})
|
|
|
|
def _get_ob(self):
|
|
s = self.state
|
|
return np.array(
|
|
[cos(s[0]), sin(s[0]), cos(s[1]), sin(s[1]), s[2], s[3]], dtype=np.float32
|
|
)
|
|
|
|
def _terminal(self):
|
|
s = self.state
|
|
return bool(-cos(s[0]) - cos(s[1] + s[0]) > 1.0)
|
|
|
|
def _dsdt(self, s_augmented):
|
|
m1 = self.LINK_MASS_1
|
|
m2 = self.LINK_MASS_2
|
|
l1 = self.LINK_LENGTH_1
|
|
lc1 = self.LINK_COM_POS_1
|
|
lc2 = self.LINK_COM_POS_2
|
|
I1 = self.LINK_MOI
|
|
I2 = self.LINK_MOI
|
|
g = 9.8
|
|
a = s_augmented[-1]
|
|
s = s_augmented[:-1]
|
|
theta1 = s[0]
|
|
theta2 = s[1]
|
|
dtheta1 = s[2]
|
|
dtheta2 = s[3]
|
|
d1 = (
|
|
m1 * lc1 ** 2
|
|
+ m2 * (l1 ** 2 + lc2 ** 2 + 2 * l1 * lc2 * cos(theta2))
|
|
+ I1
|
|
+ I2
|
|
)
|
|
d2 = m2 * (lc2 ** 2 + l1 * lc2 * cos(theta2)) + I2
|
|
phi2 = m2 * lc2 * g * cos(theta1 + theta2 - pi / 2.0)
|
|
phi1 = (
|
|
-m2 * l1 * lc2 * dtheta2 ** 2 * sin(theta2)
|
|
- 2 * m2 * l1 * lc2 * dtheta2 * dtheta1 * sin(theta2)
|
|
+ (m1 * lc1 + m2 * l1) * g * cos(theta1 - pi / 2)
|
|
+ phi2
|
|
)
|
|
if self.book_or_nips == "nips":
|
|
# the following line is consistent with the description in the
|
|
# paper
|
|
ddtheta2 = (a + d2 / d1 * phi1 - phi2) / (m2 * lc2 ** 2 + I2 - d2 ** 2 / d1)
|
|
else:
|
|
# the following line is consistent with the java implementation and the
|
|
# book
|
|
ddtheta2 = (
|
|
a + d2 / d1 * phi1 - m2 * l1 * lc2 * dtheta1 ** 2 * sin(theta2) - phi2
|
|
) / (m2 * lc2 ** 2 + I2 - d2 ** 2 / d1)
|
|
ddtheta1 = -(d2 * ddtheta2 + phi1) / d1
|
|
return (dtheta1, dtheta2, ddtheta1, ddtheta2, 0.0)
|
|
|
|
def render(self, mode="human"):
|
|
from gym.envs.classic_control import rendering
|
|
|
|
s = self.state
|
|
|
|
if self.viewer is None:
|
|
self.viewer = rendering.Viewer(500, 500)
|
|
bound = self.LINK_LENGTH_1 + self.LINK_LENGTH_2 + 0.2 # 2.2 for default
|
|
self.viewer.set_bounds(-bound, bound, -bound, bound)
|
|
|
|
if s is None:
|
|
return None
|
|
|
|
p1 = [-self.LINK_LENGTH_1 * cos(s[0]), self.LINK_LENGTH_1 * sin(s[0])]
|
|
|
|
p2 = [
|
|
p1[0] - self.LINK_LENGTH_2 * cos(s[0] + s[1]),
|
|
p1[1] + self.LINK_LENGTH_2 * sin(s[0] + s[1]),
|
|
]
|
|
|
|
xys = np.array([[0, 0], p1, p2])[:, ::-1]
|
|
thetas = [s[0] - pi / 2, s[0] + s[1] - pi / 2]
|
|
link_lengths = [self.LINK_LENGTH_1, self.LINK_LENGTH_2]
|
|
|
|
self.viewer.draw_line((-2.2, 1), (2.2, 1))
|
|
for ((x, y), th, llen) in zip(xys, thetas, link_lengths):
|
|
l, r, t, b = 0, llen, 0.1, -0.1
|
|
jtransform = rendering.Transform(rotation=th, translation=(x, y))
|
|
link = self.viewer.draw_polygon([(l, b), (l, t), (r, t), (r, b)])
|
|
link.add_attr(jtransform)
|
|
link.set_color(0, 0.8, 0.8)
|
|
circ = self.viewer.draw_circle(0.1)
|
|
circ.set_color(0.8, 0.8, 0)
|
|
circ.add_attr(jtransform)
|
|
|
|
return self.viewer.render(return_rgb_array=mode == "rgb_array")
|
|
|
|
def close(self):
|
|
if self.viewer:
|
|
self.viewer.close()
|
|
self.viewer = None
|
|
|
|
|
|
def wrap(x, m, M):
|
|
"""Wraps ``x`` so m <= x <= M; but unlike ``bound()`` which
|
|
truncates, ``wrap()`` wraps x around the coordinate system defined by m,M.\n
|
|
For example, m = -180, M = 180 (degrees), x = 360 --> returns 0.
|
|
|
|
Args:
|
|
x: a scalar
|
|
m: minimum possible value in range
|
|
M: maximum possible value in range
|
|
|
|
Returns:
|
|
x: a scalar, wrapped
|
|
"""
|
|
diff = M - m
|
|
while x > M:
|
|
x = x - diff
|
|
while x < m:
|
|
x = x + diff
|
|
return x
|
|
|
|
|
|
def bound(x, m, M=None):
|
|
"""Either have m as scalar, so bound(x,m,M) which returns m <= x <= M *OR*
|
|
have m as length 2 vector, bound(x,m, <IGNORED>) returns m[0] <= x <= m[1].
|
|
|
|
Args:
|
|
x: scalar
|
|
|
|
Returns:
|
|
x: scalar, bound between min (m) and Max (M)
|
|
"""
|
|
if M is None:
|
|
M = m[1]
|
|
m = m[0]
|
|
# bound x between min (m) and Max (M)
|
|
return min(max(x, m), M)
|
|
|
|
|
|
def rk4(derivs, y0, t):
|
|
"""
|
|
Integrate 1D or ND system of ODEs using 4-th order Runge-Kutta.
|
|
This is a toy implementation which may be useful if you find
|
|
yourself stranded on a system w/o scipy. Otherwise use
|
|
:func:`scipy.integrate`.
|
|
|
|
Args:
|
|
derivs: the derivative of the system and has the signature ``dy = derivs(yi)``
|
|
y0: initial state vector
|
|
t: sample times
|
|
args: additional arguments passed to the derivative function
|
|
kwargs: additional keyword arguments passed to the derivative function
|
|
|
|
Example 1 ::
|
|
## 2D system
|
|
def derivs(x):
|
|
d1 = x[0] + 2*x[1]
|
|
d2 = -3*x[0] + 4*x[1]
|
|
return (d1, d2)
|
|
dt = 0.0005
|
|
t = arange(0.0, 2.0, dt)
|
|
y0 = (1,2)
|
|
yout = rk4(derivs6, y0, t)
|
|
|
|
If you have access to scipy, you should probably be using the
|
|
scipy.integrate tools rather than this function.
|
|
This would then require re-adding the time variable to the signature of derivs.
|
|
|
|
Returns:
|
|
yout: Runge-Kutta approximation of the ODE
|
|
"""
|
|
|
|
try:
|
|
Ny = len(y0)
|
|
except TypeError:
|
|
yout = np.zeros((len(t),), np.float_)
|
|
else:
|
|
yout = np.zeros((len(t), Ny), np.float_)
|
|
|
|
yout[0] = y0
|
|
|
|
for i in np.arange(len(t) - 1):
|
|
|
|
thist = t[i]
|
|
dt = t[i + 1] - thist
|
|
dt2 = dt / 2.0
|
|
y0 = yout[i]
|
|
|
|
k1 = np.asarray(derivs(y0))
|
|
k2 = np.asarray(derivs(y0 + dt2 * k1))
|
|
k3 = np.asarray(derivs(y0 + dt2 * k2))
|
|
k4 = np.asarray(derivs(y0 + dt * k3))
|
|
yout[i + 1] = y0 + dt / 6.0 * (k1 + 2 * k2 + 2 * k3 + k4)
|
|
# We only care about the final timestep and we cleave off action value which will be zero
|
|
return yout[-1][:4]
|