mirror of
https://github.com/Farama-Foundation/Gymnasium.git
synced 2025-07-31 13:54:31 +00:00
Add new MuJoCo bindings (#2762)
This commit is contained in:
committed by
GitHub
parent
8f9b62f6a6
commit
3e006f3ea5
1
.github/workflows/build.yml
vendored
1
.github/workflows/build.yml
vendored
@@ -11,7 +11,6 @@ jobs:
|
|||||||
- uses: actions/checkout@v2
|
- uses: actions/checkout@v2
|
||||||
- run: |
|
- run: |
|
||||||
docker build -f py.Dockerfile \
|
docker build -f py.Dockerfile \
|
||||||
--build-arg MUJOCO_KEY=$MUJOCO_KEY \
|
|
||||||
--build-arg PYTHON_VERSION=${{ matrix.python-version }} \
|
--build-arg PYTHON_VERSION=${{ matrix.python-version }} \
|
||||||
--tag gym-docker .
|
--tag gym-docker .
|
||||||
- name: Run tests
|
- name: Run tests
|
||||||
|
@@ -9,7 +9,7 @@ repos:
|
|||||||
hooks:
|
hooks:
|
||||||
- id: codespell
|
- id: codespell
|
||||||
args:
|
args:
|
||||||
- --ignore-words-list=nd,reacher,thist,ths
|
- --ignore-words-list=nd,reacher,thist,ths, ure, referenc
|
||||||
- repo: https://gitlab.com/PyCQA/flake8
|
- repo: https://gitlab.com/PyCQA/flake8
|
||||||
rev: 4.0.1
|
rev: 4.0.1
|
||||||
hooks:
|
hooks:
|
||||||
|
@@ -46,6 +46,11 @@ env.close()
|
|||||||
|
|
||||||
Gym keeps strict versioning for reproducibility reasons. All environments end in a suffix like "\_v0". When changes are made to environments that might impact learning results, the number is increased by one to prevent potential confusion.
|
Gym keeps strict versioning for reproducibility reasons. All environments end in a suffix like "\_v0". When changes are made to environments that might impact learning results, the number is increased by one to prevent potential confusion.
|
||||||
|
|
||||||
|
## MuJoCo Environments
|
||||||
|
|
||||||
|
The latest "\_v4" and future versions of the MuJoCo environments will no longer depend on `mujoco-py`. Instead `mujoco` will be the required dependency for future gym MuJoCo environment versions. Old gym MuJoCo environment versions that depend on `mujoco-py` will still be kept but unmaintained.
|
||||||
|
To install the dependencies for the latest gym MuJoCo environments use `pip install gym[mujoco]`. Dependencies for old MuJoCo environments can still be installed by `pip install gym[mujoco_py]`.
|
||||||
|
|
||||||
## Citation
|
## Citation
|
||||||
|
|
||||||
A whitepaper from when Gym just came out is available https://arxiv.org/pdf/1606.01540, and can be cited with the following bibtex entry:
|
A whitepaper from when Gym just came out is available https://arxiv.org/pdf/1606.01540, and can be cited with the following bibtex entry:
|
||||||
|
@@ -162,6 +162,13 @@ register(
|
|||||||
reward_threshold=-3.75,
|
reward_threshold=-3.75,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
register(
|
||||||
|
id="Reacher-v4",
|
||||||
|
entry_point="gym.envs.mujoco.reacher_v4:ReacherEnv",
|
||||||
|
max_episode_steps=50,
|
||||||
|
reward_threshold=-3.75,
|
||||||
|
)
|
||||||
|
|
||||||
register(
|
register(
|
||||||
id="Pusher-v2",
|
id="Pusher-v2",
|
||||||
entry_point="gym.envs.mujoco:PusherEnv",
|
entry_point="gym.envs.mujoco:PusherEnv",
|
||||||
@@ -169,6 +176,13 @@ register(
|
|||||||
reward_threshold=0.0,
|
reward_threshold=0.0,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
register(
|
||||||
|
id="Pusher-v4",
|
||||||
|
entry_point="gym.envs.mujoco.pusher_v4:PusherEnv",
|
||||||
|
max_episode_steps=100,
|
||||||
|
reward_threshold=0.0,
|
||||||
|
)
|
||||||
|
|
||||||
register(
|
register(
|
||||||
id="InvertedPendulum-v2",
|
id="InvertedPendulum-v2",
|
||||||
entry_point="gym.envs.mujoco:InvertedPendulumEnv",
|
entry_point="gym.envs.mujoco:InvertedPendulumEnv",
|
||||||
@@ -176,6 +190,13 @@ register(
|
|||||||
reward_threshold=950.0,
|
reward_threshold=950.0,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
register(
|
||||||
|
id="InvertedPendulum-v4",
|
||||||
|
entry_point="gym.envs.mujoco.inverted_pendulum_v4:InvertedPendulumEnv",
|
||||||
|
max_episode_steps=1000,
|
||||||
|
reward_threshold=950.0,
|
||||||
|
)
|
||||||
|
|
||||||
register(
|
register(
|
||||||
id="InvertedDoublePendulum-v2",
|
id="InvertedDoublePendulum-v2",
|
||||||
entry_point="gym.envs.mujoco:InvertedDoublePendulumEnv",
|
entry_point="gym.envs.mujoco:InvertedDoublePendulumEnv",
|
||||||
@@ -183,6 +204,13 @@ register(
|
|||||||
reward_threshold=9100.0,
|
reward_threshold=9100.0,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
register(
|
||||||
|
id="InvertedDoublePendulum-v4",
|
||||||
|
entry_point="gym.envs.mujoco.inverted_double_pendulum_v4:InvertedDoublePendulumEnv",
|
||||||
|
max_episode_steps=1000,
|
||||||
|
reward_threshold=9100.0,
|
||||||
|
)
|
||||||
|
|
||||||
register(
|
register(
|
||||||
id="HalfCheetah-v2",
|
id="HalfCheetah-v2",
|
||||||
entry_point="gym.envs.mujoco:HalfCheetahEnv",
|
entry_point="gym.envs.mujoco:HalfCheetahEnv",
|
||||||
@@ -197,6 +225,13 @@ register(
|
|||||||
reward_threshold=4800.0,
|
reward_threshold=4800.0,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
register(
|
||||||
|
id="HalfCheetah-v4",
|
||||||
|
entry_point="gym.envs.mujoco.half_cheetah_v4:HalfCheetahEnv",
|
||||||
|
max_episode_steps=1000,
|
||||||
|
reward_threshold=4800.0,
|
||||||
|
)
|
||||||
|
|
||||||
register(
|
register(
|
||||||
id="Hopper-v2",
|
id="Hopper-v2",
|
||||||
entry_point="gym.envs.mujoco:HopperEnv",
|
entry_point="gym.envs.mujoco:HopperEnv",
|
||||||
@@ -211,6 +246,13 @@ register(
|
|||||||
reward_threshold=3800.0,
|
reward_threshold=3800.0,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
register(
|
||||||
|
id="Hopper-v4",
|
||||||
|
entry_point="gym.envs.mujoco.hopper_v4:HopperEnv",
|
||||||
|
max_episode_steps=1000,
|
||||||
|
reward_threshold=3800.0,
|
||||||
|
)
|
||||||
|
|
||||||
register(
|
register(
|
||||||
id="Swimmer-v2",
|
id="Swimmer-v2",
|
||||||
entry_point="gym.envs.mujoco:SwimmerEnv",
|
entry_point="gym.envs.mujoco:SwimmerEnv",
|
||||||
@@ -225,6 +267,13 @@ register(
|
|||||||
reward_threshold=360.0,
|
reward_threshold=360.0,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
register(
|
||||||
|
id="Swimmer-v4",
|
||||||
|
entry_point="gym.envs.mujoco.swimmer_v4:SwimmerEnv",
|
||||||
|
max_episode_steps=1000,
|
||||||
|
reward_threshold=360.0,
|
||||||
|
)
|
||||||
|
|
||||||
register(
|
register(
|
||||||
id="Walker2d-v2",
|
id="Walker2d-v2",
|
||||||
max_episode_steps=1000,
|
max_episode_steps=1000,
|
||||||
@@ -237,6 +286,12 @@ register(
|
|||||||
entry_point="gym.envs.mujoco.walker2d_v3:Walker2dEnv",
|
entry_point="gym.envs.mujoco.walker2d_v3:Walker2dEnv",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
register(
|
||||||
|
id="Walker2d-v4",
|
||||||
|
max_episode_steps=1000,
|
||||||
|
entry_point="gym.envs.mujoco.walker2d_v4:Walker2dEnv",
|
||||||
|
)
|
||||||
|
|
||||||
register(
|
register(
|
||||||
id="Ant-v2",
|
id="Ant-v2",
|
||||||
entry_point="gym.envs.mujoco:AntEnv",
|
entry_point="gym.envs.mujoco:AntEnv",
|
||||||
@@ -251,6 +306,13 @@ register(
|
|||||||
reward_threshold=6000.0,
|
reward_threshold=6000.0,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
register(
|
||||||
|
id="Ant-v4",
|
||||||
|
entry_point="gym.envs.mujoco.ant_v4:AntEnv",
|
||||||
|
max_episode_steps=1000,
|
||||||
|
reward_threshold=6000.0,
|
||||||
|
)
|
||||||
|
|
||||||
register(
|
register(
|
||||||
id="Humanoid-v2",
|
id="Humanoid-v2",
|
||||||
entry_point="gym.envs.mujoco:HumanoidEnv",
|
entry_point="gym.envs.mujoco:HumanoidEnv",
|
||||||
@@ -263,8 +325,20 @@ register(
|
|||||||
max_episode_steps=1000,
|
max_episode_steps=1000,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
register(
|
||||||
|
id="Humanoid-v4",
|
||||||
|
entry_point="gym.envs.mujoco.humanoid_v4:HumanoidEnv",
|
||||||
|
max_episode_steps=1000,
|
||||||
|
)
|
||||||
|
|
||||||
register(
|
register(
|
||||||
id="HumanoidStandup-v2",
|
id="HumanoidStandup-v2",
|
||||||
entry_point="gym.envs.mujoco:HumanoidStandupEnv",
|
entry_point="gym.envs.mujoco:HumanoidStandupEnv",
|
||||||
max_episode_steps=1000,
|
max_episode_steps=1000,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
register(
|
||||||
|
id="HumanoidStandup-v4",
|
||||||
|
entry_point="gym.envs.mujoco.humanoidstandup_v4:HumanoidStandupEnv",
|
||||||
|
max_episode_steps=1000,
|
||||||
|
)
|
||||||
|
@@ -9,6 +9,7 @@ from gym.envs.mujoco.humanoid import HumanoidEnv
|
|||||||
from gym.envs.mujoco.humanoidstandup import HumanoidStandupEnv
|
from gym.envs.mujoco.humanoidstandup import HumanoidStandupEnv
|
||||||
from gym.envs.mujoco.inverted_double_pendulum import InvertedDoublePendulumEnv
|
from gym.envs.mujoco.inverted_double_pendulum import InvertedDoublePendulumEnv
|
||||||
from gym.envs.mujoco.inverted_pendulum import InvertedPendulumEnv
|
from gym.envs.mujoco.inverted_pendulum import InvertedPendulumEnv
|
||||||
|
from gym.envs.mujoco.mujoco_rendering import RenderContextOffscreen, Viewer
|
||||||
from gym.envs.mujoco.pusher import PusherEnv
|
from gym.envs.mujoco.pusher import PusherEnv
|
||||||
from gym.envs.mujoco.reacher import ReacherEnv
|
from gym.envs.mujoco.reacher import ReacherEnv
|
||||||
from gym.envs.mujoco.swimmer import SwimmerEnv
|
from gym.envs.mujoco.swimmer import SwimmerEnv
|
||||||
|
@@ -6,7 +6,7 @@ from gym.envs.mujoco import mujoco_env
|
|||||||
|
|
||||||
class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
mujoco_env.MujocoEnv.__init__(self, "ant.xml", 5)
|
mujoco_env.MujocoEnv.__init__(self, "ant.xml", 5, mujoco_bindings="mujoco_py")
|
||||||
utils.EzPickle.__init__(self)
|
utils.EzPickle.__init__(self)
|
||||||
|
|
||||||
def step(self, a):
|
def step(self, a):
|
||||||
|
@@ -9,167 +9,6 @@ DEFAULT_CAMERA_CONFIG = {
|
|||||||
|
|
||||||
|
|
||||||
class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
"""
|
|
||||||
### Description
|
|
||||||
|
|
||||||
This environment is based on the environment introduced by Schulman,
|
|
||||||
Moritz, Levine, Jordan and Abbeel in ["High-Dimensional Continuous Control
|
|
||||||
Using Generalized Advantage Estimation"](https://arxiv.org/abs/1506.02438).
|
|
||||||
The ant is a 3D robot consisting of one torso (free rotational body) with
|
|
||||||
four legs attached to it with each leg having two links. The goal is to
|
|
||||||
coordinate the four legs to move in the forward (right) direction by applying
|
|
||||||
torques on the eight hinges connecting the two links of each leg and the torso
|
|
||||||
(nine parts and eight hinges).
|
|
||||||
|
|
||||||
### Action Space
|
|
||||||
The action space is a `Box(-1, 1, (8,), float32)`. An action represents the torques applied at the hinge joints.
|
|
||||||
|
|
||||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
|
||||||
|-----|----------------------|---------------|----------------|---------------------------------------|-------|------|
|
|
||||||
| 0 | Torque applied on the rotor between the torso and front left hip | -1 | 1 | hip_1 (front_left_leg) | hinge | torque (N m) |
|
|
||||||
| 1 | Torque applied on the rotor between the front left two links | -1 | 1 | angle_1 (front_left_leg) | hinge | torque (N m) |
|
|
||||||
| 2 | Torque applied on the rotor between the torso and front right hip | -1 | 1 | hip_2 (front_right_leg) | hinge | torque (N m) |
|
|
||||||
| 3 | Torque applied on the rotor between the front right two links | -1 | 1 | angle_2 (front_right_leg) | hinge | torque (N m) |
|
|
||||||
| 4 | Torque applied on the rotor between the torso and back left hip | -1 | 1 | hip_3 (back_leg) | hinge | torque (N m) |
|
|
||||||
| 5 | Torque applied on the rotor between the back left two links | -1 | 1 | angle_3 (back_leg) | hinge | torque (N m) |
|
|
||||||
| 6 | Torque applied on the rotor between the torso and back right hip | -1 | 1 | hip_4 (right_back_leg) | hinge | torque (N m) |
|
|
||||||
| 7 | Torque applied on the rotor between the back right two links | -1 | 1 | angle_4 (right_back_leg) | hinge | torque (N m) |
|
|
||||||
|
|
||||||
### Observation Space
|
|
||||||
|
|
||||||
Observations consist of positional values of different body parts of the ant,
|
|
||||||
followed by the velocities of those individual parts (their derivatives) with all
|
|
||||||
the positions ordered before all the velocities.
|
|
||||||
|
|
||||||
By default, observations do not include the x- and y-coordinates of the ant's torso. These may
|
|
||||||
be included by passing `exclude_current_positions_from_observation=False` during construction.
|
|
||||||
In that case, the observation space will have 113 dimensions where the first two dimensions
|
|
||||||
represent the x- and y- coordinates of the ant's torso.
|
|
||||||
Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates
|
|
||||||
of the torso will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively.
|
|
||||||
|
|
||||||
However, by default, an observation is a `ndarray` with shape `(111,)`
|
|
||||||
where the elements correspond to the following:
|
|
||||||
|
|
||||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|
|
||||||
|-----|-------------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
|
|
||||||
| 0 | z-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) |
|
|
||||||
| 1 | x-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
|
|
||||||
| 2 | y-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
|
|
||||||
| 3 | z-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
|
|
||||||
| 4 | w-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
|
|
||||||
| 5 | angle between torso and first link on front left | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) |
|
|
||||||
| 6 | angle between the two links on the front left | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) |
|
|
||||||
| 7 | angle between torso and first link on front right | -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) |
|
|
||||||
| 8 | angle between the two links on the front right | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) |
|
|
||||||
| 9 | angle between torso and first link on back left | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) |
|
|
||||||
| 10 | angle between the two links on the back left | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) |
|
|
||||||
| 11 | angle between torso and first link on back right | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) |
|
|
||||||
| 12 | angle between the two links on the back right | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) |
|
|
||||||
| 13 | x-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) |
|
|
||||||
| 14 | y-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) |
|
|
||||||
| 15 | z-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) |
|
|
||||||
| 16 | x-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) |
|
|
||||||
| 17 | y-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) |
|
|
||||||
| 18 | z-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) |
|
|
||||||
| 19 | angular velocity of angle between torso and front left link | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) |
|
|
||||||
| 20 | angular velocity of the angle between front left links | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) |
|
|
||||||
| 21 | angular velocity of angle between torso and front right link| -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) |
|
|
||||||
| 22 | angular velocity of the angle between front right links | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) |
|
|
||||||
| 23 | angular velocity of angle between torso and back left link | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) |
|
|
||||||
| 24 | angular velocity of the angle between back left links | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) |
|
|
||||||
| 25 | angular velocity of angle between torso and back right link | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) |
|
|
||||||
| 26 |angular velocity of the angle between back right links | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) |
|
|
||||||
|
|
||||||
|
|
||||||
The remaining 14*6 = 84 elements of the observation are contact forces
|
|
||||||
(external forces - force x, y, z and torque x, y, z) applied to the
|
|
||||||
center of mass of each of the links. The 14 links are: the ground link,
|
|
||||||
the torso link, and 3 links for each leg (1 + 1 + 12) with the 6 external forces.
|
|
||||||
|
|
||||||
The (x,y,z) coordinates are translational DOFs while the orientations are rotational
|
|
||||||
DOFs expressed as quaternions. One can read more about free joints on the [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html).
|
|
||||||
|
|
||||||
|
|
||||||
**Note:** There have been reported issues that using a Mujoco-Py version > 2.0 results
|
|
||||||
in the contact forces always being 0. As such we recommend to use a Mujoco-Py version < 2.0
|
|
||||||
when using the Ant environment if you would like to report results with contact forces (if
|
|
||||||
contact forces are not used in your experiments, you can use version > 2.0).
|
|
||||||
|
|
||||||
### Rewards
|
|
||||||
The reward consists of three parts:
|
|
||||||
- *healthy_reward*: Every timestep that the ant is healthy (see definition in section "Episode Termination"), it gets a reward of fixed value `healthy_reward`
|
|
||||||
- *forward_reward*: A reward of moving forward which is measured as
|
|
||||||
*(x-coordinate before action - x-coordinate after action)/dt*. *dt* is the time
|
|
||||||
between actions and is dependent on the `frame_skip` parameter (default is 5),
|
|
||||||
where the frametime is 0.01 - making the default *dt = 5 * 0.01 = 0.05*.
|
|
||||||
This reward would be positive if the ant moves forward (in positive x direction).
|
|
||||||
- *ctrl_cost*: A negative reward for penalising the ant if it takes actions
|
|
||||||
that are too large. It is measured as *`ctrl_cost_weight` * sum(action<sup>2</sup>)*
|
|
||||||
where *`ctr_cost_weight`* is a parameter set for the control and has a default value of 0.5.
|
|
||||||
- *contact_cost*: A negative reward for penalising the ant if the external contact
|
|
||||||
force is too large. It is calculated *`contact_cost_weight` * sum(clip(external contact
|
|
||||||
force to `contact_force_range`)<sup>2</sup>)*.
|
|
||||||
|
|
||||||
The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost - contact_cost* and `info` will also contain the individual reward terms.
|
|
||||||
|
|
||||||
### Starting State
|
|
||||||
All observations start in state
|
|
||||||
(0.0, 0.0, 0.75, 1.0, 0.0 ... 0.0) with a uniform noise in the range
|
|
||||||
of [-`reset_noise_scale`, `reset_noise_scale`] added to the positional values and standard normal noise
|
|
||||||
with mean 0 and standard deviation `reset_noise_scale` added to the velocity values for
|
|
||||||
stochasticity. Note that the initial z coordinate is intentionally selected
|
|
||||||
to be slightly high, thereby indicating a standing up ant. The initial orientation
|
|
||||||
is designed to make it face forward as well.
|
|
||||||
|
|
||||||
### Episode Termination
|
|
||||||
The ant is said to be unhealthy if any of the following happens:
|
|
||||||
|
|
||||||
1. Any of the state space values is no longer finite
|
|
||||||
2. The z-coordinate of the torso is **not** in the closed interval given by `healthy_z_range` (defaults to [0.2, 1.0])
|
|
||||||
|
|
||||||
If `terminate_when_unhealthy=True` is passed during construction (which is the default),
|
|
||||||
the episode terminates when any of the following happens:
|
|
||||||
|
|
||||||
1. The episode duration reaches a 1000 timesteps
|
|
||||||
2. The ant is unhealthy
|
|
||||||
|
|
||||||
If `terminate_when_unhealthy=False` is passed, the episode is terminated only when 1000 timesteps are exceeded.
|
|
||||||
|
|
||||||
### Arguments
|
|
||||||
|
|
||||||
No additional arguments are currently supported in v2 and lower.
|
|
||||||
|
|
||||||
```
|
|
||||||
env = gym.make('Ant-v2')
|
|
||||||
```
|
|
||||||
|
|
||||||
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
|
||||||
|
|
||||||
```
|
|
||||||
env = gym.make('Ant-v3', ctrl_cost_weight=0.1, ...)
|
|
||||||
```
|
|
||||||
|
|
||||||
| Parameter | Type | Default |Description |
|
|
||||||
|-------------------------|------------|--------------|-------------------------------|
|
|
||||||
| `xml_file` | **str** | `"ant.xml"` | Path to a MuJoCo model |
|
|
||||||
| `ctrl_cost_weight` | **float** | `0.5` | Weight for *ctrl_cost* term (see section on reward) |
|
|
||||||
| `contact_cost_weight` | **float** | `5e-4` | Weight for *contact_cost* term (see section on reward) |
|
|
||||||
| `healthy_reward` | **float** | `1` | Constant reward given if the ant is "healthy" after timestep |
|
|
||||||
| `terminate_when_unhealthy` | **bool**| `True` | If true, issue a done signal if the z-coordinate of the torso is no longer in the `healthy_z_range` |
|
|
||||||
| `healthy_z_range` | **tuple** | `(0.2, 1)` | The ant is considered healthy if the z-coordinate of the torso is in this range |
|
|
||||||
| `contact_force_range` | **tuple** | `(-1, 1)` | Contact forces are clipped to this range in the computation of *contact_cost* |
|
|
||||||
| `reset_noise_scale` | **float** | `0.1` | Scale of random perturbations of initial position and velocity (see section on Starting State) |
|
|
||||||
| `exclude_current_positions_from_observation`| **bool** | `True`| Whether or not to omit the x- and y-coordinates from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies |
|
|
||||||
|
|
||||||
### Version History
|
|
||||||
|
|
||||||
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
|
||||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
|
||||||
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
|
||||||
* v0: Initial versions release (1.0.0)
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
xml_file="ant.xml",
|
xml_file="ant.xml",
|
||||||
@@ -199,7 +38,7 @@ class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
|||||||
exclude_current_positions_from_observation
|
exclude_current_positions_from_observation
|
||||||
)
|
)
|
||||||
|
|
||||||
mujoco_env.MujocoEnv.__init__(self, xml_file, 5)
|
mujoco_env.MujocoEnv.__init__(self, xml_file, 5, mujoco_bindings="mujoco_py")
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def healthy_reward(self):
|
def healthy_reward(self):
|
||||||
|
308
gym/envs/mujoco/ant_v4.py
Normal file
308
gym/envs/mujoco/ant_v4.py
Normal file
@@ -0,0 +1,308 @@
|
|||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from gym import utils
|
||||||
|
from gym.envs.mujoco import mujoco_env
|
||||||
|
|
||||||
|
DEFAULT_CAMERA_CONFIG = {
|
||||||
|
"distance": 4.0,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
|
"""
|
||||||
|
### Description
|
||||||
|
|
||||||
|
This environment is based on the environment introduced by Schulman,
|
||||||
|
Moritz, Levine, Jordan and Abbeel in ["High-Dimensional Continuous Control
|
||||||
|
Using Generalized Advantage Estimation"](https://arxiv.org/abs/1506.02438).
|
||||||
|
The ant is a 3D robot consisting of one torso (free rotational body) with
|
||||||
|
four legs attached to it with each leg having two links. The goal is to
|
||||||
|
coordinate the four legs to move in the forward (right) direction by applying
|
||||||
|
torques on the eight hinges connecting the two links of each leg and the torso
|
||||||
|
(nine parts and eight hinges).
|
||||||
|
|
||||||
|
### Action Space
|
||||||
|
The agent take a 8-element vector for actions.
|
||||||
|
|
||||||
|
The action space is a continuous `(action, action, action, action, action, action,
|
||||||
|
action, action)` all in `[-1, 1]`, where `action` represents the numerical torques
|
||||||
|
applied at the hinge joints.
|
||||||
|
|
||||||
|
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||||
|
|-----|----------------------|---------------|----------------|---------------------------------------|-------|------|
|
||||||
|
| 0 | Torque applied on the rotor between the torso and front left hip | -1 | 1 | hip_1 (front_left_leg) | hinge | torque (N m) |
|
||||||
|
| 1 | Torque applied on the rotor between the front left two links | -1 | 1 | angle_1 (front_left_leg) | hinge | torque (N m) |
|
||||||
|
| 2 | Torque applied on the rotor between the torso and front right hip | -1 | 1 | hip_2 (front_right_leg) | hinge | torque (N m) |
|
||||||
|
| 3 | Torque applied on the rotor between the front right two links | -1 | 1 | angle_2 (front_right_leg) | hinge | torque (N m) |
|
||||||
|
| 4 | Torque applied on the rotor between the torso and back left hip | -1 | 1 | hip_3 (back_leg) | hinge | torque (N m) |
|
||||||
|
| 5 | Torque applied on the rotor between the back left two links | -1 | 1 | angle_3 (back_leg) | hinge | torque (N m) |
|
||||||
|
| 6 | Torque applied on the rotor between the torso and back right hip | -1 | 1 | hip_4 (right_back_leg) | hinge | torque (N m) |
|
||||||
|
| 7 | Torque applied on the rotor between the back right two links | -1 | 1 | angle_4 (right_back_leg) | hinge | torque (N m) |
|
||||||
|
|
||||||
|
### Observation Space
|
||||||
|
|
||||||
|
The state space consists of positional values of different body parts of the ant,
|
||||||
|
followed by the velocities of those individual parts (their derivatives) with all
|
||||||
|
the positions ordered before all the velocities.
|
||||||
|
|
||||||
|
The observation is a `ndarray` with shape `(111,)` where the elements correspond to the following:
|
||||||
|
|
||||||
|
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|
||||||
|
|-----|-------------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
|
||||||
|
| 0 | x-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) |
|
||||||
|
| 1 | y-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) |
|
||||||
|
| 2 | z-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) |
|
||||||
|
| 3 | x-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
|
||||||
|
| 4 | y-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
|
||||||
|
| 5 | z-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
|
||||||
|
| 6 | w-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
|
||||||
|
| 7 | angle between torso and first link on front left | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) |
|
||||||
|
| 8 | angle between the two links on the front left | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) |
|
||||||
|
| 9 | angle between torso and first link on front right | -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) |
|
||||||
|
| 10 | angle between the two links on the front right | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) |
|
||||||
|
| 11 | angle between torso and first link on back left | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) |
|
||||||
|
| 12 | angle between the two links on the back left | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) |
|
||||||
|
| 13 | angle between torso and first link on back right | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) |
|
||||||
|
| 14 | angle between the two links on the back right | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) |
|
||||||
|
| 15 | x-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) |
|
||||||
|
| 16 | y-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) |
|
||||||
|
| 17 | z-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) |
|
||||||
|
| 18 | x-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) |
|
||||||
|
| 19 | y-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) |
|
||||||
|
| 20 | z-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) |
|
||||||
|
| 21 | angular velocity of angle between torso and front left link | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) |
|
||||||
|
| 22 | angular velocity of the angle between front left links | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) |
|
||||||
|
| 23 | angular velocity of angle between torso and front right link| -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) |
|
||||||
|
| 24 | angular velocity of the angle between front right links | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) |
|
||||||
|
| 25 | angular velocity of angle between torso and back left link | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) |
|
||||||
|
| 26 | angular velocity of the angle between back left links | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) |
|
||||||
|
| 27 | angular velocity of angle between torso and back right link | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) |
|
||||||
|
| 28 |angular velocity of the angle between back right links | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) |
|
||||||
|
|
||||||
|
|
||||||
|
The remaining 14*6 = 84 elements in the state are contact forces
|
||||||
|
(external forces - force x, y, z and torque x, y, z) applied to the
|
||||||
|
center of mass of each of the links. The 14 links are: the ground link,
|
||||||
|
the torso link, and 3 links for each leg (1 + 1 + 12) with the 6 external forces.
|
||||||
|
|
||||||
|
The (x,y,z) coordinates are translational DOFs while the orientations are rotational
|
||||||
|
DOFs expressed as quaternions. One can read more about free joints on the [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html).
|
||||||
|
|
||||||
|
**Note:** There are 29 elements in the table above - giving rise to `(113,)` elements
|
||||||
|
in the state space. In practice (and Gym implementation), the first two positional
|
||||||
|
elements are omitted from the state space since the reward function is calculated based
|
||||||
|
on the x-coordinate value. This value is hidden from the algorithm, which in turn has to
|
||||||
|
develop an abstract understanding of it from the observed rewards. Therefore, observation
|
||||||
|
space has shape `(111,)` instead of `(113,)` and the table should not have the first two rows.
|
||||||
|
|
||||||
|
**Note:** Ant-v4 environment no longer has the following contact forces issue.
|
||||||
|
If using previous Ant versions from v4, there have been reported issues that using a Mujoco-Py version > 2.0 results
|
||||||
|
in the contact forces always being 0. As such we recommend to use a Mujoco-Py version < 2.0
|
||||||
|
when using the Ant environment if you would like to report results with contact forces (if
|
||||||
|
contact forces are not used in your experiments, you can use version > 2.0).
|
||||||
|
|
||||||
|
**Note:** Ant-v4 has the option of including contact forces in the observation space. To add contact forces set the argument
|
||||||
|
'use_contact_forces" to True. The default value is False. Also note that training including contact forces can perform worse
|
||||||
|
than not using them as shown in (https://github.com/openai/gym/pull/2762).
|
||||||
|
|
||||||
|
### Rewards
|
||||||
|
The reward consists of three parts:
|
||||||
|
- *survive_reward*: Every timestep that the ant is alive, it gets a reward of 1.
|
||||||
|
- *forward_reward*: A reward of moving forward which is measured as
|
||||||
|
*(x-coordinate before action - x-coordinate after action)/dt*. *dt* is the time
|
||||||
|
between actions and is dependent on the frame_skip parameter (default is 5),
|
||||||
|
where the *dt* for one frame is 0.01 - making the default *dt = 5 * 0.01 = 0.05*.
|
||||||
|
This reward would be positive if the ant moves forward (right) desired.
|
||||||
|
- *ctrl_cost*: A negative reward for penalising the ant if it takes actions
|
||||||
|
that are too large. It is measured as *coefficient **x** sum(action<sup>2</sup>)*
|
||||||
|
where *coefficient* is a parameter set for the control and has a default value of 0.5.
|
||||||
|
- *contact_cost*: A negative reward for penalising the ant if the external contact
|
||||||
|
force is too large. It is calculated *0.5 * 0.001 * sum(clip(external contact
|
||||||
|
force to [-1,1])<sup>2</sup>)*.
|
||||||
|
|
||||||
|
The total reward returned is ***reward*** *=* *alive survive_reward + forward_reward - ctrl_cost - contact_cost*
|
||||||
|
|
||||||
|
### Starting State
|
||||||
|
All observations start in state
|
||||||
|
(0.0, 0.0, 0.75, 1.0, 0.0 ... 0.0) with a uniform noise in the range
|
||||||
|
of [-0.1, 0.1] added to the positional values and standard normal noise
|
||||||
|
with 0 mean and 0.1 standard deviation added to the velocity values for
|
||||||
|
stochasticity. Note that the initial z coordinate is intentionally selected
|
||||||
|
to be slightly high, thereby indicating a standing up ant. The initial orientation
|
||||||
|
is designed to make it face forward as well.
|
||||||
|
|
||||||
|
### Episode Termination
|
||||||
|
The episode terminates when any of the following happens:
|
||||||
|
|
||||||
|
1. The episode duration reaches a 1000 timesteps
|
||||||
|
2. Any of the state space values is no longer finite
|
||||||
|
3. The y-orientation (index 2) in the state is **not** in the range `[0.2, 1.0]`
|
||||||
|
|
||||||
|
### Arguments
|
||||||
|
|
||||||
|
No additional arguments are currently supported (in v2 and lower), but modifications
|
||||||
|
can be made to the XML file in the assets folder (or by changing the path to a modified
|
||||||
|
XML file in another folder).
|
||||||
|
|
||||||
|
```
|
||||||
|
env = gym.make('Ant-v2')
|
||||||
|
```
|
||||||
|
|
||||||
|
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||||
|
|
||||||
|
```
|
||||||
|
env = gym.make('Ant-v3', ctrl_cost_weight=0.1, ...)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Version History
|
||||||
|
* v4: all mujoco environments now use the mujoco binidings in mujoco>=2.1.3
|
||||||
|
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
||||||
|
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||||
|
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
||||||
|
* v0: Initial versions release (1.0.0)
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
    self,
    xml_file="ant.xml",
    ctrl_cost_weight=0.5,
    use_contact_forces=False,
    contact_cost_weight=5e-4,
    healthy_reward=1.0,
    terminate_when_unhealthy=True,
    healthy_z_range=(0.2, 1.0),
    contact_force_range=(-1.0, 1.0),
    reset_noise_scale=0.1,
    exclude_current_positions_from_observation=True,
):
    """Create an Ant environment.

    The arguments mirror the reward/termination knobs described in the
    class docstring; they are stored on the instance and read by `step`,
    `is_healthy`, `_get_obs` and `reset_model`.
    """
    # EzPickle captures the constructor arguments so the env can be pickled.
    utils.EzPickle.__init__(**locals())

    # Reward / cost weights.
    self._ctrl_cost_weight = ctrl_cost_weight
    self._contact_cost_weight = contact_cost_weight
    self._healthy_reward = healthy_reward

    # Health / termination criteria.
    self._terminate_when_unhealthy = terminate_when_unhealthy
    self._healthy_z_range = healthy_z_range

    # Observation and reset configuration.
    self._contact_force_range = contact_force_range
    self._reset_noise_scale = reset_noise_scale
    self._use_contact_forces = use_contact_forces
    self._exclude_current_positions_from_observation = (
        exclude_current_positions_from_observation
    )

    mujoco_env.MujocoEnv.__init__(self, xml_file, 5)
|
||||||
|
|
||||||
|
@property
def healthy_reward(self):
    """Survival bonus.

    Paid while the ant is healthy; when `terminate_when_unhealthy` is
    disabled the bonus is paid unconditionally (the episode never ends
    for health reasons, matching the original expression).
    """
    alive = self.is_healthy or self._terminate_when_unhealthy
    return float(alive) * self._healthy_reward
|
||||||
|
|
||||||
|
def control_cost(self, action):
    """Return the quadratic actuation penalty for ``action``.

    Computed as ``ctrl_cost_weight * sum(action ** 2)``.
    """
    return self._ctrl_cost_weight * np.sum(np.square(action))
|
||||||
|
|
||||||
|
@property
def contact_forces(self):
    """External contact forces (``cfrc_ext``), elementwise clipped to
    ``contact_force_range``."""
    lower, upper = self._contact_force_range
    return np.clip(self.data.cfrc_ext, lower, upper)
|
||||||
|
|
||||||
|
@property
def contact_cost(self):
    """Penalty proportional to the sum of squared (clipped) contact forces."""
    squared = np.square(self.contact_forces)
    return self._contact_cost_weight * np.sum(squared)
|
||||||
|
|
||||||
|
@property
def is_healthy(self):
    """True while every state value is finite and the torso height
    (state index 2) lies within ``healthy_z_range``."""
    state = self.state_vector()
    if not np.isfinite(state).all():
        return False
    min_z, max_z = self._healthy_z_range
    return min_z <= state[2] <= max_z
|
||||||
|
|
||||||
|
@property
def done(self):
    """Episode-termination flag: an unhealthy state ends the episode only
    when ``terminate_when_unhealthy`` is enabled."""
    if self._terminate_when_unhealthy:
        return not self.is_healthy
    return False
|
||||||
|
|
||||||
|
def step(self, action):
    """Advance the simulation by one control step.

    Returns ``(observation, reward, done, info)`` with
    ``reward = forward_reward + healthy_reward - ctrl_cost``
    (minus ``contact_cost`` as well when ``use_contact_forces`` is set).
    Forward reward is the torso's x-velocity over the step.
    """
    xy_position_before = self.get_body_com("torso")[:2].copy()
    self.do_simulation(action, self.frame_skip)
    xy_position_after = self.get_body_com("torso")[:2].copy()

    xy_velocity = (xy_position_after - xy_position_before) / self.dt
    x_velocity, y_velocity = xy_velocity

    forward_reward = x_velocity
    healthy_reward = self.healthy_reward

    rewards = forward_reward + healthy_reward

    costs = ctrl_cost = self.control_cost(action)

    done = self.done
    observation = self._get_obs()
    info = {
        "reward_forward": forward_reward,
        "reward_ctrl": -ctrl_cost,
        "reward_survive": healthy_reward,
        "x_position": xy_position_after[0],
        "y_position": xy_position_after[1],
        "distance_from_origin": np.linalg.norm(xy_position_after, ord=2),
        "x_velocity": x_velocity,
        "y_velocity": y_velocity,
        "forward_reward": forward_reward,
    }
    if self._use_contact_forces:
        contact_cost = self.contact_cost
        costs += contact_cost
        # BUG FIX: this previously overwrote info["reward_ctrl"], silently
        # discarding the control-cost entry; record the contact penalty
        # under its own key instead.
        info["reward_contact"] = -contact_cost

    reward = rewards - costs

    return observation, reward, done, info
|
||||||
|
|
||||||
|
def _get_obs(self):
|
||||||
|
position = self.data.qpos.flat.copy()
|
||||||
|
velocity = self.data.qvel.flat.copy()
|
||||||
|
|
||||||
|
if self._exclude_current_positions_from_observation:
|
||||||
|
position = position[2:]
|
||||||
|
|
||||||
|
if self._use_contact_forces:
|
||||||
|
contact_force = self.contact_forces.flat.copy()
|
||||||
|
return np.concatenate((position, velocity, contact_force))
|
||||||
|
else:
|
||||||
|
return np.concatenate((position, velocity))
|
||||||
|
|
||||||
|
def reset_model(self):
    """Reset qpos/qvel to the initial pose plus small random noise and
    return the first observation.

    qpos receives uniform noise in [-reset_noise_scale, reset_noise_scale];
    qvel receives standard-normal noise scaled by reset_noise_scale.
    """
    scale = self._reset_noise_scale
    qpos = self.init_qpos + self.np_random.uniform(
        low=-scale, high=scale, size=self.model.nq
    )
    qvel = self.init_qvel + scale * self.np_random.standard_normal(self.model.nv)
    self.set_state(qpos, qvel)

    return self._get_obs()
|
||||||
|
|
||||||
|
def viewer_setup(self):
    """Apply DEFAULT_CAMERA_CONFIG to the viewer camera; array-valued
    entries are copied into the existing attribute in place, scalar
    entries are assigned directly."""
    camera = self.viewer.cam
    for name, value in DEFAULT_CAMERA_CONFIG.items():
        if isinstance(value, np.ndarray):
            getattr(camera, name)[:] = value
        else:
            setattr(camera, name, value)
|
@@ -22,6 +22,6 @@
|
|||||||
</body>
|
</body>
|
||||||
</worldbody>
|
</worldbody>
|
||||||
<actuator>
|
<actuator>
|
||||||
<motor gear="100" joint="slider" name="slide"/>
|
<motor ctrllimited="true" ctrlrange="-3 3" gear="100" joint="slider" name="slide"/>
|
||||||
</actuator>
|
</actuator>
|
||||||
</mujoco>
|
</mujoco>
|
@@ -6,7 +6,9 @@ from gym.envs.mujoco import mujoco_env
|
|||||||
|
|
||||||
class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
mujoco_env.MujocoEnv.__init__(self, "half_cheetah.xml", 5)
|
mujoco_env.MujocoEnv.__init__(
|
||||||
|
self, "half_cheetah.xml", 5, mujoco_bindings="mujoco_py"
|
||||||
|
)
|
||||||
utils.EzPickle.__init__(self)
|
utils.EzPickle.__init__(self)
|
||||||
|
|
||||||
def step(self, action):
|
def step(self, action):
|
||||||
|
@@ -10,125 +10,6 @@ DEFAULT_CAMERA_CONFIG = {
|
|||||||
|
|
||||||
|
|
||||||
class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
"""
|
|
||||||
### Description
|
|
||||||
|
|
||||||
This environment is based on the work by P. Wawrzyński in
|
|
||||||
["A Cat-Like Robot Real-Time Learning to Run"](http://staff.elka.pw.edu.pl/~pwawrzyn/pub-s/0812_LSCLRR.pdf).
|
|
||||||
The HalfCheetah is a 2-dimensional robot consisting of 9 links and 8
|
|
||||||
joints connecting them (including two paws). The goal is to apply a torque
|
|
||||||
on the joints to make the cheetah run forward (right) as fast as possible,
|
|
||||||
with a positive reward allocated based on the distance moved forward and a
|
|
||||||
negative reward allocated for moving backward. The torso and head of the
|
|
||||||
cheetah are fixed, and the torque can only be applied on the other 6 joints
|
|
||||||
over the front and back thighs (connecting to the torso), shins
|
|
||||||
(connecting to the thighs) and feet (connecting to the shins).
|
|
||||||
|
|
||||||
### Action Space
|
|
||||||
The action space is a `Box(-1, 1, (6,), float32)`. An action represents the torques applied between *links*.
|
|
||||||
|
|
||||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
|
||||||
|-------|--------------------------------------|---------------|----------------|---------------------------------------|-------|------|
|
|
||||||
| 0 | Torque applied on the back thigh rotor | -1 | 1 | bthigh | hinge | torque (N m) |
|
|
||||||
| 1 | Torque applied on the back shin rotor | -1 | 1 | bshin | hinge | torque (N m) |
|
|
||||||
| 2 | Torque applied on the back foot rotor | -1 | 1 | bfoot | hinge | torque (N m) |
|
|
||||||
| 3 | Torque applied on the front thigh rotor| -1 | 1 | fthigh | hinge | torque (N m) |
|
|
||||||
| 4 | Torque applied on the front shin rotor | -1 | 1 | fshin | hinge | torque (N m) |
|
|
||||||
| 5 | Torque applied on the front foot rotor | -1 | 1 | ffoot | hinge | torque (N m) |
|
|
||||||
|
|
||||||
### Observation Space
|
|
||||||
|
|
||||||
Observations consist of positional values of different body parts of the
|
|
||||||
cheetah, followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities.
|
|
||||||
|
|
||||||
By default, observations do not include the x-coordinate of the cheetah's center of mass. It may
|
|
||||||
be included by passing `exclude_current_positions_from_observation=False` during construction.
|
|
||||||
In that case, the observation space will have 18 dimensions where the first dimension
|
|
||||||
represents the x-coordinate of the cheetah's center of mass.
|
|
||||||
Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x-coordinate
|
|
||||||
will be returned in `info` with key `"x_position"`.
|
|
||||||
|
|
||||||
However, by default, the observation is a `ndarray` with shape `(17,)` where the elements correspond to the following:
|
|
||||||
|
|
||||||
|
|
||||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
|
||||||
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
|
||||||
| 0 | z-coordinate of the front tip | -Inf | Inf | rootz | slide | position (m) |
|
|
||||||
| 1 | angle of the front tip | -Inf | Inf | rooty | hinge | angle (rad) |
|
|
||||||
| 2 | angle of the second rotor | -Inf | Inf | bthigh | hinge | angle (rad) |
|
|
||||||
| 3 | angle of the second rotor | -Inf | Inf | bshin | hinge | angle (rad) |
|
|
||||||
| 4 | velocity of the tip along the x-axis | -Inf | Inf | bfoot | hinge | angle (rad) |
|
|
||||||
| 5 | velocity of the tip along the y-axis | -Inf | Inf | fthigh | hinge | angle (rad) |
|
|
||||||
| 6 | angular velocity of front tip | -Inf | Inf | fshin | hinge | angle (rad) |
|
|
||||||
| 7 | angular velocity of second rotor | -Inf | Inf | ffoot | hinge | angle (rad) |
|
|
||||||
| 8 | x-coordinate of the front tip | -Inf | Inf | rootx | slide | velocity (m/s) |
|
|
||||||
| 9 | y-coordinate of the front tip | -Inf | Inf | rootz | slide | velocity (m/s) |
|
|
||||||
| 10 | angle of the front tip | -Inf | Inf | rooty | hinge | angular velocity (rad/s) |
|
|
||||||
| 11 | angle of the second rotor | -Inf | Inf | bthigh | hinge | angular velocity (rad/s) |
|
|
||||||
| 12 | angle of the second rotor | -Inf | Inf | bshin | hinge | angular velocity (rad/s) |
|
|
||||||
| 13 | velocity of the tip along the x-axis | -Inf | Inf | bfoot | hinge | angular velocity (rad/s) |
|
|
||||||
| 14 | velocity of the tip along the y-axis | -Inf | Inf | fthigh | hinge |angular velocity (rad/s) |
|
|
||||||
| 15 | angular velocity of front tip | -Inf | Inf | fshin | hinge | angular velocity (rad/s) |
|
|
||||||
| 16 | angular velocity of second rotor | -Inf | Inf | ffoot | hinge | angular velocity (rad/s) |
|
|
||||||
|
|
||||||
### Rewards
|
|
||||||
The reward consists of two parts:
|
|
||||||
- *forward_reward*: A reward of moving forward which is measured
|
|
||||||
as *`forward_reward_weight` * (x-coordinate before action - x-coordinate after action)/dt*. *dt* is
|
|
||||||
the time between actions and is dependent on the frame_skip parameter
|
|
||||||
(fixed to 5), where the frametime is 0.01 - making the
|
|
||||||
default *dt = 5 * 0.01 = 0.05*. This reward would be positive if the cheetah
|
|
||||||
runs forward (right).
|
|
||||||
- *ctrl_cost*: A cost for penalising the cheetah if it takes
|
|
||||||
actions that are too large. It is measured as *`ctrl_cost_weight` *
|
|
||||||
sum(action<sup>2</sup>)* where *`ctrl_cost_weight`* is a parameter set for the
|
|
||||||
control and has a default value of 0.1
|
|
||||||
|
|
||||||
The total reward returned is ***reward*** *=* *forward_reward - ctrl_cost* and `info` will also contain the individual reward terms
|
|
||||||
|
|
||||||
### Starting State
|
|
||||||
All observations start in state (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
|
|
||||||
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,) with a noise added to the
|
|
||||||
initial state for stochasticity. As seen before, the first 8 values in the
|
|
||||||
state are positional and the last 9 values are velocity. A uniform noise in
|
|
||||||
the range of [-`reset_noise_scale`, `reset_noise_scale`] is added to the positional values while a standard
|
|
||||||
normal noise with a mean of 0 and standard deviation of `reset_noise_scale` is added to the
|
|
||||||
initial velocity values of all zeros.
|
|
||||||
|
|
||||||
### Episode Termination
|
|
||||||
The episode terminates when the episode length is greater than 1000.
|
|
||||||
|
|
||||||
### Arguments
|
|
||||||
|
|
||||||
No additional arguments are currently supported in v2 and lower.
|
|
||||||
|
|
||||||
```
|
|
||||||
env = gym.make('HalfCheetah-v2')
|
|
||||||
```
|
|
||||||
|
|
||||||
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
|
||||||
|
|
||||||
```
|
|
||||||
env = gym.make('HalfCheetah-v3', ctrl_cost_weight=0.1, ....)
|
|
||||||
```
|
|
||||||
|
|
||||||
| Parameter | Type | Default |Description |
|
|
||||||
|-------------------------|------------|----------------------|-------------------------------|
|
|
||||||
| `xml_file` | **str** | `"half_cheetah.xml"` | Path to a MuJoCo model |
|
|
||||||
| `forward_reward_weight` | **float** | `1.0` | Weight for *forward_reward* term (see section on reward) |
|
|
||||||
| `ctrl_cost_weight` | **float** | `0.1` | Weight for *ctrl_cost* weight (see section on reward) |
|
|
||||||
| `reset_noise_scale` | **float** | `0.1` | Scale of random perturbations of initial position and velocity (see section on Starting State) |
|
|
||||||
| `exclude_current_positions_from_observation`| **bool** | `True` | Whether or not to omit the x-coordinate from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies |
|
|
||||||
|
|
||||||
|
|
||||||
### Version History
|
|
||||||
|
|
||||||
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
|
||||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
|
||||||
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
|
||||||
* v0: Initial versions release (1.0.0)
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
xml_file="half_cheetah.xml",
|
xml_file="half_cheetah.xml",
|
||||||
@@ -149,7 +30,7 @@ class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
|||||||
exclude_current_positions_from_observation
|
exclude_current_positions_from_observation
|
||||||
)
|
)
|
||||||
|
|
||||||
mujoco_env.MujocoEnv.__init__(self, xml_file, 5)
|
mujoco_env.MujocoEnv.__init__(self, xml_file, 5, mujoco_bindings="mujoco_py")
|
||||||
|
|
||||||
def control_cost(self, action):
|
def control_cost(self, action):
|
||||||
control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
|
control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
|
||||||
|
230
gym/envs/mujoco/half_cheetah_v4.py
Normal file
230
gym/envs/mujoco/half_cheetah_v4.py
Normal file
@@ -0,0 +1,230 @@
|
|||||||
|
__credits__ = ["Rushiv Arora"]
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from gym import utils
|
||||||
|
from gym.envs.mujoco import mujoco_env
|
||||||
|
|
||||||
|
# Camera settings applied onto `viewer.cam` attributes by
# `HalfCheetahEnv.viewer_setup`.
DEFAULT_CAMERA_CONFIG = {"distance": 4.0}
|
||||||
|
|
||||||
|
|
||||||
|
class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle):
    """
    ### Description

    This environment is based on the work by P. Wawrzyński in
    ["A Cat-Like Robot Real-Time Learning to Run"](http://staff.elka.pw.edu.pl/~pwawrzyn/pub-s/0812_LSCLRR.pdf).
    The HalfCheetah is a 2-dimensional robot consisting of 9 links and 8
    joints connecting them (including two paws). The goal is to apply a torque
    on the joints to make the cheetah run forward (right) as fast as possible,
    with a positive reward allocated based on the distance moved forward and a
    negative reward allocated for moving backward. The torso and head of the
    cheetah are fixed, and the torque can only be applied on the other 6 joints
    over the front and back thighs (connecting to the torso), shins
    (connecting to the thighs) and feet (connecting to the shins).

    ### Action Space
    The action space is a `Box(-1, 1, (6,), float32)`. An action represents the
    torques applied between *links*.

    | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
    |-------|--------------------------------------|---------------|----------------|---------------------------------------|-------|------|
    | 0 | Torque applied on the back thigh rotor | -1 | 1 | bthigh | hinge | torque (N m) |
    | 1 | Torque applied on the back shin rotor | -1 | 1 | bshin | hinge | torque (N m) |
    | 2 | Torque applied on the back foot rotor | -1 | 1 | bfoot | hinge | torque (N m) |
    | 3 | Torque applied on the front thigh rotor| -1 | 1 | fthigh | hinge | torque (N m) |
    | 4 | Torque applied on the front shin rotor | -1 | 1 | fshin | hinge | torque (N m) |
    | 5 | Torque applied on the front foot rotor | -1 | 1 | ffoot | hinge | torque (N m) |

    ### Observation Space

    Observations consist of positional values of different body parts of the
    cheetah, followed by the velocities of those individual parts (their
    derivatives), with all the positions ordered before all the velocities.

    By default (`exclude_current_positions_from_observation=True`) the
    x-coordinate of the front tip (`rootx`) is omitted from the observation,
    which is then a `ndarray` with shape `(17,)`. Passing
    `exclude_current_positions_from_observation=False` prepends that
    x-coordinate, giving shape `(18,)`. The x-coordinate is always reported
    in `info` under the key `"x_position"`.

    With the default settings the elements correspond to the following:

    | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
    |-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
    | 0 | z-coordinate of the front tip | -Inf | Inf | rootz | slide | position (m) |
    | 1 | angle of the front tip | -Inf | Inf | rooty | hinge | angle (rad) |
    | 2 | angle of the back thigh | -Inf | Inf | bthigh | hinge | angle (rad) |
    | 3 | angle of the back shin | -Inf | Inf | bshin | hinge | angle (rad) |
    | 4 | angle of the back foot | -Inf | Inf | bfoot | hinge | angle (rad) |
    | 5 | angle of the front thigh | -Inf | Inf | fthigh | hinge | angle (rad) |
    | 6 | angle of the front shin | -Inf | Inf | fshin | hinge | angle (rad) |
    | 7 | angle of the front foot | -Inf | Inf | ffoot | hinge | angle (rad) |
    | 8 | velocity of the front tip along the x-axis | -Inf | Inf | rootx | slide | velocity (m/s) |
    | 9 | velocity of the front tip along the z-axis | -Inf | Inf | rootz | slide | velocity (m/s) |
    | 10 | angular velocity of the front tip | -Inf | Inf | rooty | hinge | angular velocity (rad/s) |
    | 11 | angular velocity of the back thigh | -Inf | Inf | bthigh | hinge | angular velocity (rad/s) |
    | 12 | angular velocity of the back shin | -Inf | Inf | bshin | hinge | angular velocity (rad/s) |
    | 13 | angular velocity of the back foot | -Inf | Inf | bfoot | hinge | angular velocity (rad/s) |
    | 14 | angular velocity of the front thigh | -Inf | Inf | fthigh | hinge | angular velocity (rad/s) |
    | 15 | angular velocity of the front shin | -Inf | Inf | fshin | hinge | angular velocity (rad/s) |
    | 16 | angular velocity of the front foot | -Inf | Inf | ffoot | hinge | angular velocity (rad/s) |

    ### Rewards
    The reward consists of two parts:
    - *forward_reward*: A reward for moving forward, measured as
    *`forward_reward_weight` * (x-coordinate after action - x-coordinate before action)/dt*.
    *dt* is the time between actions and depends on the frame_skip parameter
    (fixed to 5), where the frametime is 0.01 - making the
    default *dt = 5 * 0.01 = 0.05*. This reward is positive when the cheetah
    runs forward (right).
    - *ctrl_cost*: A cost penalising the cheetah for taking actions that are
    too large. It is measured as *`ctrl_cost_weight` * sum(action<sup>2</sup>)*
    where *`ctrl_cost_weight`* is a parameter set for the control and has a
    default value of 0.1.

    The total reward returned is ***reward*** *=* *forward_reward - ctrl_cost*
    and `info` also contains the individual reward terms
    (`"reward_run"` and `"reward_ctrl"`).

    ### Starting State
    All observations start in state (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
    0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,) with a noise added to the
    initial state for stochasticity. As seen before, the first 8 values in the
    state are positional and the last 9 values are velocity. A uniform noise in
    the range of [-`reset_noise_scale`, `reset_noise_scale`] is added to the
    positional values while a standard normal noise with a mean of 0 and
    standard deviation of `reset_noise_scale` is added to the initial velocity
    values of all zeros.

    ### Episode Termination
    The episode terminates when the episode length is greater than 1000.

    ### Arguments

    No additional arguments are currently supported (in v2 and lower), but
    modifications can be made to the XML file in the assets folder at
    `gym/envs/mujoco/assets/half_cheetah.xml` (or by changing the path to a
    modified XML file in another folder).

    ```
    env = gym.make('HalfCheetah-v2')
    ```

    v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.

    ```
    env = gym.make('HalfCheetah-v3', ctrl_cost_weight=0.1, ...)
    ```

    ### Version History

    * v4: all mujoco environments now use the mujoco bindings in mujoco>=2.1.3
    * v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
    * v2: All continuous control environments now use mujoco_py >= 1.50
    * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
    * v0: Initial versions release (1.0.0)
    """

    def __init__(
        self,
        xml_file="half_cheetah.xml",
        forward_reward_weight=1.0,
        ctrl_cost_weight=0.1,
        reset_noise_scale=0.1,
        exclude_current_positions_from_observation=True,
    ):
        """Create a HalfCheetah environment; arguments are the reward and
        reset knobs described in the class docstring."""
        # EzPickle records the constructor arguments so the env can be pickled.
        utils.EzPickle.__init__(**locals())

        self._forward_reward_weight = forward_reward_weight
        self._ctrl_cost_weight = ctrl_cost_weight
        self._reset_noise_scale = reset_noise_scale
        self._exclude_current_positions_from_observation = (
            exclude_current_positions_from_observation
        )

        mujoco_env.MujocoEnv.__init__(self, xml_file, 5)

    def control_cost(self, action):
        """Quadratic actuation penalty: ``ctrl_cost_weight * sum(action**2)``."""
        control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
        return control_cost

    def step(self, action):
        """Advance the simulation by one control step.

        Returns ``(observation, reward, done, info)`` where
        ``reward = forward_reward - ctrl_cost`` and ``done`` is always
        False (episodes end only via the external time limit).
        """
        x_position_before = self.data.qpos[0]
        self.do_simulation(action, self.frame_skip)
        x_position_after = self.data.qpos[0]
        x_velocity = (x_position_after - x_position_before) / self.dt

        ctrl_cost = self.control_cost(action)

        forward_reward = self._forward_reward_weight * x_velocity

        observation = self._get_obs()
        reward = forward_reward - ctrl_cost
        done = False
        info = {
            "x_position": x_position_after,
            "x_velocity": x_velocity,
            "reward_run": forward_reward,
            "reward_ctrl": -ctrl_cost,
        }

        return observation, reward, done, info

    def _get_obs(self):
        """Concatenate qpos (optionally without the leading x-coordinate)
        and qvel into a flat observation vector."""
        position = self.data.qpos.flat.copy()
        velocity = self.data.qvel.flat.copy()

        if self._exclude_current_positions_from_observation:
            position = position[1:]

        observation = np.concatenate((position, velocity)).ravel()
        return observation

    def reset_model(self):
        """Reset qpos/qvel to the initial pose plus small random noise
        (uniform on positions, Gaussian on velocities) and return the
        first observation."""
        noise_low = -self._reset_noise_scale
        noise_high = self._reset_noise_scale

        qpos = self.init_qpos + self.np_random.uniform(
            low=noise_low, high=noise_high, size=self.model.nq
        )
        qvel = (
            self.init_qvel
            + self._reset_noise_scale * self.np_random.standard_normal(self.model.nv)
        )

        self.set_state(qpos, qvel)

        observation = self._get_obs()
        return observation

    def viewer_setup(self):
        """Apply DEFAULT_CAMERA_CONFIG to the viewer camera; array-valued
        entries are copied in place, scalars assigned directly."""
        for key, value in DEFAULT_CAMERA_CONFIG.items():
            if isinstance(value, np.ndarray):
                getattr(self.viewer.cam, key)[:] = value
            else:
                setattr(self.viewer.cam, key, value)
|
@@ -6,7 +6,9 @@ from gym.envs.mujoco import mujoco_env
|
|||||||
|
|
||||||
class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
mujoco_env.MujocoEnv.__init__(self, "hopper.xml", 4)
|
mujoco_env.MujocoEnv.__init__(
|
||||||
|
self, "hopper.xml", 4, mujoco_bindings="mujoco_py"
|
||||||
|
)
|
||||||
utils.EzPickle.__init__(self)
|
utils.EzPickle.__init__(self)
|
||||||
|
|
||||||
def step(self, a):
|
def step(self, a):
|
||||||
|
@@ -14,130 +14,6 @@ DEFAULT_CAMERA_CONFIG = {
|
|||||||
|
|
||||||
|
|
||||||
class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
"""
|
|
||||||
### Description
|
|
||||||
|
|
||||||
This environment is based on the work done by Erez, Tassa, and Todorov in
|
|
||||||
["Infinite Horizon Model Predictive Control for Nonlinear Periodic Tasks"](http://www.roboticsproceedings.org/rss07/p10.pdf). The environment aims to
|
|
||||||
increase the number of independent state and control variables as compared to
|
|
||||||
the classic control environments. The hopper is a two-dimensional
|
|
||||||
one-legged figure that consist of four main body parts - the torso at the
|
|
||||||
top, the thigh in the middle, the leg in the bottom, and a single foot on
|
|
||||||
which the entire body rests. The goal is to make hops that move in the
|
|
||||||
forward (right) direction by applying torques on the three hinges
|
|
||||||
connecting the four body parts.
|
|
||||||
|
|
||||||
### Action Space
|
|
||||||
The action space is a `Box(-1, 1, (3,), float32)`. An action represents the torques applied between *links*
|
|
||||||
|
|
||||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
|
||||||
|-----|------------------------------------|-------------|-------------|----------------------------------|-------|--------------|
|
|
||||||
| 0 | Torque applied on the thigh rotor | -1 | 1 | thigh_joint | hinge | torque (N m) |
|
|
||||||
| 1 | Torque applied on the leg rotor | -1 | 1 | leg_joint | hinge | torque (N m) |
|
|
||||||
| 3 | Torque applied on the foot rotor | -1 | 1 | foot_joint | hinge | torque (N m) |
|
|
||||||
|
|
||||||
### Observation Space
|
|
||||||
|
|
||||||
Observations consist of positional values of different body parts of the
|
|
||||||
hopper, followed by the velocities of those individual parts
|
|
||||||
(their derivatives) with all the positions ordered before all the velocities.
|
|
||||||
|
|
||||||
By default, observations do not include the x-coordinate of the hopper. It may
|
|
||||||
be included by passing `exclude_current_positions_from_observation=False` during construction.
|
|
||||||
In that case, the observation space will have 12 dimensions where the first dimension
|
|
||||||
represents the x-coordinate of the hopper.
|
|
||||||
Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x-coordinate
|
|
||||||
will be returned in `info` with key `"x_position"`.
|
|
||||||
|
|
||||||
However, by default, the observation is a `ndarray` with shape `(11,)` where the elements
|
|
||||||
correspond to the following:
|
|
||||||
|
|
||||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
|
||||||
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
|
||||||
| 0 | z-coordinate of the top (height of hopper) | -Inf | Inf | rootz | slide | position (m) |
|
|
||||||
| 1 | angle of the top | -Inf | Inf | rooty | hinge | angle (rad) |
|
|
||||||
| 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) |
|
|
||||||
| 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) |
|
|
||||||
| 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) |
|
|
||||||
| 5 | velocity of the x-coordinate of the top | -Inf | Inf | rootx | slide | velocity (m/s) |
|
|
||||||
| 6 | velocity of the z-coordinate (height) of the top | -Inf | Inf | rootz | slide | velocity (m/s) |
|
|
||||||
| 7 | angular velocity of the angle of the top | -Inf | Inf | rooty | hinge | angular velocity (rad/s) |
|
|
||||||
| 8 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) |
|
|
||||||
| 9 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) |
|
|
||||||
| 10 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) |
|
|
||||||
|
|
||||||
### Rewards
|
|
||||||
The reward consists of three parts:
|
|
||||||
- *healthy_reward*: Every timestep that the hopper is healthy (see definition in section "Episode Termination"), it gets a reward of fixed value `healthy_reward`.
|
|
||||||
- *forward_reward*: A reward of hopping forward which is measured
|
|
||||||
as *`forward_reward_weight` * (x-coordinate before action - x-coordinate after action)/dt*. *dt* is
|
|
||||||
the time between actions and is dependent on the frame_skip parameter
|
|
||||||
(fixed to 4), where the frametime is 0.002 - making the
|
|
||||||
default *dt = 4 * 0.002 = 0.008*. This reward would be positive if the hopper
|
|
||||||
hops forward (positive x direction).
|
|
||||||
- *ctrl_cost*: A cost for penalising the hopper if it takes
|
|
||||||
actions that are too large. It is measured as *`ctrl_cost_weight` *
|
|
||||||
sum(action<sup>2</sup>)* where *`ctrl_cost_weight`* is a parameter set for the
|
|
||||||
control and has a default value of 0.001
|
|
||||||
|
|
||||||
The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost* and `info` will also contain the individual reward terms
|
|
||||||
|
|
||||||
### Starting State
|
|
||||||
All observations start in state
|
|
||||||
(0.0, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) with a uniform noise
|
|
||||||
in the range of [-`reset_noise_scale`, `reset_noise_scale`] added to the values for stochasticity.
|
|
||||||
|
|
||||||
### Episode Termination
|
|
||||||
The hopper is said to be unhealthy if any of the following happens:
|
|
||||||
|
|
||||||
1. An element of `observation[1:]` (if `exclude_current_positions_from_observation=True`, else `observation[2:]`) is no longer contained in the closed interval specified by the argument `healthy_state_range`
|
|
||||||
2. The height of the hopper (`observation[0]` if `exclude_current_positions_from_observation=True`, else `observation[1]`) is no longer contained in the closed interval specified by the argument `healthy_z_range` (usually meaning that it has fallen)
|
|
||||||
3. The angle (`observation[1]` if `exclude_current_positions_from_observation=True`, else `observation[2]`) is no longer contained in the closed interval specified by the argument `healthy_angle_range`
|
|
||||||
|
|
||||||
If `terminate_when_unhealthy=True` is passed during construction (which is the default),
|
|
||||||
the episode terminates when any of the following happens:
|
|
||||||
|
|
||||||
1. The episode duration reaches a 1000 timesteps
|
|
||||||
2. The hopper is unhealthy
|
|
||||||
|
|
||||||
If `terminate_when_unhealthy=False` is passed, the episode is terminated only when 1000 timesteps are exceeded.
|
|
||||||
|
|
||||||
### Arguments
|
|
||||||
|
|
||||||
No additional arguments are currently supported in v2 and lower.
|
|
||||||
|
|
||||||
```
|
|
||||||
env = gym.make('Hopper-v2')
|
|
||||||
```
|
|
||||||
|
|
||||||
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
|
||||||
|
|
||||||
```
|
|
||||||
env = gym.make('Hopper-v3', ctrl_cost_weight=0.1, ....)
|
|
||||||
```
|
|
||||||
|
|
||||||
| Parameter | Type | Default |Description |
|
|
||||||
|-------------------------|------------|----------------|-------------------------------|
|
|
||||||
| `xml_file` | **str** | `"hopper.xml"` | Path to a MuJoCo model |
|
|
||||||
| `forward_reward_weight` | **float** | `1.0` | Weight for *forward_reward* term (see section on reward) |
|
|
||||||
| `ctrl_cost_weight` | **float** | `0.001` | Weight for *ctrl_cost* reward (see section on reward) |
|
|
||||||
| `healthy_reward` | **float** | `1` | Constant reward given if the ant is "healthy" after timestep |
|
|
||||||
| `terminate_when_unhealthy` | **bool**| `True` | If true, issue a done signal if the hopper is no longer healthy |
|
|
||||||
| `healthy_state_range` | **tuple** | `(-100, 100)` | The elements of `observation[1:]` (if `exclude_current_positions_from_observation=True`, else `observation[2:]`) must be in this range for the hopper to be considered healthy |
|
|
||||||
| `healthy_z_range` | **tuple** | `(0.7, float("inf"))` | The z-coordinate must be in this range for the hopper to be considered healthy |
|
|
||||||
| `healthy_angle_range` | **tuple** | `(-0.2, 0.2)` | The angle given by `observation[1]` (if `exclude_current_positions_from_observation=True`, else `observation[2]`) must be in this range for the hopper to be considered healthy |
|
|
||||||
| `reset_noise_scale` | **float** | `5e-3` | Scale of random perturbations of initial position and velocity (see section on Starting State) |
|
|
||||||
| `exclude_current_positions_from_observation`| **bool** | `True`| Whether or not to omit the x-coordinate from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies |
|
|
||||||
|
|
||||||
|
|
||||||
### Version History
|
|
||||||
|
|
||||||
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
|
||||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
|
||||||
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
|
||||||
* v0: Initial versions release (1.0.0)
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
xml_file="hopper.xml",
|
xml_file="hopper.xml",
|
||||||
@@ -170,7 +46,7 @@ class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
|||||||
exclude_current_positions_from_observation
|
exclude_current_positions_from_observation
|
||||||
)
|
)
|
||||||
|
|
||||||
mujoco_env.MujocoEnv.__init__(self, xml_file, 4)
|
mujoco_env.MujocoEnv.__init__(self, xml_file, 4, mujoco_bindings="mujoco_py")
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def healthy_reward(self):
|
def healthy_reward(self):
|
||||||
|
263
gym/envs/mujoco/hopper_v4.py
Normal file
263
gym/envs/mujoco/hopper_v4.py
Normal file
@@ -0,0 +1,263 @@
|
|||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from gym import utils
|
||||||
|
from gym.envs.mujoco import mujoco_env
|
||||||
|
|
||||||
|
DEFAULT_CAMERA_CONFIG = {
|
||||||
|
"trackbodyid": 2,
|
||||||
|
"distance": 3.0,
|
||||||
|
"lookat": np.array((0.0, 0.0, 1.15)),
|
||||||
|
"elevation": -20.0,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
|
"""
|
||||||
|
### Description
|
||||||
|
|
||||||
|
This environment is based on the work done by Erez, Tassa, and Todorov in
|
||||||
|
["Infinite Horizon Model Predictive Control for Nonlinear Periodic Tasks"](http://www.roboticsproceedings.org/rss07/p10.pdf). The environment aims to
|
||||||
|
increase the number of independent state and control variables as compared to
|
||||||
|
the classic control environments. The hopper is a two-dimensional
|
||||||
|
one-legged figure that consist of four main body parts - the torso at the
|
||||||
|
top, the thigh in the middle, the leg in the bottom, and a single foot on
|
||||||
|
which the entire body rests. The goal is to make hops that move in the
|
||||||
|
forward (right) direction by applying torques on the three hinges
|
||||||
|
connecting the four body parts.
|
||||||
|
|
||||||
|
### Action Space
|
||||||
|
The agent take a 3-element vector for actions.
|
||||||
|
The action space is a continuous `(action, action, action)` all in `[-1, 1]`
|
||||||
|
, where `action` represents the numerical torques applied between *links*
|
||||||
|
|
||||||
|
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||||
|
|-----|------------------------------------|-------------|-------------|----------------------------------|-------|--------------|
|
||||||
|
| 0 | Torque applied on the thigh rotor | -1 | 1 | thigh_joint | hinge | torque (N m) |
|
||||||
|
| 1 | Torque applied on the leg rotor | -1 | 1 | leg_joint | hinge | torque (N m) |
|
||||||
|
| 3 | Torque applied on the foot rotor | -1 | 1 | foot_joint | hinge | torque (N m) |
|
||||||
|
|
||||||
|
### Observation Space
|
||||||
|
|
||||||
|
The state space consists of positional values of different body parts of the
|
||||||
|
hopper, followed by the velocities of those individual parts
|
||||||
|
(their derivatives) with all the positions ordered before all the velocities.
|
||||||
|
|
||||||
|
The observation is a `ndarray` with shape `(11,)` where the elements
|
||||||
|
correspond to the following:
|
||||||
|
|
||||||
|
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
||||||
|
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
||||||
|
| 0 | x-coordinate of the top | -Inf | Inf | rootx | slide | position (m) |
|
||||||
|
| 1 | z-coordinate of the top (height of hopper) | -Inf | Inf | rootz | slide | position (m) |
|
||||||
|
| 2 | angle of the top | -Inf | Inf | rooty | hinge | angle (rad) |
|
||||||
|
| 3 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) |
|
||||||
|
| 4 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) |
|
||||||
|
| 5 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) |
|
||||||
|
| 6 | velocity of the x-coordinate of the top | -Inf | Inf | rootx | slide | velocity (m/s) |
|
||||||
|
| 7 | velocity of the z-coordinate (height) of the top | -Inf | Inf | rootz | slide | velocity (m/s) |
|
||||||
|
| 8 | angular velocity of the angle of the top | -Inf | Inf | rooty | hinge | angular velocity (rad/s) |
|
||||||
|
| 9 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 10 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 11 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) |
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
**Note:**
|
||||||
|
In practice (and Gym implementation), the first positional element is
|
||||||
|
omitted from the state space since the reward function is calculated based
|
||||||
|
on that value. This value is hidden from the algorithm, which in turn has
|
||||||
|
to develop an abstract understanding of it from the observed rewards.
|
||||||
|
Therefore, observation space has shape `(11,)` instead of `(12,)` and looks like:
|
||||||
|
|
||||||
|
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
||||||
|
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
||||||
|
| 0 | z-coordinate of the top (height of hopper) | -Inf | Inf | rootz | slide | position (m) |
|
||||||
|
| 1 | angle of the top | -Inf | Inf | rooty | hinge | angle (rad) |
|
||||||
|
| 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) |
|
||||||
|
| 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) |
|
||||||
|
| 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) |
|
||||||
|
| 5 | velocity of the x-coordinate of the top | -Inf | Inf | rootx | slide | velocity (m/s) |
|
||||||
|
| 6 | velocity of the z-coordinate (height) of the top | -Inf | Inf | rootz | slide | velocity (m/s) |
|
||||||
|
| 7 | angular velocity of the angle of the top | -Inf | Inf | rooty | hinge | angular velocity (rad/s) |
|
||||||
|
| 8 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 9 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 10 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) |
|
||||||
|
|
||||||
|
### Rewards
|
||||||
|
The reward consists of three parts:
|
||||||
|
- *alive bonus*: Every timestep that the hopper is alive, it gets a reward of 1,
|
||||||
|
- *reward_forward*: A reward of hopping forward which is measured
|
||||||
|
as *(x-coordinate before action - x-coordinate after action)/dt*. *dt* is
|
||||||
|
the time between actions and is dependeent on the frame_skip parameter
|
||||||
|
(default is 4), where the *dt* for one frame is 0.002 - making the
|
||||||
|
default *dt = 4*0.002 = 0.008*. This reward would be positive if the hopper
|
||||||
|
hops forward (right) desired.
|
||||||
|
- *reward_control*: A negative reward for penalising the hopper if it takes
|
||||||
|
actions that are too large. It is measured as *-coefficient **x**
|
||||||
|
sum(action<sup>2</sup>)* where *coefficient* is a parameter set for the
|
||||||
|
control and has a default value of 0.001
|
||||||
|
|
||||||
|
The total reward returned is ***reward*** *=* *alive bonus + reward_forward + reward_control*
|
||||||
|
|
||||||
|
### Starting State
|
||||||
|
All observations start in state
|
||||||
|
(0.0, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) with a uniform noise
|
||||||
|
in the range of [-0.005, 0.005] added to the values for stochasticity.
|
||||||
|
|
||||||
|
### Episode Termination
|
||||||
|
The episode terminates when any of the following happens:
|
||||||
|
|
||||||
|
1. The episode duration reaches a 1000 timesteps
|
||||||
|
2. Any of the state space values is no longer finite
|
||||||
|
3. The absolute value of any of the state variable indexed (angle and beyond) is greater than 100
|
||||||
|
4. The height of the hopper becomes greater than 0.7 metres (hopper has hopped too high).
|
||||||
|
5. The absolute value of the angle (index 2) is less than 0.2 radians (hopper has fallen down).
|
||||||
|
|
||||||
|
### Arguments
|
||||||
|
|
||||||
|
No additional arguments are currently supported (in v2 and lower), but
|
||||||
|
modifications can be made to the XML file in the assets folder
|
||||||
|
(or by changing the path to a modified XML file in another folder).
|
||||||
|
|
||||||
|
```
|
||||||
|
env = gym.make('Hopper-v2')
|
||||||
|
```
|
||||||
|
|
||||||
|
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||||
|
|
||||||
|
```
|
||||||
|
env = gym.make('Hopper-v3', ctrl_cost_weight=0.1, ....)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Version History
|
||||||
|
|
||||||
|
* v4: all mujoco environments now use the mujoco binidings in mujoco>=2.1.3
|
||||||
|
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
||||||
|
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||||
|
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
||||||
|
* v0: Initial versions release (1.0.0)
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
xml_file="hopper.xml",
|
||||||
|
forward_reward_weight=1.0,
|
||||||
|
ctrl_cost_weight=1e-3,
|
||||||
|
healthy_reward=1.0,
|
||||||
|
terminate_when_unhealthy=True,
|
||||||
|
healthy_state_range=(-100.0, 100.0),
|
||||||
|
healthy_z_range=(0.7, float("inf")),
|
||||||
|
healthy_angle_range=(-0.2, 0.2),
|
||||||
|
reset_noise_scale=5e-3,
|
||||||
|
exclude_current_positions_from_observation=True,
|
||||||
|
):
|
||||||
|
utils.EzPickle.__init__(**locals())
|
||||||
|
|
||||||
|
self._forward_reward_weight = forward_reward_weight
|
||||||
|
|
||||||
|
self._ctrl_cost_weight = ctrl_cost_weight
|
||||||
|
|
||||||
|
self._healthy_reward = healthy_reward
|
||||||
|
self._terminate_when_unhealthy = terminate_when_unhealthy
|
||||||
|
|
||||||
|
self._healthy_state_range = healthy_state_range
|
||||||
|
self._healthy_z_range = healthy_z_range
|
||||||
|
self._healthy_angle_range = healthy_angle_range
|
||||||
|
|
||||||
|
self._reset_noise_scale = reset_noise_scale
|
||||||
|
|
||||||
|
self._exclude_current_positions_from_observation = (
|
||||||
|
exclude_current_positions_from_observation
|
||||||
|
)
|
||||||
|
|
||||||
|
mujoco_env.MujocoEnv.__init__(self, xml_file, 4)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def healthy_reward(self):
|
||||||
|
return (
|
||||||
|
float(self.is_healthy or self._terminate_when_unhealthy)
|
||||||
|
* self._healthy_reward
|
||||||
|
)
|
||||||
|
|
||||||
|
def control_cost(self, action):
|
||||||
|
control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
|
||||||
|
return control_cost
|
||||||
|
|
||||||
|
@property
|
||||||
|
def is_healthy(self):
|
||||||
|
z, angle = self.data.qpos[1:3]
|
||||||
|
state = self.state_vector()[2:]
|
||||||
|
|
||||||
|
min_state, max_state = self._healthy_state_range
|
||||||
|
min_z, max_z = self._healthy_z_range
|
||||||
|
min_angle, max_angle = self._healthy_angle_range
|
||||||
|
|
||||||
|
healthy_state = np.all(np.logical_and(min_state < state, state < max_state))
|
||||||
|
healthy_z = min_z < z < max_z
|
||||||
|
healthy_angle = min_angle < angle < max_angle
|
||||||
|
|
||||||
|
is_healthy = all((healthy_state, healthy_z, healthy_angle))
|
||||||
|
|
||||||
|
return is_healthy
|
||||||
|
|
||||||
|
@property
|
||||||
|
def done(self):
|
||||||
|
done = not self.is_healthy if self._terminate_when_unhealthy else False
|
||||||
|
return done
|
||||||
|
|
||||||
|
def _get_obs(self):
|
||||||
|
position = self.data.qpos.flat.copy()
|
||||||
|
velocity = np.clip(self.data.qvel.flat.copy(), -10, 10)
|
||||||
|
|
||||||
|
if self._exclude_current_positions_from_observation:
|
||||||
|
position = position[1:]
|
||||||
|
|
||||||
|
observation = np.concatenate((position, velocity)).ravel()
|
||||||
|
return observation
|
||||||
|
|
||||||
|
def step(self, action):
|
||||||
|
x_position_before = self.data.qpos[0]
|
||||||
|
self.do_simulation(action, self.frame_skip)
|
||||||
|
x_position_after = self.data.qpos[0]
|
||||||
|
x_velocity = (x_position_after - x_position_before) / self.dt
|
||||||
|
|
||||||
|
ctrl_cost = self.control_cost(action)
|
||||||
|
|
||||||
|
forward_reward = self._forward_reward_weight * x_velocity
|
||||||
|
healthy_reward = self.healthy_reward
|
||||||
|
|
||||||
|
rewards = forward_reward + healthy_reward
|
||||||
|
costs = ctrl_cost
|
||||||
|
|
||||||
|
observation = self._get_obs()
|
||||||
|
reward = rewards - costs
|
||||||
|
done = self.done
|
||||||
|
info = {
|
||||||
|
"x_position": x_position_after,
|
||||||
|
"x_velocity": x_velocity,
|
||||||
|
}
|
||||||
|
|
||||||
|
return observation, reward, done, info
|
||||||
|
|
||||||
|
def reset_model(self):
|
||||||
|
noise_low = -self._reset_noise_scale
|
||||||
|
noise_high = self._reset_noise_scale
|
||||||
|
|
||||||
|
qpos = self.init_qpos + self.np_random.uniform(
|
||||||
|
low=noise_low, high=noise_high, size=self.model.nq
|
||||||
|
)
|
||||||
|
qvel = self.init_qvel + self.np_random.uniform(
|
||||||
|
low=noise_low, high=noise_high, size=self.model.nv
|
||||||
|
)
|
||||||
|
|
||||||
|
self.set_state(qpos, qvel)
|
||||||
|
|
||||||
|
observation = self._get_obs()
|
||||||
|
return observation
|
||||||
|
|
||||||
|
def viewer_setup(self):
|
||||||
|
for key, value in DEFAULT_CAMERA_CONFIG.items():
|
||||||
|
if isinstance(value, np.ndarray):
|
||||||
|
getattr(self.viewer.cam, key)[:] = value
|
||||||
|
else:
|
||||||
|
setattr(self.viewer.cam, key, value)
|
@@ -12,7 +12,9 @@ def mass_center(model, sim):
|
|||||||
|
|
||||||
class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
mujoco_env.MujocoEnv.__init__(self, "humanoid.xml", 5)
|
mujoco_env.MujocoEnv.__init__(
|
||||||
|
self, "humanoid.xml", 5, mujoco_bindings="mujoco_py"
|
||||||
|
)
|
||||||
utils.EzPickle.__init__(self)
|
utils.EzPickle.__init__(self)
|
||||||
|
|
||||||
def _get_obs(self):
|
def _get_obs(self):
|
||||||
|
@@ -18,198 +18,6 @@ def mass_center(model, sim):
|
|||||||
|
|
||||||
|
|
||||||
class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
"""
|
|
||||||
### Description
|
|
||||||
|
|
||||||
This environment is based on the environment introduced by Tassa, Erez and Todorov
|
|
||||||
in ["Synthesis and stabilization of complex behaviors through online trajectory optimization"](https://ieeexplore.ieee.org/document/6386025).
|
|
||||||
The 3D bipedal robot is designed to simulate a human. It has a torso (abdomen) with a pair of
|
|
||||||
legs and arms. The legs each consist of two links, and so the arms (representing the knees and
|
|
||||||
elbows respectively). The goal of the environment is to walk forward as fast as possible without falling over.
|
|
||||||
|
|
||||||
### Action Space
|
|
||||||
The action space is a `Box(-1, 1, (17,), float32)`. An action represents the torques applied at the hinge joints.
|
|
||||||
|
|
||||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
|
||||||
|-----|----------------------|---------------|----------------|---------------------------------------|-------|------|
|
|
||||||
| 0 | Torque applied on the hinge in the y-coordinate of the abdomen | -0.4 | 0.4 | hip_1 (front_left_leg) | hinge | torque (N m) |
|
|
||||||
| 1 | Torque applied on the hinge in the z-coordinate of the abdomen | -0.4 | 0.4 | angle_1 (front_left_leg) | hinge | torque (N m) |
|
|
||||||
| 2 | Torque applied on the hinge in the x-coordinate of the abdomen | -0.4 | 0.4 | hip_2 (front_right_leg) | hinge | torque (N m) |
|
|
||||||
| 3 | Torque applied on the rotor between torso/abdomen and the right hip (x-coordinate) | -0.4 | 0.4 | right_hip_x (right_thigh) | hinge | torque (N m) |
|
|
||||||
| 4 | Torque applied on the rotor between torso/abdomen and the right hip (z-coordinate) | -0.4 | 0.4 | right_hip_z (right_thigh) | hinge | torque (N m) |
|
|
||||||
| 5 | Torque applied on the rotor between torso/abdomen and the right hip (y-coordinate) | -0.4 | 0.4 | right_hip_y (right_thigh) | hinge | torque (N m) |
|
|
||||||
| 6 | Torque applied on the rotor between the right hip/thigh and the right shin | -0.4 | 0.4 | right_knee | hinge | torque (N m) |
|
|
||||||
| 7 | Torque applied on the rotor between torso/abdomen and the left hip (x-coordinate) | -0.4 | 0.4 | left_hip_x (left_thigh) | hinge | torque (N m) |
|
|
||||||
| 8 | Torque applied on the rotor between torso/abdomen and the left hip (z-coordinate) | -0.4 | 0.4 | left_hip_z (left_thigh) | hinge | torque (N m) |
|
|
||||||
| 9 | Torque applied on the rotor between torso/abdomen and the left hip (y-coordinate) | -0.4 | 0.4 | left_hip_y (left_thigh) | hinge | torque (N m) |
|
|
||||||
| 10 | Torque applied on the rotor between the left hip/thigh and the left shin | -0.4 | 0.4 | left_knee | hinge | torque (N m) |
|
|
||||||
| 11 | Torque applied on the rotor between the torso and right upper arm (coordinate -1) | -0.4 | 0.4 | right_shoulder1 | hinge | torque (N m) |
|
|
||||||
| 12 | Torque applied on the rotor between the torso and right upper arm (coordinate -2) | -0.4 | 0.4 | right_shoulder2 | hinge | torque (N m) |
|
|
||||||
| 13 | Torque applied on the rotor between the right upper arm and right lower arm | -0.4 | 0.4 | right_elbow | hinge | torque (N m) |
|
|
||||||
| 14 | Torque applied on the rotor between the torso and left upper arm (coordinate -1) | -0.4 | 0.4 | left_shoulder1 | hinge | torque (N m) |
|
|
||||||
| 15 | Torque applied on the rotor between the torso and left upper arm (coordinate -2) | -0.4 | 0.4 | left_shoulder2 | hinge | torque (N m) |
|
|
||||||
| 16 | Torque applied on the rotor between the left upper arm and left lower arm | -0.4 | 0.4 | left_elbow | hinge | torque (N m) |
|
|
||||||
|
|
||||||
### Observation Space
|
|
||||||
|
|
||||||
Observations consist of positional values of different body parts of the Humanoid,
|
|
||||||
followed by the velocities of those individual parts (their derivatives) with all the
|
|
||||||
positions ordered before all the velocities.
|
|
||||||
|
|
||||||
By default, observations do not include the x- and y-coordinates of the torso. These may
|
|
||||||
be included by passing `exclude_current_positions_from_observation=False` during construction.
|
|
||||||
In that case, the observation space will have 378 dimensions where the first two dimensions
|
|
||||||
represent the x- and y-coordinates of the torso.
|
|
||||||
Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates
|
|
||||||
will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively.
|
|
||||||
|
|
||||||
However, by default, the observation is a `ndarray` with shape `(376,)` where the elements correspond to the following:
|
|
||||||
|
|
||||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|
|
||||||
|-----|---------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
|
|
||||||
| 0 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) |
|
|
||||||
| 1 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
|
||||||
| 2 | y-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
|
||||||
| 3 | z-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
|
||||||
| 4 | w-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
|
||||||
| 5 | z-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | angle (rad) |
|
|
||||||
| 6 | y-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | angle (rad) |
|
|
||||||
| 7 | x-angle of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angle (rad) |
|
|
||||||
| 8 | x-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | angle (rad) |
|
|
||||||
| 9 | z-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | angle (rad) |
|
|
||||||
| 10 | y-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | angle (rad) |
|
|
||||||
| 11 | angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | angle (rad) |
|
|
||||||
| 12 | x-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | angle (rad) |
|
|
||||||
| 13 | z-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | angle (rad) |
|
|
||||||
| 14 | y-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | angle (rad) |
|
|
||||||
| 15 | angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | angle (rad) |
|
|
||||||
| 16 | coordinate-1 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | angle (rad) |
|
|
||||||
| 17 | coordinate-2 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | angle (rad) |
|
|
||||||
| 18 | angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | angle (rad) |
|
|
||||||
| 19 | coordinate-1 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | angle (rad) |
|
|
||||||
| 20 | coordinate-2 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | angle (rad) |
|
|
||||||
| 21 | angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angle (rad) |
|
|
||||||
| 22 | x-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
|
||||||
| 23 | y-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
|
||||||
| 24 | z-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
|
||||||
| 25 | x-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
|
|
||||||
| 26 | y-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
|
|
||||||
| 27 | z-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
|
|
||||||
| 28 | z-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | anglular velocity (rad/s) |
|
|
||||||
| 29 | y-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | anglular velocity (rad/s) |
|
|
||||||
| 30 | x-coordinate of angular velocity of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angular velocity (rad/s) |
|
|
||||||
| 31 | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | anglular velocity (rad/s) |
|
|
||||||
| 32 | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | anglular velocity (rad/s) |
|
|
||||||
| 33 | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | anglular velocity (rad/s) |
|
|
||||||
| 34 | angular velocity of the angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | anglular velocity (rad/s) |
|
|
||||||
| 35 | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | anglular velocity (rad/s) |
|
|
||||||
| 36 | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | anglular velocity (rad/s) |
|
|
||||||
| 37 | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | anglular velocity (rad/s) |
|
|
||||||
| 38 | angular velocity of the angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | anglular velocity (rad/s) |
|
|
||||||
| 39 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | anglular velocity (rad/s) |
|
|
||||||
| 40 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | anglular velocity (rad/s) |
|
|
||||||
| 41 | angular velocity of the angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | anglular velocity (rad/s) |
|
|
||||||
| 42 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | anglular velocity (rad/s) |
|
|
||||||
| 43 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | anglular velocity (rad/s) |
|
|
||||||
| 44 | angular velocity of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angular velocity (rad/s) |
|
|
||||||
|
|
||||||
Additionally, after all the positional and velocity based values in the table,
|
|
||||||
the observation contains (in order):
|
|
||||||
- *cinert:* Mass and inertia of a single rigid body relative to the center of mass
|
|
||||||
(this is an intermediate result of transition). It has shape 14*10 (*nbody * 10*)
|
|
||||||
and hence adds to another 140 elements in the state space.
|
|
||||||
- *cvel:* Center of mass based velocity. It has shape 14 * 6 (*nbody * 6*) and hence
|
|
||||||
adds another 84 elements in the state space
|
|
||||||
- *qfrc_actuator:* Constraint force generated as the actuator force. This has shape
|
|
||||||
`(23,)` *(nv * 1)* and hence adds another 23 elements to the state space.
|
|
||||||
- *cfrc_ext:* This is the center of mass based external force on the body. It has shape
|
|
||||||
14 * 6 (*nbody * 6*) and hence adds to another 84 elements in the state space.
|
|
||||||
where *nbody* stands for the number of bodies in the robot and *nv* stands for the
|
|
||||||
number of degrees of freedom (*= dim(qvel)*)
|
|
||||||
|
|
||||||
The (x,y,z) coordinates are translational DOFs while the orientations are rotational
|
|
||||||
DOFs expressed as quaternions. One can read more about free joints on the
|
|
||||||
[Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html).
|
|
||||||
|
|
||||||
**Note:** There have been reported issues that using a Mujoco-Py version > 2.0
|
|
||||||
results in the contact forces always being 0. As such we recommend to use a Mujoco-Py
|
|
||||||
version < 2.0 when using the Humanoid environment if you would like to report results
|
|
||||||
with contact forces (if contact forces are not used in your experiments, you can use
|
|
||||||
version > 2.0).
|
|
||||||
|
|
||||||
### Rewards
|
|
||||||
The reward consists of three parts:
|
|
||||||
- *healthy_reward*: Every timestep that the humanoid is alive (see section Episode Termination for definition), it gets a reward of fixed value `healthy_reward`
|
|
||||||
- *forward_reward*: A reward of walking forward which is measured as *`forward_reward_weight` *
|
|
||||||
(average center of mass before action - average center of mass after action)/dt*.
|
|
||||||
*dt* is the time between actions and is dependent on the frame_skip parameter
|
|
||||||
(default is 5), where the frametime is 0.003 - making the default *dt = 5 * 0.003 = 0.015*.
|
|
||||||
This reward would be positive if the humanoid walks forward (in positive x-direction). The calculation
|
|
||||||
for the center of mass is defined in the `.py` file for the Humanoid.
|
|
||||||
- *ctrl_cost*: A negative reward for penalising the humanoid if it has too
|
|
||||||
large of a control force. If there are *nu* actuators/controls, then the control has
|
|
||||||
shape `nu x 1`. It is measured as *`ctrl_cost_weight` * sum(control<sup>2</sup>)*.
|
|
||||||
- *contact_cost*: A negative reward for penalising the humanoid if the external
|
|
||||||
contact force is too large. It is calculated by clipping
|
|
||||||
*`contact_cost_weight` * sum(external contact force<sup>2</sup>)* to the interval specified by `contact_cost_range`.
|
|
||||||
|
|
||||||
The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost - contact_cost* and `info` will also contain the individual reward terms
|
|
||||||
|
|
||||||
### Starting State
|
|
||||||
All observations start in state
|
|
||||||
(0.0, 0.0, 1.4, 1.0, 0.0 ... 0.0) with a uniform noise in the range
|
|
||||||
of [-`reset_noise_scale`, `reset_noise_scale`] added to the positional and velocity values (values in the table)
|
|
||||||
for stochasticity. Note that the initial z coordinate is intentionally
|
|
||||||
selected to be high, thereby indicating a standing up humanoid. The initial
|
|
||||||
orientation is designed to make it face forward as well.
|
|
||||||
|
|
||||||
### Episode Termination
|
|
||||||
The humanoid is said to be unhealthy if the z-position of the torso is no longer contained in the
|
|
||||||
closed interval specified by the argument `healthy_z_range`.
|
|
||||||
|
|
||||||
If `terminate_when_unhealthy=True` is passed during construction (which is the default),
|
|
||||||
the episode terminates when any of the following happens:
|
|
||||||
|
|
||||||
1. The episode duration reaches a 1000 timesteps
|
|
||||||
2. The humanoid is unhealthy
|
|
||||||
|
|
||||||
If `terminate_when_unhealthy=False` is passed, the episode is terminated only when 1000 timesteps are exceeded.
|
|
||||||
|
|
||||||
|
|
||||||
### Arguments
|
|
||||||
|
|
||||||
No additional arguments are currently supported in v2 and lower.
|
|
||||||
|
|
||||||
```
|
|
||||||
env = gym.make('Humanoid-v2')
|
|
||||||
```
|
|
||||||
|
|
||||||
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
|
||||||
|
|
||||||
```
|
|
||||||
env = gym.make('Humanoid-v3', ctrl_cost_weight=0.1, ....)
|
|
||||||
```
|
|
||||||
|
|
||||||
| Parameter | Type | Default |Description |
|
|
||||||
|-------------------------|------------|-------------------|-------------------------------|
|
|
||||||
| `xml_file` | **str** | `"humanoid.xml"` | Path to a MuJoCo model |
|
|
||||||
| `forward_reward_weight` | **float** | `1.25` | Weight for *forward_reward* term (see section on reward) |
|
|
||||||
| `ctrl_cost_weight` | **float** | `0.1` | Weight for *ctrl_cost* term (see section on reward) |
|
|
||||||
| `contact_cost_weight` | **float** | `5e-7` | Weight for *contact_cost* term (see section on reward) |
|
|
||||||
| `healthy_reward` | **float** | `5.0` | Constant reward given if the humanoid is "healthy" after timestep |
|
|
||||||
| `terminate_when_unhealthy` | **bool**| `True` | If true, issue a done signal if the z-coordinate of the torso is no longer in the `healthy_z_range` |
|
|
||||||
| `healthy_z_range` | **tuple** | `(1.0, 2.0)` | The humanoid is considered healthy if the z-coordinate of the torso is in this range |
|
|
||||||
| `reset_noise_scale` | **float** | `1e-2` | Scale of random perturbations of initial position and velocity (see section on Starting State) |
|
|
||||||
| `exclude_current_positions_from_observation`| **bool** | `True`| Whether or not to omit the x- and y-coordinates from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies |
|
|
||||||
|
|
||||||
### Version History
|
|
||||||
|
|
||||||
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
|
||||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
|
||||||
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
|
||||||
* v0: Initial versions release (1.0.0)
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
xml_file="humanoid.xml",
|
xml_file="humanoid.xml",
|
||||||
@@ -239,7 +47,7 @@ class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
|||||||
exclude_current_positions_from_observation
|
exclude_current_positions_from_observation
|
||||||
)
|
)
|
||||||
|
|
||||||
mujoco_env.MujocoEnv.__init__(self, xml_file, 5)
|
mujoco_env.MujocoEnv.__init__(self, xml_file, 5, mujoco_bindings="mujoco_py")
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def healthy_reward(self):
|
def healthy_reward(self):
|
||||||
|
331
gym/envs/mujoco/humanoid_v4.py
Normal file
331
gym/envs/mujoco/humanoid_v4.py
Normal file
@@ -0,0 +1,331 @@
|
|||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from gym import utils
|
||||||
|
from gym.envs.mujoco import mujoco_env
|
||||||
|
|
||||||
|
# Camera settings applied to the viewer in `viewer_setup`: track body id 1
# from a distance of 4.0, look at the point (0, 0, 2), with elevation -20.0.
DEFAULT_CAMERA_CONFIG = {
    "trackbodyid": 1,
    "distance": 4.0,
    "lookat": np.array((0.0, 0.0, 2.0)),
    "elevation": -20.0,
}
|
||||||
|
|
||||||
|
|
||||||
|
def mass_center(model, data):
    """Return the (x, y) coordinates of the whole-model center of mass.

    Computes the body-mass-weighted average of the per-body center-of-mass
    positions (`data.xipos`) and returns a copy of its first two components.
    """
    body_mass = np.expand_dims(model.body_mass, axis=1)
    com = np.sum(body_mass * data.xipos, axis=0) / np.sum(body_mass)
    return com[0:2].copy()
|
||||||
|
|
||||||
|
|
||||||
|
class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
|
"""
|
||||||
|
### Description
|
||||||
|
|
||||||
|
This environment is based on the environment introduced by Tassa, Erez and Todorov
|
||||||
|
in ["Synthesis and stabilization of complex behaviors through online trajectory optimization"](https://ieeexplore.ieee.org/document/6386025).
|
||||||
|
The 3D bipedal robot is designed to simulate a human. It has a torso (abdomen) with a pair of
|
||||||
|
legs and arms. The legs each consist of two links, and so the arms (representing the knees and
|
||||||
|
elbows respectively). The goal of the environment is to walk forward as fast as possible without falling over.
|
||||||
|
|
||||||
|
### Action Space
|
||||||
|
The agent take a 17-element vector for actions.
|
||||||
|
The action space is a continuous `(action, ...)` all in `[-1, 1]`, where `action`
|
||||||
|
represents the numerical torques applied at the hinge joints.
|
||||||
|
|
||||||
|
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||||
|
|-----|----------------------|---------------|----------------|---------------------------------------|-------|------|
|
||||||
|
| 0 | Torque applied on the hinge in the y-coordinate of the abdomen | -0.4 | 0.4 | hip_1 (front_left_leg) | hinge | torque (N m) |
|
||||||
|
| 1 | Torque applied on the hinge in the z-coordinate of the abdomen | -0.4 | 0.4 | angle_1 (front_left_leg) | hinge | torque (N m) |
|
||||||
|
| 2 | Torque applied on the hinge in the x-coordinate of the abdomen | -0.4 | 0.4 | hip_2 (front_right_leg) | hinge | torque (N m) |
|
||||||
|
| 3 | Torque applied on the rotor between torso/abdomen and the right hip (x-coordinate) | -0.4 | 0.4 | right_hip_x (right_thigh) | hinge | torque (N m) |
|
||||||
|
| 4 | Torque applied on the rotor between torso/abdomen and the right hip (z-coordinate) | -0.4 | 0.4 | right_hip_z (right_thigh) | hinge | torque (N m) |
|
||||||
|
| 5 | Torque applied on the rotor between torso/abdomen and the right hip (y-coordinate) | -0.4 | 0.4 | right_hip_y (right_thigh) | hinge | torque (N m) |
|
||||||
|
| 6 | Torque applied on the rotor between the right hip/thigh and the right shin | -0.4 | 0.4 | right_knee | hinge | torque (N m) |
|
||||||
|
| 7 | Torque applied on the rotor between torso/abdomen and the left hip (x-coordinate) | -0.4 | 0.4 | left_hip_x (left_thigh) | hinge | torque (N m) |
|
||||||
|
| 8 | Torque applied on the rotor between torso/abdomen and the left hip (z-coordinate) | -0.4 | 0.4 | left_hip_z (left_thigh) | hinge | torque (N m) |
|
||||||
|
| 9 | Torque applied on the rotor between torso/abdomen and the left hip (y-coordinate) | -0.4 | 0.4 | left_hip_y (left_thigh) | hinge | torque (N m) |
|
||||||
|
| 10 | Torque applied on the rotor between the left hip/thigh and the left shin | -0.4 | 0.4 | left_knee | hinge | torque (N m) |
|
||||||
|
| 11 | Torque applied on the rotor between the torso and right upper arm (coordinate -1) | -0.4 | 0.4 | right_shoulder1 | hinge | torque (N m) |
|
||||||
|
| 12 | Torque applied on the rotor between the torso and right upper arm (coordinate -2) | -0.4 | 0.4 | right_shoulder2 | hinge | torque (N m) |
|
||||||
|
| 13 | Torque applied on the rotor between the right upper arm and right lower arm | -0.4 | 0.4 | right_elbow | hinge | torque (N m) |
|
||||||
|
| 14 | Torque applied on the rotor between the torso and left upper arm (coordinate -1) | -0.4 | 0.4 | left_shoulder1 | hinge | torque (N m) |
|
||||||
|
| 15 | Torque applied on the rotor between the torso and left upper arm (coordinate -2) | -0.4 | 0.4 | left_shoulder2 | hinge | torque (N m) |
|
||||||
|
| 16 | Torque applied on the rotor between the left upper arm and left lower arm | -0.4 | 0.4 | left_elbow | hinge | torque (N m) |
|
||||||
|
|
||||||
|
### Observation Space
|
||||||
|
|
||||||
|
The state space consists of positional values of different body parts of the Humanoid,
|
||||||
|
followed by the velocities of those individual parts (their derivatives) with all the
|
||||||
|
positions ordered before all the velocities.
|
||||||
|
|
||||||
|
The observation is a `ndarray` with shape `(376,)` where the elements correspond to the following:
|
||||||
|
|
||||||
|
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|
||||||
|
|-----|---------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
|
||||||
|
| 0 | x-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) |
|
||||||
|
| 1 | y-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) |
|
||||||
|
| 2 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) |
|
||||||
|
| 3 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
||||||
|
| 4 | y-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
||||||
|
| 5 | z-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
||||||
|
| 6 | w-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
||||||
|
| 7 | z-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | angle (rad) |
|
||||||
|
| 8 | y-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | angle (rad) |
|
||||||
|
| 9 | x-angle of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angle (rad) |
|
||||||
|
| 10 | x-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | angle (rad) |
|
||||||
|
| 11 | z-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | angle (rad) |
|
||||||
|
| 12 | y-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | angle (rad) |
|
||||||
|
| 13 | angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | angle (rad) |
|
||||||
|
| 14 | x-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | angle (rad) |
|
||||||
|
| 15 | z-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | angle (rad) |
|
||||||
|
| 16 | y-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | angle (rad) |
|
||||||
|
| 17 | angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | angle (rad) |
|
||||||
|
| 18 | coordinate-1 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | angle (rad) |
|
||||||
|
| 19 | coordinate-2 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | angle (rad) |
|
||||||
|
| 20 | angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | angle (rad) |
|
||||||
|
| 21 | coordinate-1 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | angle (rad) |
|
||||||
|
| 22 | coordinate-2 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | angle (rad) |
|
||||||
|
| 23 | angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angle (rad) |
|
||||||
|
| 24 | x-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
||||||
|
| 25 | y-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
||||||
|
| 26 | z-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
||||||
|
| 27 | x-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
|
||||||
|
| 28 | y-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
|
||||||
|
| 29 | z-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
|
||||||
|
| 30 | z-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | anglular velocity (rad/s) |
|
||||||
|
| 31 | y-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | anglular velocity (rad/s) |
|
||||||
|
| 32 | x-coordinate of angular velocity of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angular velocity (rad/s) |
|
||||||
|
| 33 | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | anglular velocity (rad/s) |
|
||||||
|
| 34 | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | anglular velocity (rad/s) |
|
||||||
|
| 35 | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | anglular velocity (rad/s) |
|
||||||
|
| 36 | angular velocity of the angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | anglular velocity (rad/s) |
|
||||||
|
| 37 | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | anglular velocity (rad/s) |
|
||||||
|
| 38 | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | anglular velocity (rad/s) |
|
||||||
|
| 39 | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | anglular velocity (rad/s) |
|
||||||
|
| 40 | angular velocity of the angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | anglular velocity (rad/s) |
|
||||||
|
| 41 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | anglular velocity (rad/s) |
|
||||||
|
| 42 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | anglular velocity (rad/s) |
|
||||||
|
| 43 | angular velocity of the angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | anglular velocity (rad/s) |
|
||||||
|
| 44 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | anglular velocity (rad/s) |
|
||||||
|
| 45 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | anglular velocity (rad/s) |
|
||||||
|
| 46 | angular velocity of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angular velocity (rad/s) |
|
||||||
|
|
||||||
|
Additionally, after all the positional and velocity based values in the table,
|
||||||
|
the state_space consists of (in order):
|
||||||
|
- *cinert:* Mass and inertia of a single rigid body relative to the center of mass
|
||||||
|
(this is an intermediate result of transition). It has shape 14*10 (*nbody * 10*)
|
||||||
|
and hence adds to another 140 elements in the state space.
|
||||||
|
- *cvel:* Center of mass based velocity. It has shape 14 * 6 (*nbody * 6*) and hence
|
||||||
|
adds another 84 elements in the state space
|
||||||
|
- *qfrc_actuator:* Constraint force generated as the actuator force. This has shape
|
||||||
|
`(23,)` *(nv * 1)* and hence adds another 23 elements to the state space.
|
||||||
|
- *cfrc_ext:* This is the center of mass based external force on the body. It has shape
|
||||||
|
14 * 6 (*nbody * 6*) and hence adds to another 84 elements in the state space.
|
||||||
|
where *nbody* stands for the number of bodies in the robot and *nv* stands for the
|
||||||
|
number of degrees of freedom (*= dim(qvel)*)
|
||||||
|
|
||||||
|
The (x,y,z) coordinates are translational DOFs while the orientations are rotational
|
||||||
|
DOFs expressed as quaternions. One can read more about free joints on the
|
||||||
|
[Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html).
|
||||||
|
|
||||||
|
**Note:** There are 47 elements in the table above - giving rise to `(378,)`
|
||||||
|
elements in the state space. In practice (and Gym implementation), the first two
|
||||||
|
positional elements are omitted from the state space since the reward function is
|
||||||
|
calculated based on the x-coordinate value. This value is hidden from the algorithm,
|
||||||
|
which in turn has to develop an abstract understanding of it from the observed rewards.
|
||||||
|
Therefore, observation space has shape `(376,)` instead of `(378,)` and the table should
|
||||||
|
not have the first two rows.
|
||||||
|
|
||||||
|
**Note:** Humanoid-v4 environment no longer has the following contact forces issue.
|
||||||
|
If using previous Humanoid versions from v4, there have been reported issues that using
|
||||||
|
a Mujoco-Py version > 2.0 results in the contact forces always being 0. As such we recommend
|
||||||
|
to use a Mujoco-Py version < 2.0 when using the Humanoid environment if you would like to report
|
||||||
|
results with contact forces (if contact forces are not used in your experiments, you can use
|
||||||
|
version > 2.0).
|
||||||
|
|
||||||
|
### Rewards
|
||||||
|
The reward consists of three parts:
|
||||||
|
- *alive_bonus*: Every timestep that the humanoid is alive, it gets a reward of 5.
|
||||||
|
- *lin_vel_cost*: A reward of walking forward which is measured as *1.25 *
|
||||||
|
(average center of mass before action - average center of mass after action)/dt*.
|
||||||
|
*dt* is the time between actions and is dependent on the frame_skip parameter
|
||||||
|
(default is 5), where the *dt* for one frame is 0.003 - making the default *dt = 5 * 0.003 = 0.015*.
|
||||||
|
This reward would be positive if the humanoid walks forward (in the positive x-direction) as desired. The calculation
|
||||||
|
for the center of mass is defined in the `.py` file for the Humanoid.
|
||||||
|
- *quad_ctrl_cost*: A negative reward for penalising the humanoid if it has too
|
||||||
|
large of a control force. If there are *nu* actuators/controls, then the control has
|
||||||
|
shape `nu x 1`. It is measured as *0.1 **x** sum(control<sup>2</sup>)*.
|
||||||
|
- *quad_impact_cost*: A negative reward for penalising the humanoid if the external
|
||||||
|
contact force is too large. It is calculated as
|
||||||
|
*min(0.5 * 0.000001 * sum(external contact force<sup>2</sup>), 10)*.
|
||||||
|
|
||||||
|
The total reward returned is ***reward*** *=* *alive_bonus + lin_vel_cost - quad_ctrl_cost - quad_impact_cost*
|
||||||
|
|
||||||
|
### Starting State
|
||||||
|
All observations start in state
|
||||||
|
(0.0, 0.0, 1.4, 1.0, 0.0 ... 0.0) with a uniform noise in the range
|
||||||
|
of [-0.01, 0.01] added to the positional and velocity values (values in the table)
|
||||||
|
for stochasticity. Note that the initial z coordinate is intentionally
|
||||||
|
selected to be high, thereby indicating a standing up humanoid. The initial
|
||||||
|
orientation is designed to make it face forward as well.
|
||||||
|
|
||||||
|
### Episode Termination
|
||||||
|
The episode terminates when any of the following happens:
|
||||||
|
|
||||||
|
1. The episode duration reaches a 1000 timesteps
|
||||||
|
2. Any of the state space values is no longer finite
|
||||||
|
3. The z-coordinate of the torso (index 0 in state space OR index 2 in the table) is **not** in the range `[1.0, 2.0]` (the humanoid has fallen or is about to fall beyond recovery).
|
||||||
|
|
||||||
|
### Arguments
|
||||||
|
|
||||||
|
No additional arguments are currently supported (in v2 and lower), but
|
||||||
|
modifications can be made to the XML file in the assets folder (or by changing
|
||||||
|
the path to a modified XML file in another folder).
|
||||||
|
|
||||||
|
```
|
||||||
|
env = gym.make('Humanoid-v2')
|
||||||
|
```
|
||||||
|
|
||||||
|
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||||
|
|
||||||
|
```
|
||||||
|
env = gym.make('Humanoid-v3', ctrl_cost_weight=0.1, ...)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Version History
|
||||||
|
|
||||||
|
* v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3
|
||||||
|
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
||||||
|
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||||
|
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
||||||
|
* v0: Initial versions release (1.0.0)
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
    self,
    xml_file="humanoid.xml",
    forward_reward_weight=1.25,
    ctrl_cost_weight=0.1,
    healthy_reward=5.0,
    terminate_when_unhealthy=True,
    healthy_z_range=(1.0, 2.0),
    reset_noise_scale=1e-2,
    exclude_current_positions_from_observation=True,
):
    """Configure the Humanoid environment.

    Args:
        xml_file: MuJoCo model to load.
        forward_reward_weight: scale applied to the forward (x) velocity reward.
        ctrl_cost_weight: scale applied to the quadratic control penalty.
        healthy_reward: alive bonus granted each healthy step.
        terminate_when_unhealthy: end the episode when the torso leaves
            the healthy height range.
        healthy_z_range: (min, max) torso z-height considered healthy.
        reset_noise_scale: half-width of the uniform noise added on reset.
        exclude_current_positions_from_observation: drop the absolute x/y
            torso position from the observation (position-agnostic policies).
    """
    # EzPickle records the constructor arguments (captured via locals())
    # so the environment can be pickled and reconstructed identically.
    utils.EzPickle.__init__(**locals())

    # Reward shaping knobs.
    self._forward_reward_weight = forward_reward_weight
    self._ctrl_cost_weight = ctrl_cost_weight
    self._healthy_reward = healthy_reward

    # Termination / health criteria.
    self._terminate_when_unhealthy = terminate_when_unhealthy
    self._healthy_z_range = healthy_z_range

    self._reset_noise_scale = reset_noise_scale
    self._exclude_current_positions_from_observation = (
        exclude_current_positions_from_observation
    )

    # frame_skip=5: each env step advances the simulation 5 frames.
    mujoco_env.MujocoEnv.__init__(self, xml_file, 5)
|
||||||
|
|
||||||
|
@property
def healthy_reward(self):
    """Alive bonus for the current step.

    The bonus is paid whenever the humanoid is healthy, and also when
    ``terminate_when_unhealthy`` is disabled (the episode keeps running,
    so the bonus is granted unconditionally in that mode).
    """
    is_rewarded = self.is_healthy or self._terminate_when_unhealthy
    return self._healthy_reward * float(is_rewarded)
|
||||||
|
|
||||||
|
def control_cost(self, action):
    """Quadratic penalty on the actuator controls.

    NOTE: ``action`` is intentionally unused here; the cost is computed
    from the simulator's current control vector (``self.data.ctrl``),
    i.e. the torques actually applied after ``do_simulation``.
    """
    squared_controls = np.square(self.data.ctrl)
    return self._ctrl_cost_weight * np.sum(squared_controls)
|
||||||
|
|
||||||
|
@property
def is_healthy(self):
    """True while the torso height (``qpos[2]``) lies strictly inside the healthy z-range."""
    lower, upper = self._healthy_z_range
    torso_height = self.data.qpos[2]
    return lower < torso_height < upper
|
||||||
|
|
||||||
|
@property
def done(self):
    """Episode-termination flag.

    An unhealthy state ends the episode only when
    ``terminate_when_unhealthy`` is enabled; otherwise the episode never
    terminates from this check.
    """
    if not self._terminate_when_unhealthy:
        return False
    return not self.is_healthy
|
||||||
|
|
||||||
|
def _get_obs(self):
|
||||||
|
position = self.data.qpos.flat.copy()
|
||||||
|
velocity = self.data.qvel.flat.copy()
|
||||||
|
|
||||||
|
com_inertia = self.data.cinert.flat.copy()
|
||||||
|
com_velocity = self.data.cvel.flat.copy()
|
||||||
|
|
||||||
|
actuator_forces = self.data.qfrc_actuator.flat.copy()
|
||||||
|
external_contact_forces = self.data.cfrc_ext.flat.copy()
|
||||||
|
|
||||||
|
if self._exclude_current_positions_from_observation:
|
||||||
|
position = position[2:]
|
||||||
|
|
||||||
|
return np.concatenate(
|
||||||
|
(
|
||||||
|
position,
|
||||||
|
velocity,
|
||||||
|
com_inertia,
|
||||||
|
com_velocity,
|
||||||
|
actuator_forces,
|
||||||
|
external_contact_forces,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
def step(self, action):
    """Advance the simulation by one control step.

    The reward is forward progress (x-velocity of the center of mass,
    scaled) plus the alive bonus, minus the quadratic control cost.

    Returns:
        (observation, reward, done, info) in the classic Gym API shape.
    """
    com_before = mass_center(self.model, self.data)
    self.do_simulation(action, self.frame_skip)
    com_after = mass_center(self.model, self.data)

    # Finite-difference velocity of the center of mass over the step.
    x_velocity, y_velocity = (com_after - com_before) / self.dt

    ctrl_cost = self.control_cost(action)
    forward_reward = self._forward_reward_weight * x_velocity
    healthy_reward = self.healthy_reward

    observation = self._get_obs()
    reward = forward_reward + healthy_reward - ctrl_cost
    info = {
        "reward_linvel": forward_reward,
        "reward_quadctrl": -ctrl_cost,
        "reward_alive": healthy_reward,
        "x_position": com_after[0],
        "y_position": com_after[1],
        "distance_from_origin": np.linalg.norm(com_after, ord=2),
        "x_velocity": x_velocity,
        "y_velocity": y_velocity,
        "forward_reward": forward_reward,
    }

    return observation, reward, self.done, info
|
||||||
|
|
||||||
|
def reset_model(self):
    """Reset joint positions/velocities to the initial pose plus uniform noise.

    Noise is drawn i.i.d. from ``U(-reset_noise_scale, reset_noise_scale)``
    for every qpos and qvel entry, then the new state is pushed into the
    simulator. Returns the first observation of the new episode.
    """
    scale = self._reset_noise_scale

    qpos = self.init_qpos + self.np_random.uniform(
        low=-scale, high=scale, size=self.model.nq
    )
    qvel = self.init_qvel + self.np_random.uniform(
        low=-scale, high=scale, size=self.model.nv
    )
    self.set_state(qpos, qvel)

    return self._get_obs()
|
||||||
|
|
||||||
|
def viewer_setup(self):
    """Apply ``DEFAULT_CAMERA_CONFIG`` to the viewer camera.

    Array-valued entries are copied element-wise into the existing
    camera attribute (the underlying buffer is fixed-size, so it must be
    written in place); scalar entries are assigned directly.
    """
    camera = self.viewer.cam
    for attr_name, attr_value in DEFAULT_CAMERA_CONFIG.items():
        if isinstance(attr_value, np.ndarray):
            getattr(camera, attr_name)[:] = attr_value
        else:
            setattr(camera, attr_name, attr_value)
|
@@ -5,178 +5,10 @@ from gym.envs.mujoco import mujoco_env
|
|||||||
|
|
||||||
|
|
||||||
class HumanoidStandupEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
class HumanoidStandupEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
"""
|
|
||||||
### Description
|
|
||||||
|
|
||||||
This environment is based on the environment introduced by Tassa, Erez and Todorov
|
|
||||||
in ["Synthesis and stabilization of complex behaviors through online trajectory optimization"](https://ieeexplore.ieee.org/document/6386025).
|
|
||||||
The 3D bipedal robot is designed to simulate a human. It has a torso (abdomen) with a
|
|
||||||
pair of legs and arms. The legs each consist of two links, and so the arms (representing the
|
|
||||||
knees and elbows respectively). The environment starts with the humanoid lying on the ground,
|
|
||||||
and then the goal of the environment is to make the humanoid standup and then keep it standing
|
|
||||||
by applying torques on the various hinges.
|
|
||||||
|
|
||||||
### Action Space
|
|
||||||
The agent take a 17-element vector for actions.
|
|
||||||
|
|
||||||
The action space is a continuous `(action, ...)` all in `[-1, 1]`, where `action`
|
|
||||||
represents the numerical torques applied at the hinge joints.
|
|
||||||
|
|
||||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
|
||||||
|-----|----------------------|---------------|----------------|---------------------------------------|-------|------|
|
|
||||||
| 0 | Torque applied on the hinge in the y-coordinate of the abdomen | -0.4 | 0.4 | hip_1 (front_left_leg) | hinge | torque (N m) |
|
|
||||||
| 1 | Torque applied on the hinge in the z-coordinate of the abdomen | -0.4 | 0.4 | angle_1 (front_left_leg) | hinge | torque (N m) |
|
|
||||||
| 2 | Torque applied on the hinge in the x-coordinate of the abdomen | -0.4 | 0.4 | hip_2 (front_right_leg) | hinge | torque (N m) |
|
|
||||||
| 3 | Torque applied on the rotor between torso/abdomen and the right hip (x-coordinate) | -0.4 | 0.4 | right_hip_x (right_thigh) | hinge | torque (N m) |
|
|
||||||
| 4 | Torque applied on the rotor between torso/abdomen and the right hip (z-coordinate) | -0.4 | 0.4 | right_hip_z (right_thigh) | hinge | torque (N m) |
|
|
||||||
| 5 | Torque applied on the rotor between torso/abdomen and the right hip (y-coordinate) | -0.4 | 0.4 | right_hip_y (right_thigh) | hinge | torque (N m) |
|
|
||||||
| 6 | Torque applied on the rotor between the right hip/thigh and the right shin | -0.4 | 0.4 | right_knee | hinge | torque (N m) |
|
|
||||||
| 7 | Torque applied on the rotor between torso/abdomen and the left hip (x-coordinate) | -0.4 | 0.4 | left_hip_x (left_thigh) | hinge | torque (N m) |
|
|
||||||
| 8 | Torque applied on the rotor between torso/abdomen and the left hip (z-coordinate) | -0.4 | 0.4 | left_hip_z (left_thigh) | hinge | torque (N m) |
|
|
||||||
| 9 | Torque applied on the rotor between torso/abdomen and the left hip (y-coordinate) | -0.4 | 0.4 | left_hip_y (left_thigh) | hinge | torque (N m) |
|
|
||||||
| 10 | Torque applied on the rotor between the left hip/thigh and the left shin | -0.4 | 0.4 | left_knee | hinge | torque (N m) |
|
|
||||||
| 11 | Torque applied on the rotor between the torso and right upper arm (coordinate -1) | -0.4 | 0.4 | right_shoulder1 | hinge | torque (N m) |
|
|
||||||
| 12 | Torque applied on the rotor between the torso and right upper arm (coordinate -2) | -0.4 | 0.4 | right_shoulder2 | hinge | torque (N m) |
|
|
||||||
| 13 | Torque applied on the rotor between the right upper arm and right lower arm | -0.4 | 0.4 | right_elbow | hinge | torque (N m) |
|
|
||||||
| 14 | Torque applied on the rotor between the torso and left upper arm (coordinate -1) | -0.4 | 0.4 | left_shoulder1 | hinge | torque (N m) |
|
|
||||||
| 15 | Torque applied on the rotor between the torso and left upper arm (coordinate -2) | -0.4 | 0.4 | left_shoulder2 | hinge | torque (N m) |
|
|
||||||
| 16 | Torque applied on the rotor between the left upper arm and left lower arm | -0.4 | 0.4 | left_elbow | hinge | torque (N m) |
|
|
||||||
|
|
||||||
### Observation Space
|
|
||||||
|
|
||||||
The state space consists of positional values of different body parts of the Humanoid,
|
|
||||||
followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities.
|
|
||||||
|
|
||||||
**Note:** The x- and y-coordinates of the torso are being omitted to produce position-agnostic behavior in policies
|
|
||||||
|
|
||||||
The observation is a `ndarray` with shape `(376,)` where the elements correspond to the following:
|
|
||||||
|
|
||||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|
|
||||||
|-----|---------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
|
|
||||||
| 0 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) |
|
|
||||||
| 1 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
|
||||||
| 2 | y-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
|
||||||
| 3 | z-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
|
||||||
| 4 | w-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
|
||||||
| 5 | z-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | angle (rad) |
|
|
||||||
| 6 | y-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | angle (rad) |
|
|
||||||
| 7 | x-angle of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angle (rad) |
|
|
||||||
| 8 | x-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | angle (rad) |
|
|
||||||
| 9 | z-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | angle (rad) |
|
|
||||||
| 10 | y-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | angle (rad) |
|
|
||||||
| 11 | angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | angle (rad) |
|
|
||||||
| 12 | x-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | angle (rad) |
|
|
||||||
| 13 | z-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | angle (rad) |
|
|
||||||
| 14 | y-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | angle (rad) |
|
|
||||||
| 15 | angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | angle (rad) |
|
|
||||||
| 16 | coordinate-1 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | angle (rad) |
|
|
||||||
| 17 | coordinate-2 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | angle (rad) |
|
|
||||||
| 18 | angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | angle (rad) |
|
|
||||||
| 19 | coordinate-1 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | angle (rad) |
|
|
||||||
| 20 | coordinate-2 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | angle (rad) |
|
|
||||||
| 21 | angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angle (rad) |
|
|
||||||
| 22 | x-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
|
||||||
| 23 | y-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
|
||||||
| 24 | z-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
|
||||||
| 25 | x-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
|
|
||||||
| 26 | y-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
|
|
||||||
| 27 | z-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
|
|
||||||
| 28 | z-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | anglular velocity (rad/s) |
|
|
||||||
| 29 | y-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | anglular velocity (rad/s) |
|
|
||||||
| 30 | x-coordinate of angular velocity of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angular velocity (rad/s) |
|
|
||||||
| 31 | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | anglular velocity (rad/s) |
|
|
||||||
| 32 | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | anglular velocity (rad/s) |
|
|
||||||
| 33 | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | anglular velocity (rad/s) |
|
|
||||||
| 35 | angular velocity of the angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | anglular velocity (rad/s) |
|
|
||||||
| 36 | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | anglular velocity (rad/s) |
|
|
||||||
| 37 | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | anglular velocity (rad/s) |
|
|
||||||
| 38 | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | anglular velocity (rad/s) |
|
|
||||||
| 39 | angular velocity of the angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | anglular velocity (rad/s) |
|
|
||||||
| 40 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | anglular velocity (rad/s) |
|
|
||||||
| 41 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | anglular velocity (rad/s) |
|
|
||||||
| 42 | angular velocity of the angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | anglular velocity (rad/s) |
|
|
||||||
| 43 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | anglular velocity (rad/s) |
|
|
||||||
| 44 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | anglular velocity (rad/s) |
|
|
||||||
| 45 | angular velocity of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angular velocity (rad/s) |
|
|
||||||
|
|
||||||
|
|
||||||
Additionally, after all the positional and velocity based values in the table,
|
|
||||||
the state_space consists of (in order):
|
|
||||||
- *cinert:* Mass and inertia of a single rigid body relative to the center of mass
|
|
||||||
(this is an intermediate result of transition). It has shape 14*10 (*nbody * 10*)
|
|
||||||
and hence adds to another 140 elements in the state space.
|
|
||||||
- *cvel:* Center of mass based velocity. It has shape 14 * 6 (*nbody * 6*) and hence
|
|
||||||
adds another 84 elements in the state space
|
|
||||||
- *qfrc_actuator:* Constraint force generated as the actuator force. This has shape
|
|
||||||
`(23,)` *(nv * 1)* and hence adds another 23 elements to the state space.
|
|
||||||
- *cfrc_ext:* This is the center of mass based external force on the body. It has shape
|
|
||||||
14 * 6 (*nbody * 6*) and hence adds to another 84 elements in the state space.
|
|
||||||
where *nbody* stands for the number of bodies in the robot and *nv* stands for the number
|
|
||||||
of degrees of freedom (*= dim(qvel)*)
|
|
||||||
|
|
||||||
The (x,y,z) coordinates are translational DOFs while the orientations are rotational
|
|
||||||
DOFs expressed as quaternions. One can read more about free joints on the
|
|
||||||
[Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html).
|
|
||||||
|
|
||||||
**Note:** There have been reported issues that using a Mujoco-Py version > 2.0 results
|
|
||||||
in the contact forces always being 0. As such we recommend to use a Mujoco-Py version < 2.0
|
|
||||||
when using the Humanoid environment if you would like to report results with contact forces
|
|
||||||
(if contact forces are not used in your experiments, you can use version > 2.0).
|
|
||||||
|
|
||||||
### Rewards
|
|
||||||
The reward consists of three parts:
|
|
||||||
- *uph_cost*: A reward for moving upward (in an attempt to stand up). This is not a relative
|
|
||||||
reward which measures how much upward it has moved from the last timestep, but it is an
|
|
||||||
absolute reward which measures how much upward the Humanoid has moved overall. It is
|
|
||||||
measured as *(z coordinate after action - 0)/(atomic timestep)*, where *z coordinate after
|
|
||||||
action* is index 0 in the state/index 2 in the table, and *atomic timestep* is the time for
|
|
||||||
one frame of movement even though the simulation has a framerate of 5 (done in order to inflate
|
|
||||||
rewards a little for faster learning).
|
|
||||||
- *quad_ctrl_cost*: A negative reward for penalising the humanoid if it has too large of
|
|
||||||
a control force. If there are *nu* actuators/controls, then the control has shape `nu x 1`.
|
|
||||||
It is measured as *0.1 **x** sum(control<sup>2</sup>)*.
|
|
||||||
- *quad_impact_cost*: A negative reward for penalising the humanoid if the external
|
|
||||||
contact force is too large. It is calculated as *min(0.5 * 0.000001 * sum(external
|
|
||||||
contact force<sup>2</sup>), 10)*.
|
|
||||||
|
|
||||||
The total reward returned is ***reward*** *=* *uph_cost + 1 - quad_ctrl_cost - quad_impact_cost*
|
|
||||||
|
|
||||||
### Starting State
|
|
||||||
All observations start in state
|
|
||||||
(0.0, 0.0, 0.105, 1.0, 0.0 ... 0.0) with a uniform noise in the range of
|
|
||||||
[-0.01, 0.01] added to the positional and velocity values (values in the table)
|
|
||||||
for stochasticity. Note that the initial z coordinate is intentionally selected
|
|
||||||
to be low, thereby indicating a laying down humanoid. The initial orientation is
|
|
||||||
designed to make it face forward as well.
|
|
||||||
|
|
||||||
### Episode Termination
|
|
||||||
The episode terminates when any of the following happens:
|
|
||||||
|
|
||||||
1. The episode duration reaches a 1000 timesteps
|
|
||||||
2. Any of the state space values is no longer finite
|
|
||||||
|
|
||||||
### Arguments
|
|
||||||
|
|
||||||
No additional arguments are currently supported.
|
|
||||||
|
|
||||||
```
|
|
||||||
env = gym.make('HumanoidStandup-v2')
|
|
||||||
```
|
|
||||||
|
|
||||||
There is no v3 for HumanoidStandup, unlike the robot environments where a v3 and
|
|
||||||
beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
|
||||||
|
|
||||||
|
|
||||||
### Version History
|
|
||||||
|
|
||||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
|
||||||
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
|
||||||
* v0: Initial versions release (1.0.0)
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self):
    """Build the HumanoidStandup environment.

    Explicitly requests the legacy ``mujoco_py`` bindings; the new
    ``mujoco``-based version of this environment lives in
    ``humanoidstandup_v4.py``.
    """
    # frame_skip=5: each env step advances the simulation 5 frames.
    mujoco_env.MujocoEnv.__init__(
        self, "humanoidstandup.xml", 5, mujoco_bindings="mujoco_py"
    )
    # No constructor arguments to record beyond self.
    utils.EzPickle.__init__(self)
|
||||||
|
|
||||||
def _get_obs(self):
|
def _get_obs(self):
|
||||||
|
247
gym/envs/mujoco/humanoidstandup_v4.py
Normal file
247
gym/envs/mujoco/humanoidstandup_v4.py
Normal file
@@ -0,0 +1,247 @@
|
|||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from gym import utils
|
||||||
|
from gym.envs.mujoco import mujoco_env
|
||||||
|
|
||||||
|
|
||||||
|
class HumanoidStandupEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
|
"""
|
||||||
|
### Description
|
||||||
|
|
||||||
|
This environment is based on the environment introduced by Tassa, Erez and Todorov
|
||||||
|
in ["Synthesis and stabilization of complex behaviors through online trajectory optimization"](https://ieeexplore.ieee.org/document/6386025).
|
||||||
|
The 3D bipedal robot is designed to simulate a human. It has a torso (abdomen) with a
|
||||||
|
pair of legs and arms. The legs each consist of two links, and so the arms (representing the
|
||||||
|
knees and elbows respectively). The environment starts with the humanoid lying on the ground,
|
||||||
|
and then the goal of the environment is to make the humanoid standup and then keep it standing
|
||||||
|
by applying torques on the various hinges.
|
||||||
|
|
||||||
|
### Action Space
|
||||||
|
The agent take a 17-element vector for actions.
|
||||||
|
|
||||||
|
The action space is a continuous `(action, ...)` all in `[-1, 1]`, where `action`
|
||||||
|
represents the numerical torques applied at the hinge joints.
|
||||||
|
|
||||||
|
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||||
|
|-----|----------------------|---------------|----------------|---------------------------------------|-------|------|
|
||||||
|
| 0 | Torque applied on the hinge in the y-coordinate of the abdomen | -0.4 | 0.4 | hip_1 (front_left_leg) | hinge | torque (N m) |
|
||||||
|
| 1 | Torque applied on the hinge in the z-coordinate of the abdomen | -0.4 | 0.4 | angle_1 (front_left_leg) | hinge | torque (N m) |
|
||||||
|
| 2 | Torque applied on the hinge in the x-coordinate of the abdomen | -0.4 | 0.4 | hip_2 (front_right_leg) | hinge | torque (N m) |
|
||||||
|
| 3 | Torque applied on the rotor between torso/abdomen and the right hip (x-coordinate) | -0.4 | 0.4 | right_hip_x (right_thigh) | hinge | torque (N m) |
|
||||||
|
| 4 | Torque applied on the rotor between torso/abdomen and the right hip (z-coordinate) | -0.4 | 0.4 | right_hip_z (right_thigh) | hinge | torque (N m) |
|
||||||
|
| 5 | Torque applied on the rotor between torso/abdomen and the right hip (y-coordinate) | -0.4 | 0.4 | right_hip_y (right_thigh) | hinge | torque (N m) |
|
||||||
|
| 6 | Torque applied on the rotor between the right hip/thigh and the right shin | -0.4 | 0.4 | right_knee | hinge | torque (N m) |
|
||||||
|
| 7 | Torque applied on the rotor between torso/abdomen and the left hip (x-coordinate) | -0.4 | 0.4 | left_hip_x (left_thigh) | hinge | torque (N m) |
|
||||||
|
| 8 | Torque applied on the rotor between torso/abdomen and the left hip (z-coordinate) | -0.4 | 0.4 | left_hip_z (left_thigh) | hinge | torque (N m) |
|
||||||
|
| 9 | Torque applied on the rotor between torso/abdomen and the left hip (y-coordinate) | -0.4 | 0.4 | left_hip_y (left_thigh) | hinge | torque (N m) |
|
||||||
|
| 10 | Torque applied on the rotor between the left hip/thigh and the left shin | -0.4 | 0.4 | left_knee | hinge | torque (N m) |
|
||||||
|
| 11 | Torque applied on the rotor between the torso and right upper arm (coordinate -1) | -0.4 | 0.4 | right_shoulder1 | hinge | torque (N m) |
|
||||||
|
| 12 | Torque applied on the rotor between the torso and right upper arm (coordinate -2) | -0.4 | 0.4 | right_shoulder2 | hinge | torque (N m) |
|
||||||
|
| 13 | Torque applied on the rotor between the right upper arm and right lower arm | -0.4 | 0.4 | right_elbow | hinge | torque (N m) |
|
||||||
|
| 14 | Torque applied on the rotor between the torso and left upper arm (coordinate -1) | -0.4 | 0.4 | left_shoulder1 | hinge | torque (N m) |
|
||||||
|
| 15 | Torque applied on the rotor between the torso and left upper arm (coordinate -2) | -0.4 | 0.4 | left_shoulder2 | hinge | torque (N m) |
|
||||||
|
| 16 | Torque applied on the rotor between the left upper arm and left lower arm | -0.4 | 0.4 | left_elbow | hinge | torque (N m) |
|
||||||
|
|
||||||
|
### Observation Space
|
||||||
|
|
||||||
|
The state space consists of positional values of different body parts of the Humanoid,
|
||||||
|
followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities.
|
||||||
|
|
||||||
|
The observation is a `ndarray` with shape `(376,)` where the elements correspond to the following:
|
||||||
|
|
||||||
|
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|
||||||
|
|-----|---------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
|
||||||
|
| 0 | x-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) |
|
||||||
|
| 1 | y-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) |
|
||||||
|
| 2 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) |
|
||||||
|
| 3 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
||||||
|
| 4 | y-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
||||||
|
| 5 | z-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
||||||
|
| 6 | w-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
||||||
|
| 7 | z-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | angle (rad) |
|
||||||
|
| 8 | y-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | angle (rad) |
|
||||||
|
| 9 | x-angle of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angle (rad) |
|
||||||
|
| 10 | x-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | angle (rad) |
|
||||||
|
| 11 | z-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | angle (rad) |
|
||||||
|
| 12 | y-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | angle (rad) |
|
||||||
|
| 13 | angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | angle (rad) |
|
||||||
|
| 14 | x-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | angle (rad) |
|
||||||
|
| 15 | z-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | angle (rad) |
|
||||||
|
| 16 | y-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | angle (rad) |
|
||||||
|
| 17 | angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | angle (rad) |
|
||||||
|
| 18 | coordinate-1 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | angle (rad) |
|
||||||
|
| 19 | coordinate-2 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | angle (rad) |
|
||||||
|
| 20 | angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | angle (rad) |
|
||||||
|
| 21 | coordinate-1 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | angle (rad) |
|
||||||
|
| 22 | coordinate-2 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | angle (rad) |
|
||||||
|
| 23 | angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angle (rad) |
|
||||||
|
| 24 | x-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
||||||
|
| 25 | y-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
||||||
|
| 26 | z-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
||||||
|
| 27 | x-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
|
||||||
|
| 28 | y-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
|
||||||
|
| 29 | z-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
|
||||||
|
| 30 | z-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | anglular velocity (rad/s) |
|
||||||
|
| 31 | y-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | anglular velocity (rad/s) |
|
||||||
|
| 32 | x-coordinate of angular velocity of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angular velocity (rad/s) |
|
||||||
|
| 33 | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | anglular velocity (rad/s) |
|
||||||
|
| 34 | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | anglular velocity (rad/s) |
|
||||||
|
| 35 | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | anglular velocity (rad/s) |
|
||||||
|
| 36 | angular velocity of the angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | anglular velocity (rad/s) |
|
||||||
|
| 37 | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | anglular velocity (rad/s) |
|
||||||
|
| 38 | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | anglular velocity (rad/s) |
|
||||||
|
| 39 | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | anglular velocity (rad/s) |
|
||||||
|
| 40 | angular velocity of the angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | anglular velocity (rad/s) |
|
||||||
|
| 41 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | anglular velocity (rad/s) |
|
||||||
|
| 42 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | anglular velocity (rad/s) |
|
||||||
|
| 43 | angular velocity of the angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | anglular velocity (rad/s) |
|
||||||
|
| 44 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | anglular velocity (rad/s) |
|
||||||
|
| 45 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | anglular velocity (rad/s) |
|
||||||
|
| 46 | angular velocity of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angular velocity (rad/s) |
|
||||||
|
|
||||||
|
|
||||||
|
Additionally, after all the positional and velocity based values in the table,
|
||||||
|
the state_space consists of (in order):
|
||||||
|
- *cinert:* Mass and inertia of a single rigid body relative to the center of mass
|
||||||
|
(this is an intermediate result of transition). It has shape 14*10 (*nbody * 10*)
|
||||||
|
and hence adds to another 140 elements in the state space.
|
||||||
|
- *cvel:* Center of mass based velocity. It has shape 14 * 6 (*nbody * 6*) and hence
|
||||||
|
adds another 84 elements in the state space
|
||||||
|
- *qfrc_actuator:* Constraint force generated as the actuator force. This has shape
|
||||||
|
`(23,)` *(nv * 1)* and hence adds another 23 elements to the state space.
|
||||||
|
- *cfrc_ext:* This is the center of mass based external force on the body. It has shape
|
||||||
|
14 * 6 (*nbody * 6*) and hence adds to another 84 elements in the state space.
|
||||||
|
where *nbody* stands for the number of bodies in the robot and *nv* stands for the number
|
||||||
|
of degrees of freedom (*= dim(qvel)*)
|
||||||
|
|
||||||
|
The (x,y,z) coordinates are translational DOFs while the orientations are rotational
|
||||||
|
DOFs expressed as quaternions. One can read more about free joints on the
|
||||||
|
[Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html).
|
||||||
|
|
||||||
|
**Note:** There are 47 elements in the table above - giving rise to `(378,)`
|
||||||
|
elements in the state space. In practice (and Gym implementation), the first two
|
||||||
|
positional elements are omitted from the state space since the reward function is
|
||||||
|
calculated based on the x-coordinate value. This value is hidden from the algorithm,
|
||||||
|
which in turn has to develop an abstract understanding of it from the observed rewards.
|
||||||
|
Therefore, observation space has shape `(376,)` instead of `(378,)` and the table should not have the first two rows.
|
||||||
|
|
||||||
|
**Note:** HumanoidStandup-v4 environment no longer has the following contact forces issue.
|
||||||
|
If using previous HumanoidStandup versions from v4, there have been reported issues that using a Mujoco-Py version > 2.0 results
|
||||||
|
in the contact forces always being 0. As such we recommend to use a Mujoco-Py version < 2.0
|
||||||
|
when using the Humanoid environment if you would like to report results with contact forces
|
||||||
|
(if contact forces are not used in your experiments, you can use version > 2.0).
|
||||||
|
|
||||||
|
### Rewards
|
||||||
|
The reward consists of three parts:
|
||||||
|
- *uph_cost*: A reward for moving upward (in an attempt to stand up). This is not a relative
|
||||||
|
reward which measures how much upward it has moved from the last timestep, but it is an
|
||||||
|
absolute reward which measures how much upward the Humanoid has moved overall. It is
|
||||||
|
measured as *(z coordinate after action - 0)/(atomic timestep)*, where *z coordinate after
|
||||||
|
action* is index 0 in the state/index 2 in the table, and *atomic timestep* is the time for
|
||||||
|
one frame of movement even though the simulation has a framerate of 5 (done in order to inflate
|
||||||
|
rewards a little for faster learning).
|
||||||
|
- *quad_ctrl_cost*: A negative reward for penalising the humanoid if it has too large of
|
||||||
|
a control force. If there are *nu* actuators/controls, then the control has shape `nu x 1`.
|
||||||
|
It is measured as *0.1 **x** sum(control<sup>2</sup>)*.
|
||||||
|
- *quad_impact_cost*: A negative reward for penalising the humanoid if the external
|
||||||
|
contact force is too large. It is calculated as *min(0.5 * 0.000001 * sum(external
|
||||||
|
contact force<sup>2</sup>), 10)*.
|
||||||
|
|
||||||
|
The total reward returned is ***reward*** *=* *uph_cost + 1 - quad_ctrl_cost - quad_impact_cost*
|
||||||
|
|
||||||
|
### Starting State
|
||||||
|
All observations start in state
|
||||||
|
(0.0, 0.0, 0.105, 1.0, 0.0 ... 0.0) with a uniform noise in the range of
|
||||||
|
[-0.01, 0.01] added to the positional and velocity values (values in the table)
|
||||||
|
for stochasticity. Note that the initial z coordinate is intentionally selected
|
||||||
|
to be low, thereby indicating a laying down humanoid. The initial orientation is
|
||||||
|
designed to make it face forward as well.
|
||||||
|
|
||||||
|
### Episode Termination
|
||||||
|
The episode terminates when any of the following happens:
|
||||||
|
|
||||||
|
1. The episode duration reaches a 1000 timesteps
|
||||||
|
2. Any of the state space values is no longer finite
|
||||||
|
|
||||||
|
### Arguments
|
||||||
|
|
||||||
|
No additional arguments are currently supported (in v2 and lower), but
|
||||||
|
modifications can be made to the XML file in the assets folder
|
||||||
|
(or by changing the path to a modified XML file in another folder).
|
||||||
|
|
||||||
|
```
|
||||||
|
env = gym.make('HumanoidStandup-v2')
|
||||||
|
```
|
||||||
|
|
||||||
|
There is no v3 for HumanoidStandup, unlike the robot environments where a v3 and
|
||||||
|
beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||||
|
|
||||||
|
|
||||||
|
### Version History
|
||||||
|
|
||||||
|
* v4: all mujoco environments now use the mujoco bindings in mujoco>=2.1.3
|
||||||
|
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
||||||
|
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||||
|
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
||||||
|
* v0: Initial versions release (1.0.0)
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self):
    """Load the humanoid-standup MJCF model with a frame skip of 5.

    Uses the new `mujoco` bindings (v4 environment — no `mujoco_py`).
    EzPickle is initialised with no arguments since the environment
    takes no constructor parameters.
    """
    mujoco_env.MujocoEnv.__init__(self, "humanoidstandup.xml", 5)
    utils.EzPickle.__init__(self)
|
||||||
|
|
||||||
|
def _get_obs(self):
|
||||||
|
data = self.data
|
||||||
|
return np.concatenate(
|
||||||
|
[
|
||||||
|
data.qpos.flat[2:],
|
||||||
|
data.qvel.flat,
|
||||||
|
data.cinert.flat,
|
||||||
|
data.cvel.flat,
|
||||||
|
data.qfrc_actuator.flat,
|
||||||
|
data.cfrc_ext.flat,
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
def step(self, a):
|
||||||
|
self.do_simulation(a, self.frame_skip)
|
||||||
|
pos_after = self.data.qpos[2]
|
||||||
|
data = self.data
|
||||||
|
uph_cost = (pos_after - 0) / self.model.opt.timestep
|
||||||
|
|
||||||
|
quad_ctrl_cost = 0.1 * np.square(data.ctrl).sum()
|
||||||
|
quad_impact_cost = 0.5e-6 * np.square(data.cfrc_ext).sum()
|
||||||
|
quad_impact_cost = min(quad_impact_cost, 10)
|
||||||
|
reward = uph_cost - quad_ctrl_cost - quad_impact_cost + 1
|
||||||
|
|
||||||
|
done = bool(False)
|
||||||
|
return (
|
||||||
|
self._get_obs(),
|
||||||
|
reward,
|
||||||
|
done,
|
||||||
|
dict(
|
||||||
|
reward_linup=uph_cost,
|
||||||
|
reward_quadctrl=-quad_ctrl_cost,
|
||||||
|
reward_impact=-quad_impact_cost,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
def reset_model(self):
|
||||||
|
c = 0.01
|
||||||
|
self.set_state(
|
||||||
|
self.init_qpos + self.np_random.uniform(low=-c, high=c, size=self.model.nq),
|
||||||
|
self.init_qvel
|
||||||
|
+ self.np_random.uniform(
|
||||||
|
low=-c,
|
||||||
|
high=c,
|
||||||
|
size=self.model.nv,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
return self._get_obs()
|
||||||
|
|
||||||
|
def viewer_setup(self):
|
||||||
|
self.viewer.cam.trackbodyid = 1
|
||||||
|
self.viewer.cam.distance = self.model.stat.extent * 1.0
|
||||||
|
self.viewer.cam.lookat[2] = 0.8925
|
||||||
|
self.viewer.cam.elevation = -20
|
@@ -5,113 +5,10 @@ from gym.envs.mujoco import mujoco_env
|
|||||||
|
|
||||||
|
|
||||||
class InvertedDoublePendulumEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
class InvertedDoublePendulumEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
"""
|
|
||||||
### Description
|
|
||||||
|
|
||||||
This environment originates from control theory and builds on the cartpole
|
|
||||||
environment based on the work done by Barto, Sutton, and Anderson in
|
|
||||||
["Neuronlike adaptive elements that can solve difficult learning control problems"](https://ieeexplore.ieee.org/document/6313077),
|
|
||||||
powered by the Mujoco physics simulator - allowing for more complex experiments
|
|
||||||
(such as varying the effects of gravity or constraints). This environment involves a cart that can
|
|
||||||
be moved linearly, with a pole fixed on it and a second pole fixed on the other end of the first one
|
|
||||||
(leaving the second pole as the only one with one free end). The cart can be pushed left or right,
|
|
||||||
and the goal is to balance the second pole on top of the first pole, which is in turn on top of the
|
|
||||||
cart, by applying continuous forces on the cart.
|
|
||||||
|
|
||||||
### Action Space
|
|
||||||
The agent take a 1-element vector for actions.
|
|
||||||
The action space is a continuous `(action)` in `[-1, 1]`, where `action` represents the
|
|
||||||
numerical force applied to the cart (with magnitude representing the amount of force and
|
|
||||||
sign representing the direction)
|
|
||||||
|
|
||||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
|
||||||
|-----|---------------------------|-------------|-------------|----------------------------------|-------|-----------|
|
|
||||||
| 0 | Force applied on the cart | -1 | 1 | slider | slide | Force (N) |
|
|
||||||
|
|
||||||
### Observation Space
|
|
||||||
|
|
||||||
The state space consists of positional values of different body parts of the pendulum system,
|
|
||||||
followed by the velocities of those individual parts (their derivatives) with all the
|
|
||||||
positions ordered before all the velocities.
|
|
||||||
|
|
||||||
The observation is a `ndarray` with shape `(11,)` where the elements correspond to the following:
|
|
||||||
|
|
||||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
|
||||||
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
|
||||||
| 0 | position of the cart along the linear surface | -Inf | Inf | slider | slide | position (m) |
|
|
||||||
| 1 | sine of the angle between the cart and the first pole | -Inf | Inf | sin(hinge) | hinge | unitless |
|
|
||||||
| 2 | sine of the angle between the two poles | -Inf | Inf | sin(hinge2) | hinge | unitless |
|
|
||||||
| 3 | cosine of the angle between the cart and the first pole | -Inf | Inf | cos(hinge) | hinge | unitless |
|
|
||||||
| 4 | cosine of the angle between the two poles | -Inf | Inf | cos(hinge2) | hinge | unitless |
|
|
||||||
| 5 | velocity of the cart | -Inf | Inf | slider | slide | velocity (m/s) |
|
|
||||||
| 6 | angular velocity of the angle between the cart and the first pole | -Inf | Inf | hinge | hinge | angular velocity (rad/s) |
|
|
||||||
| 7 | angular velocity of the angle between the two poles | -Inf | Inf | hinge2 | hinge | angular velocity (rad/s) |
|
|
||||||
| 8 | constraint force - 1 | -Inf | Inf | | | Force (N) |
|
|
||||||
| 9 | constraint force - 2 | -Inf | Inf | | | Force (N) |
|
|
||||||
| 10 | constraint force - 3 | -Inf | Inf | | | Force (N) |
|
|
||||||
|
|
||||||
|
|
||||||
There is physical contact between the robots and their environment - and Mujoco
|
|
||||||
attempts at getting realistic physics simulations for the possible physical contact
|
|
||||||
dynamics by aiming for physical accuracy and computational efficiency.
|
|
||||||
|
|
||||||
There is one constraint force for contacts for each degree of freedom (3).
|
|
||||||
The approach and handling of constraints by Mujoco is unique to the simulator
|
|
||||||
and is based on their research. One can find more information in their
|
|
||||||
[*documentation*](https://mujoco.readthedocs.io/en/latest/computation.html)
|
|
||||||
or in their paper
|
|
||||||
["Analytically-invertible dynamics with contacts and constraints: Theory and implementation in MuJoCo"](https://homes.cs.washington.edu/~todorov/papers/TodorovICRA14.pdf).
|
|
||||||
|
|
||||||
|
|
||||||
### Rewards
|
|
||||||
|
|
||||||
The reward consists of two parts:
|
|
||||||
- *alive_bonus*: The goal is to make the second inverted pendulum stand upright
|
|
||||||
(within a certain angle limit) as long as possible - as such a reward of +10 is awarded
|
|
||||||
for each timestep that the second pole is upright.
|
|
||||||
- *distance_penalty*: This reward is a measure of how far the *tip* of the second pendulum
|
|
||||||
(the only free end) moves, and it is calculated as
|
|
||||||
*0.01 * x<sup>2</sup> + (y - 2)<sup>2</sup>*, where *x* is the x-coordinate of the tip
|
|
||||||
and *y* is the y-coordinate of the tip of the second pole.
|
|
||||||
- *velocity_penalty*: A negative reward for penalising the agent if it moves too
|
|
||||||
fast *0.001 * v<sub>1</sub><sup>2</sup> + 0.005 * v<sub>2</sub> <sup>2</sup>*
|
|
||||||
|
|
||||||
The total reward returned is ***reward*** *=* *alive_bonus - distance_penalty - velocity_penalty*
|
|
||||||
|
|
||||||
### Starting State
|
|
||||||
All observations start in state
|
|
||||||
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) with a uniform noise in the range
|
|
||||||
of [-0.1, 0.1] added to the positional values (cart position and pole angles) and standard
|
|
||||||
normal force with a standard deviation of 0.1 added to the velocity values for stochasticity.
|
|
||||||
|
|
||||||
### Episode Termination
|
|
||||||
The episode terminates when any of the following happens:
|
|
||||||
|
|
||||||
1. The episode duration reaches 1000 timesteps.
|
|
||||||
2. Any of the state space values is no longer finite.
|
|
||||||
3. The y_coordinate of the tip of the second pole *is less than or equal* to 1. The maximum standing height of the system is 1.196 m when all the parts are perpendicularly vertical on top of each other).
|
|
||||||
|
|
||||||
### Arguments
|
|
||||||
|
|
||||||
No additional arguments are currently supported.
|
|
||||||
|
|
||||||
```
|
|
||||||
env = gym.make('InvertedDoublePendulum-v2')
|
|
||||||
```
|
|
||||||
There is no v3 for InvertedPendulum, unlike the robot environments where a v3 and
|
|
||||||
beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
|
||||||
|
|
||||||
|
|
||||||
### Version History
|
|
||||||
|
|
||||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
|
||||||
* v1: max_time_steps raised to 1000 for robot based tasks (including inverted pendulum)
|
|
||||||
* v0: Initial versions release (1.0.0)
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
mujoco_env.MujocoEnv.__init__(self, "inverted_double_pendulum.xml", 5)
|
mujoco_env.MujocoEnv.__init__(
|
||||||
|
self, "inverted_double_pendulum.xml", 5, mujoco_bindings="mujoco_py"
|
||||||
|
)
|
||||||
utils.EzPickle.__init__(self)
|
utils.EzPickle.__init__(self)
|
||||||
|
|
||||||
def step(self, action):
|
def step(self, action):
|
||||||
|
156
gym/envs/mujoco/inverted_double_pendulum_v4.py
Normal file
156
gym/envs/mujoco/inverted_double_pendulum_v4.py
Normal file
@@ -0,0 +1,156 @@
|
|||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from gym import utils
|
||||||
|
from gym.envs.mujoco import mujoco_env
|
||||||
|
|
||||||
|
|
||||||
|
class InvertedDoublePendulumEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
|
"""
|
||||||
|
### Description
|
||||||
|
|
||||||
|
This environment originates from control theory and builds on the cartpole
|
||||||
|
environment based on the work done by Barto, Sutton, and Anderson in
|
||||||
|
["Neuronlike adaptive elements that can solve difficult learning control problems"](https://ieeexplore.ieee.org/document/6313077),
|
||||||
|
powered by the Mujoco physics simulator - allowing for more complex experiments
|
||||||
|
(such as varying the effects of gravity or constraints). This environment involves a cart that can
|
||||||
|
be moved linearly, with a pole fixed on it and a second pole fixed on the other end of the first one
|
||||||
|
(leaving the second pole as the only one with one free end). The cart can be pushed left or right,
|
||||||
|
and the goal is to balance the second pole on top of the first pole, which is in turn on top of the
|
||||||
|
cart, by applying continuous forces on the cart.
|
||||||
|
|
||||||
|
### Action Space
|
||||||
|
The agent take a 1-element vector for actions.
|
||||||
|
The action space is a continuous `(action)` in `[-1, 1]`, where `action` represents the
|
||||||
|
numerical force applied to the cart (with magnitude representing the amount of force and
|
||||||
|
sign representing the direction)
|
||||||
|
|
||||||
|
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||||
|
|-----|---------------------------|-------------|-------------|----------------------------------|-------|-----------|
|
||||||
|
| 0 | Force applied on the cart | -1 | 1 | slider | slide | Force (N) |
|
||||||
|
|
||||||
|
### Observation Space
|
||||||
|
|
||||||
|
The state space consists of positional values of different body parts of the pendulum system,
|
||||||
|
followed by the velocities of those individual parts (their derivatives) with all the
|
||||||
|
positions ordered before all the velocities.
|
||||||
|
|
||||||
|
The observation is a `ndarray` with shape `(11,)` where the elements correspond to the following:
|
||||||
|
|
||||||
|
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
||||||
|
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
||||||
|
| 0 | position of the cart along the linear surface | -Inf | Inf | slider | slide | position (m) |
|
||||||
|
| 1 | sine of the angle between the cart and the first pole | -Inf | Inf | sin(hinge) | hinge | unitless |
|
||||||
|
| 2 | sine of the angle between the two poles | -Inf | Inf | sin(hinge2) | hinge | unitless |
|
||||||
|
| 3 | cosine of the angle between the cart and the first pole | -Inf | Inf | cos(hinge) | hinge | unitless |
|
||||||
|
| 4 | cosine of the angle between the two poles | -Inf | Inf | cos(hinge2) | hinge | unitless |
|
||||||
|
| 5 | velocity of the cart | -Inf | Inf | slider | slide | velocity (m/s) |
|
||||||
|
| 6 | angular velocity of the angle between the cart and the first pole | -Inf | Inf | hinge | hinge | angular velocity (rad/s) |
|
||||||
|
| 7 | angular velocity of the angle between the two poles | -Inf | Inf | hinge2 | hinge | angular velocity (rad/s) |
|
||||||
|
| 8 | constraint force - 1 | -Inf | Inf | | | Force (N) |
|
||||||
|
| 9 | constraint force - 2 | -Inf | Inf | | | Force (N) |
|
||||||
|
| 10 | constraint force - 3 | -Inf | Inf | | | Force (N) |
|
||||||
|
|
||||||
|
|
||||||
|
There is physical contact between the robots and their environment - and Mujoco
|
||||||
|
attempts at getting realistic physics simulations for the possible physical contact
|
||||||
|
dynamics by aiming for physical accuracy and computational efficiency.
|
||||||
|
|
||||||
|
There is one constraint force for contacts for each degree of freedom (3).
|
||||||
|
The approach and handling of constraints by Mujoco is unique to the simulator
|
||||||
|
and is based on their research. One can find more information in their
|
||||||
|
[*documentation*](https://mujoco.readthedocs.io/en/latest/computation.html)
|
||||||
|
or in their paper
|
||||||
|
["Analytically-invertible dynamics with contacts and constraints: Theory and implementation in MuJoCo"](https://homes.cs.washington.edu/~todorov/papers/TodorovICRA14.pdf).
|
||||||
|
|
||||||
|
|
||||||
|
### Rewards
|
||||||
|
|
||||||
|
The reward consists of two parts:
|
||||||
|
- *alive_bonus*: The goal is to make the second inverted pendulum stand upright
|
||||||
|
(within a certain angle limit) as long as possible - as such a reward of +10 is awarded
|
||||||
|
for each timestep that the second pole is upright.
|
||||||
|
- *distance_penalty*: This reward is a measure of how far the *tip* of the second pendulum
|
||||||
|
(the only free end) moves, and it is calculated as
|
||||||
|
*0.01 * x<sup>2</sup> + (y - 2)<sup>2</sup>*, where *x* is the x-coordinate of the tip
|
||||||
|
and *y* is the y-coordinate of the tip of the second pole.
|
||||||
|
- *velocity_penalty*: A negative reward for penalising the agent if it moves too
|
||||||
|
fast *0.001 * v<sub>1</sub><sup>2</sup> + 0.005 * v<sub>2</sub> <sup>2</sup>*
|
||||||
|
|
||||||
|
The total reward returned is ***reward*** *=* *alive_bonus - distance_penalty - velocity_penalty*
|
||||||
|
|
||||||
|
### Starting State
|
||||||
|
All observations start in state
|
||||||
|
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) with a uniform noise in the range
|
||||||
|
of [-0.1, 0.1] added to the positional values (cart position and pole angles) and standard
|
||||||
|
normal force with a standard deviation of 0.1 added to the velocity values for stochasticity.
|
||||||
|
|
||||||
|
### Episode Termination
|
||||||
|
The episode terminates when any of the following happens:
|
||||||
|
|
||||||
|
1. The episode duration reaches 1000 timesteps.
|
||||||
|
2. Any of the state space values is no longer finite.
|
||||||
|
3. The y_coordinate of the tip of the second pole *is less than or equal* to 1. The maximum standing height of the system is 1.196 m when all the parts are perpendicularly vertical on top of each other).
|
||||||
|
|
||||||
|
### Arguments
|
||||||
|
|
||||||
|
No additional arguments are currently supported (in v2 and lower), but modifications can
|
||||||
|
be made to the XML file in the assets folder (or by changing the path to a modified XML
|
||||||
|
file in another folder).
|
||||||
|
|
||||||
|
```
|
||||||
|
env = gym.make('InvertedDoublePendulum-v2')
|
||||||
|
```
|
||||||
|
There is no v3 for InvertedPendulum, unlike the robot environments where a v3 and
|
||||||
|
beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||||
|
|
||||||
|
|
||||||
|
### Version History
|
||||||
|
|
||||||
|
* v4: all mujoco environments now use the mujoco bindings in mujoco>=2.1.3
|
||||||
|
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
||||||
|
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||||
|
* v1: max_time_steps raised to 1000 for robot based tasks (including inverted pendulum)
|
||||||
|
* v0: Initial versions release (1.0.0)
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
mujoco_env.MujocoEnv.__init__(self, "inverted_double_pendulum.xml", 5)
|
||||||
|
utils.EzPickle.__init__(self)
|
||||||
|
|
||||||
|
def step(self, action):
|
||||||
|
self.do_simulation(action, self.frame_skip)
|
||||||
|
ob = self._get_obs()
|
||||||
|
x, _, y = self.data.site_xpos[0]
|
||||||
|
dist_penalty = 0.01 * x**2 + (y - 2) ** 2
|
||||||
|
v1, v2 = self.data.qvel[1:3]
|
||||||
|
vel_penalty = 1e-3 * v1**2 + 5e-3 * v2**2
|
||||||
|
alive_bonus = 10
|
||||||
|
r = alive_bonus - dist_penalty - vel_penalty
|
||||||
|
done = bool(y <= 1)
|
||||||
|
return ob, r, done, {}
|
||||||
|
|
||||||
|
def _get_obs(self):
|
||||||
|
return np.concatenate(
|
||||||
|
[
|
||||||
|
self.data.qpos[:1], # cart x pos
|
||||||
|
np.sin(self.data.qpos[1:]), # link angles
|
||||||
|
np.cos(self.data.qpos[1:]),
|
||||||
|
np.clip(self.data.qvel, -10, 10),
|
||||||
|
np.clip(self.data.qfrc_constraint, -10, 10),
|
||||||
|
]
|
||||||
|
).ravel()
|
||||||
|
|
||||||
|
def reset_model(self):
|
||||||
|
self.set_state(
|
||||||
|
self.init_qpos
|
||||||
|
+ self.np_random.uniform(low=-0.1, high=0.1, size=self.model.nq),
|
||||||
|
self.init_qvel + self.np_random.standard_normal(self.model.nv) * 0.1,
|
||||||
|
)
|
||||||
|
return self._get_obs()
|
||||||
|
|
||||||
|
def viewer_setup(self):
|
||||||
|
v = self.viewer
|
||||||
|
v.cam.trackbodyid = 0
|
||||||
|
v.cam.distance = self.model.stat.extent * 0.5
|
||||||
|
v.cam.lookat[2] = 0.12250000000000005 # v.model.stat.center[2]
|
@@ -5,85 +5,11 @@ from gym.envs.mujoco import mujoco_env
|
|||||||
|
|
||||||
|
|
||||||
class InvertedPendulumEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
class InvertedPendulumEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
"""
|
|
||||||
### Description
|
|
||||||
|
|
||||||
This environment is the cartpole environment based on the work done by
|
|
||||||
Barto, Sutton, and Anderson in ["Neuronlike adaptive elements that can
|
|
||||||
solve difficult learning control problems"](https://ieeexplore.ieee.org/document/6313077),
|
|
||||||
just like in the classic environments but now powered by the Mujoco physics simulator -
|
|
||||||
allowing for more complex experiments (such as varying the effects of gravity).
|
|
||||||
This environment involves a cart that can be moved linearly, with a pole fixed on it
|
|
||||||
at one end and having another end free. The cart can be pushed left or right, and the
|
|
||||||
goal is to balance the pole on the top of the cart by applying forces on the cart.
|
|
||||||
|
|
||||||
### Action Space
|
|
||||||
The agent takes a 1-element vector for actions.
|
|
||||||
|
|
||||||
The action space is a continuous `(action)` in `[-3, 3]`, where `action` represents
|
|
||||||
the numerical force applied to the cart (with magnitude representing the amount of
|
|
||||||
force and sign representing the direction)
|
|
||||||
|
|
||||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
|
||||||
|-----|---------------------------|-------------|-------------|----------------------------------|-------|-----------|
|
|
||||||
| 0 | Force applied on the cart | -3 | 3 | slider | slide | Force (N) |
|
|
||||||
|
|
||||||
### Observation Space
|
|
||||||
|
|
||||||
The state space consists of positional values of different body parts of
|
|
||||||
the pendulum system, followed by the velocities of those individual parts (their derivatives)
|
|
||||||
with all the positions ordered before all the velocities.
|
|
||||||
|
|
||||||
The observation is a `ndarray` with shape `(4,)` where the elements correspond to the following:
|
|
||||||
|
|
||||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
|
||||||
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
|
||||||
| 0 | position of the cart along the linear surface | -Inf | Inf | slider | slide | position (m) |
|
|
||||||
| 1 | vertical angle of the pole on the cart | -Inf | Inf | hinge | hinge | angle (rad) |
|
|
||||||
| 2 | linear velocity of the cart | -Inf | Inf | slider | slide | velocity (m/s) |
|
|
||||||
| 3 | angular velocity of the pole on the cart | -Inf | Inf | hinge | hinge | angular velocity (rad/s) |
|
|
||||||
|
|
||||||
|
|
||||||
### Rewards
|
|
||||||
|
|
||||||
The goal is to make the inverted pendulum stand upright (within a certain angle limit)
|
|
||||||
as long as possible - as such a reward of +1 is awarded for each timestep that
|
|
||||||
the pole is upright.
|
|
||||||
|
|
||||||
### Starting State
|
|
||||||
All observations start in state
|
|
||||||
(0.0, 0.0, 0.0, 0.0) with a uniform noise in the range
|
|
||||||
of [-0.01, 0.01] added to the values for stochasticity.
|
|
||||||
|
|
||||||
### Episode Termination
|
|
||||||
The episode terminates when any of the following happens:
|
|
||||||
|
|
||||||
1. The episode duration reaches 1000 timesteps.
|
|
||||||
2. Any of the state space values is no longer finite.
|
|
||||||
3. The absolute value of the vertical angle between the pole and the cart is greater than 0.2 radians.
|
|
||||||
|
|
||||||
### Arguments
|
|
||||||
|
|
||||||
No additional arguments are currently supported.
|
|
||||||
|
|
||||||
```
|
|
||||||
env = gym.make('InvertedPendulum-v2')
|
|
||||||
```
|
|
||||||
There is no v3 for InvertedPendulum, unlike the robot environments where a
|
|
||||||
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
|
||||||
|
|
||||||
|
|
||||||
### Version History
|
|
||||||
|
|
||||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
|
||||||
* v1: max_time_steps raised to 1000 for robot based tasks (including inverted pendulum)
|
|
||||||
* v0: Initial versions release (1.0.0)
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
utils.EzPickle.__init__(self)
|
utils.EzPickle.__init__(self)
|
||||||
mujoco_env.MujocoEnv.__init__(self, "inverted_pendulum.xml", 2)
|
mujoco_env.MujocoEnv.__init__(
|
||||||
|
self, "inverted_pendulum.xml", 2, mujoco_bindings="mujoco_py"
|
||||||
|
)
|
||||||
|
|
||||||
def step(self, a):
|
def step(self, a):
|
||||||
reward = 1.0
|
reward = 1.0
|
||||||
|
116
gym/envs/mujoco/inverted_pendulum_v4.py
Normal file
116
gym/envs/mujoco/inverted_pendulum_v4.py
Normal file
@@ -0,0 +1,116 @@
|
|||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from gym import utils
|
||||||
|
from gym.envs.mujoco import mujoco_env
|
||||||
|
|
||||||
|
|
||||||
|
class InvertedPendulumEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
|
"""
|
||||||
|
### Description
|
||||||
|
|
||||||
|
This environment is the cartpole environment based on the work done by
|
||||||
|
Barto, Sutton, and Anderson in ["Neuronlike adaptive elements that can
|
||||||
|
solve difficult learning control problems"](https://ieeexplore.ieee.org/document/6313077),
|
||||||
|
just like in the classic environments but now powered by the Mujoco physics simulator -
|
||||||
|
allowing for more complex experiments (such as varying the effects of gravity).
|
||||||
|
This environment involves a cart that can be moved linearly, with a pole fixed on it
|
||||||
|
at one end and having another end free. The cart can be pushed left or right, and the
|
||||||
|
goal is to balance the pole on the top of the cart by applying forces on the cart.
|
||||||
|
|
||||||
|
### Action Space
|
||||||
|
The agent takes a 1-element vector for actions.
|
||||||
|
|
||||||
|
The action space is a continuous `(action)` in `[-3, 3]`, where `action` represents
|
||||||
|
the numerical force applied to the cart (with magnitude representing the amount of
|
||||||
|
force and sign representing the direction)
|
||||||
|
|
||||||
|
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||||
|
|-----|---------------------------|-------------|-------------|----------------------------------|-------|-----------|
|
||||||
|
| 0 | Force applied on the cart | -3 | 3 | slider | slide | Force (N) |
|
||||||
|
|
||||||
|
### Observation Space
|
||||||
|
|
||||||
|
The state space consists of positional values of different body parts of
|
||||||
|
the pendulum system, followed by the velocities of those individual parts (their derivatives)
|
||||||
|
with all the positions ordered before all the velocities.
|
||||||
|
|
||||||
|
The observation is a `ndarray` with shape `(4,)` where the elements correspond to the following:
|
||||||
|
|
||||||
|
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
||||||
|
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
||||||
|
| 0 | position of the cart along the linear surface | -Inf | Inf | slider | slide | position (m) |
|
||||||
|
| 1 | vertical angle of the pole on the cart | -Inf | Inf | hinge | hinge | angle (rad) |
|
||||||
|
| 2 | linear velocity of the cart | -Inf | Inf | slider | slide | velocity (m/s) |
|
||||||
|
| 3 | angular velocity of the pole on the cart | -Inf | Inf | hinge | hinge | angular velocity (rad/s) |
|
||||||
|
|
||||||
|
|
||||||
|
### Rewards
|
||||||
|
|
||||||
|
The goal is to make the inverted pendulum stand upright (within a certain angle limit)
|
||||||
|
as long as possible - as such a reward of +1 is awarded for each timestep that
|
||||||
|
the pole is upright.
|
||||||
|
|
||||||
|
### Starting State
|
||||||
|
All observations start in state
|
||||||
|
(0.0, 0.0, 0.0, 0.0) with a uniform noise in the range
|
||||||
|
of [-0.01, 0.01] added to the values for stochasticity.
|
||||||
|
|
||||||
|
### Episode Termination
|
||||||
|
The episode terminates when any of the following happens:
|
||||||
|
|
||||||
|
1. The episode duration reaches 1000 timesteps.
|
||||||
|
2. Any of the state space values is no longer finite.
|
||||||
|
3. The absolute value of the vertical angle between the pole and the cart is greater than 0.2 radians.
|
||||||
|
|
||||||
|
### Arguments
|
||||||
|
|
||||||
|
No additional arguments are currently supported (in v2 and lower),
|
||||||
|
but modifications can be made to the XML file in the assets folder
|
||||||
|
(or by changing the path to a modified XML file in another folder).
|
||||||
|
|
||||||
|
```
|
||||||
|
env = gym.make('InvertedPendulum-v2')
|
||||||
|
```
|
||||||
|
There is no v3 for InvertedPendulum, unlike the robot environments where a
|
||||||
|
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||||
|
|
||||||
|
|
||||||
|
### Version History
|
||||||
|
|
||||||
|
* v4: all mujoco environments now use the mujoco bindings in mujoco>=2.1.3
|
||||||
|
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
||||||
|
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||||
|
* v1: max_time_steps raised to 1000 for robot based tasks (including inverted pendulum)
|
||||||
|
* v0: Initial versions release (1.0.0)
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
utils.EzPickle.__init__(self)
|
||||||
|
mujoco_env.MujocoEnv.__init__(self, "inverted_pendulum.xml", 2)
|
||||||
|
|
||||||
|
def step(self, a):
|
||||||
|
reward = 1.0
|
||||||
|
self.do_simulation(a, self.frame_skip)
|
||||||
|
ob = self._get_obs()
|
||||||
|
notdone = np.isfinite(ob).all() and (np.abs(ob[1]) <= 0.2)
|
||||||
|
done = not notdone
|
||||||
|
return ob, reward, done, {}
|
||||||
|
|
||||||
|
def reset_model(self):
|
||||||
|
qpos = self.init_qpos + self.np_random.uniform(
|
||||||
|
size=self.model.nq, low=-0.01, high=0.01
|
||||||
|
)
|
||||||
|
qvel = self.init_qvel + self.np_random.uniform(
|
||||||
|
size=self.model.nv, low=-0.01, high=0.01
|
||||||
|
)
|
||||||
|
self.set_state(qpos, qvel)
|
||||||
|
return self._get_obs()
|
||||||
|
|
||||||
|
def _get_obs(self):
|
||||||
|
return np.concatenate([self.data.qpos, self.data.qvel]).ravel()
|
||||||
|
|
||||||
|
def viewer_setup(self):
|
||||||
|
v = self.viewer
|
||||||
|
v.cam.trackbodyid = 0
|
||||||
|
v.cam.distance = self.model.stat.extent
|
@@ -1,4 +1,3 @@
|
|||||||
import os
|
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
from os import path
|
from os import path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
@@ -6,18 +5,9 @@ from typing import Optional
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
import gym
|
import gym
|
||||||
from gym import error, spaces
|
from gym import error, logger, spaces
|
||||||
|
|
||||||
try:
|
DEFAULT_SIZE = 480
|
||||||
import mujoco_py
|
|
||||||
except ImportError as e:
|
|
||||||
raise error.DependencyNotInstalled(
|
|
||||||
"{}. (HINT: you need to install mujoco_py, and also perform the setup instructions here: https://github.com/openai/mujoco-py/.)".format(
|
|
||||||
e
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
DEFAULT_SIZE = 500
|
|
||||||
|
|
||||||
|
|
||||||
def convert_observation_to_space(observation):
|
def convert_observation_to_space(observation):
|
||||||
@@ -43,28 +33,64 @@ def convert_observation_to_space(observation):
|
|||||||
class MujocoEnv(gym.Env):
|
class MujocoEnv(gym.Env):
|
||||||
"""Superclass for all MuJoCo environments."""
|
"""Superclass for all MuJoCo environments."""
|
||||||
|
|
||||||
def __init__(self, model_path, frame_skip):
|
def __init__(self, model_path, frame_skip, mujoco_bindings="mujoco"):
|
||||||
|
|
||||||
if model_path.startswith("/"):
|
if model_path.startswith("/"):
|
||||||
fullpath = model_path
|
fullpath = model_path
|
||||||
else:
|
else:
|
||||||
fullpath = os.path.join(os.path.dirname(__file__), "assets", model_path)
|
fullpath = path.join(path.dirname(__file__), "assets", model_path)
|
||||||
if not path.exists(fullpath):
|
if not path.exists(fullpath):
|
||||||
raise OSError(f"File {fullpath} does not exist")
|
raise OSError(f"File {fullpath} does not exist")
|
||||||
self.frame_skip = frame_skip
|
|
||||||
self.model = mujoco_py.load_model_from_path(fullpath)
|
if mujoco_bindings == "mujoco_py":
|
||||||
self.sim = mujoco_py.MjSim(self.model)
|
logger.warn(
|
||||||
self.data = self.sim.data
|
"This version of the mujoco environments depends "
|
||||||
self.viewer = None
|
"on the mujoco-py bindings, which are no longer maintained "
|
||||||
|
"and may stop working. Please upgrade to the v4 versions of "
|
||||||
|
"the environments (which depend on the mujoco python bindings instead), unless "
|
||||||
|
"you are trying to precisely replicate previous works)."
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
import mujoco_py
|
||||||
|
|
||||||
|
self._mujoco_bindings = mujoco_py
|
||||||
|
except ImportError as e:
|
||||||
|
raise error.DependencyNotInstalled(
|
||||||
|
"{}. (HINT: you need to install mujoco_py, and also perform the setup instructions here: https://github.com/openai/mujoco-py/.)".format(
|
||||||
|
e
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
self.model = self._mujoco_bindings.load_model_from_path(fullpath)
|
||||||
|
self.sim = self._mujoco_bindings.MjSim(self.model)
|
||||||
|
self.data = self.sim.data
|
||||||
|
|
||||||
|
elif mujoco_bindings == "mujoco":
|
||||||
|
try:
|
||||||
|
import mujoco
|
||||||
|
|
||||||
|
self._mujoco_bindings = mujoco
|
||||||
|
|
||||||
|
except ImportError as e:
|
||||||
|
raise error.DependencyNotInstalled(
|
||||||
|
f"{e}. (HINT: you need to install mujoco)"
|
||||||
|
)
|
||||||
|
self.model = self._mujoco_bindings.MjModel.from_xml_path(fullpath)
|
||||||
|
self.data = self._mujoco_bindings.MjData(self.model)
|
||||||
|
|
||||||
|
self.init_qpos = self.data.qpos.ravel().copy()
|
||||||
|
self.init_qvel = self.data.qvel.ravel().copy()
|
||||||
self._viewers = {}
|
self._viewers = {}
|
||||||
|
|
||||||
|
self.frame_skip = frame_skip
|
||||||
|
|
||||||
|
self.viewer = None
|
||||||
|
|
||||||
self.metadata = {
|
self.metadata = {
|
||||||
"render_modes": ["human", "rgb_array", "depth_array"],
|
"render_modes": ["human", "rgb_array", "depth_array"],
|
||||||
"render_fps": int(np.round(1.0 / self.dt)),
|
"render_fps": int(np.round(1.0 / self.dt)),
|
||||||
}
|
}
|
||||||
|
|
||||||
self.init_qpos = self.sim.data.qpos.ravel().copy()
|
|
||||||
self.init_qvel = self.sim.data.qvel.ravel().copy()
|
|
||||||
|
|
||||||
self._set_action_space()
|
self._set_action_space()
|
||||||
|
|
||||||
action = self.action_space.sample()
|
action = self.action_space.sample()
|
||||||
@@ -111,7 +137,12 @@ class MujocoEnv(gym.Env):
|
|||||||
options: Optional[dict] = None,
|
options: Optional[dict] = None,
|
||||||
):
|
):
|
||||||
super().reset(seed=seed)
|
super().reset(seed=seed)
|
||||||
self.sim.reset()
|
|
||||||
|
if self._mujoco_bindings.__name__ == "mujoco_py":
|
||||||
|
self.sim.reset()
|
||||||
|
else:
|
||||||
|
self._mujoco_bindings.mj_resetData(self.model, self.data)
|
||||||
|
|
||||||
ob = self.reset_model()
|
ob = self.reset_model()
|
||||||
if not return_info:
|
if not return_info:
|
||||||
return ob
|
return ob
|
||||||
@@ -120,12 +151,19 @@ class MujocoEnv(gym.Env):
|
|||||||
|
|
||||||
def set_state(self, qpos, qvel):
|
def set_state(self, qpos, qvel):
|
||||||
assert qpos.shape == (self.model.nq,) and qvel.shape == (self.model.nv,)
|
assert qpos.shape == (self.model.nq,) and qvel.shape == (self.model.nv,)
|
||||||
old_state = self.sim.get_state()
|
if self._mujoco_bindings.__name__ == "mujoco_py":
|
||||||
new_state = mujoco_py.MjSimState(
|
state = self.sim.get_state()
|
||||||
old_state.time, qpos, qvel, old_state.act, old_state.udd_state
|
state = self._mujoco_bindings.MjSimState(
|
||||||
)
|
state.time, qpos, qvel, state.act, state.udd_state
|
||||||
self.sim.set_state(new_state)
|
)
|
||||||
self.sim.forward()
|
self.sim.set_state(state)
|
||||||
|
self.sim.forward()
|
||||||
|
else:
|
||||||
|
self.data.qpos[:] = np.copy(qpos)
|
||||||
|
self.data.qvel[:] = np.copy(qvel)
|
||||||
|
if self.model.na == 0:
|
||||||
|
self.data.act[:] = None
|
||||||
|
self._mujoco_bindings.mj_forward(self.model, self.data)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def dt(self):
|
def dt(self):
|
||||||
@@ -135,9 +173,21 @@ class MujocoEnv(gym.Env):
|
|||||||
if np.array(ctrl).shape != self.action_space.shape:
|
if np.array(ctrl).shape != self.action_space.shape:
|
||||||
raise ValueError("Action dimension mismatch")
|
raise ValueError("Action dimension mismatch")
|
||||||
|
|
||||||
self.sim.data.ctrl[:] = ctrl
|
if self._mujoco_bindings.__name__ == "mujoco_py":
|
||||||
|
self.sim.data.ctrl[:] = ctrl
|
||||||
|
else:
|
||||||
|
self.data.ctrl[:] = ctrl
|
||||||
for _ in range(n_frames):
|
for _ in range(n_frames):
|
||||||
self.sim.step()
|
if self._mujoco_bindings.__name__ == "mujoco_py":
|
||||||
|
self.sim.step()
|
||||||
|
else:
|
||||||
|
self._mujoco_bindings.mj_step(self.model, self.data)
|
||||||
|
|
||||||
|
# As of MuJoCo 2.0, force-related quantities like cacc are not computed
|
||||||
|
# unless there's a force sensor in the model.
|
||||||
|
# See https://github.com/openai/gym/issues/1541
|
||||||
|
if self._mujoco_bindings.__name__ != "mujoco_py":
|
||||||
|
self._mujoco_bindings.mj_rnePostConstraint(self.model, self.data)
|
||||||
|
|
||||||
def render(
|
def render(
|
||||||
self,
|
self,
|
||||||
@@ -158,19 +208,25 @@ class MujocoEnv(gym.Env):
|
|||||||
if no_camera_specified:
|
if no_camera_specified:
|
||||||
camera_name = "track"
|
camera_name = "track"
|
||||||
|
|
||||||
if camera_id is None and camera_name in self.model._camera_name2id:
|
if camera_id is None:
|
||||||
camera_id = self.model.camera_name2id(camera_name)
|
if self._mujoco_bindings.__name__ == "mujoco_py":
|
||||||
|
if camera_name in self.model._camera_name2id:
|
||||||
|
camera_id = self.model.camera_name2id(camera_name)
|
||||||
|
else:
|
||||||
|
camera_id = self._mujoco_bindings.mj_name2id(
|
||||||
|
self.model,
|
||||||
|
self._mujoco_bindings.mjtObj.mjOBJ_CAMERA,
|
||||||
|
camera_name,
|
||||||
|
)
|
||||||
|
|
||||||
self._get_viewer(mode).render(width, height, camera_id=camera_id)
|
self._get_viewer(mode).render(width, height, camera_id=camera_id)
|
||||||
|
|
||||||
if mode == "rgb_array":
|
if mode == "rgb_array":
|
||||||
# window size used for old mujoco-py:
|
|
||||||
data = self._get_viewer(mode).read_pixels(width, height, depth=False)
|
data = self._get_viewer(mode).read_pixels(width, height, depth=False)
|
||||||
# original image is upside-down, so flip it
|
# original image is upside-down, so flip it
|
||||||
return data[::-1, :, :]
|
return data[::-1, :, :]
|
||||||
elif mode == "depth_array":
|
elif mode == "depth_array":
|
||||||
self._get_viewer(mode).render(width, height)
|
self._get_viewer(mode).render(width, height)
|
||||||
# window size used for old mujoco-py:
|
|
||||||
# Extract depth part of the read_pixels() tuple
|
# Extract depth part of the read_pixels() tuple
|
||||||
data = self._get_viewer(mode).read_pixels(width, height, depth=True)[1]
|
data = self._get_viewer(mode).read_pixels(width, height, depth=True)[1]
|
||||||
# original image is upside-down, so flip it
|
# original image is upside-down, so flip it
|
||||||
@@ -180,24 +236,40 @@ class MujocoEnv(gym.Env):
|
|||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
if self.viewer is not None:
|
if self.viewer is not None:
|
||||||
# self.viewer.finish()
|
|
||||||
self.viewer = None
|
self.viewer = None
|
||||||
self._viewers = {}
|
self._viewers = {}
|
||||||
|
|
||||||
def _get_viewer(self, mode):
|
def _get_viewer(self, mode, width=DEFAULT_SIZE, height=DEFAULT_SIZE):
|
||||||
self.viewer = self._viewers.get(mode)
|
self.viewer = self._viewers.get(mode)
|
||||||
if self.viewer is None:
|
if self.viewer is None:
|
||||||
if mode == "human":
|
if mode == "human":
|
||||||
self.viewer = mujoco_py.MjViewer(self.sim)
|
if self._mujoco_bindings.__name__ == "mujoco_py":
|
||||||
|
self.viewer = self._mujoco_bindings.MjViewer(self.sim)
|
||||||
|
else:
|
||||||
|
from gym.envs.mujoco.mujoco_rendering import Viewer
|
||||||
|
|
||||||
|
self.viewer = Viewer(self.model, self.data)
|
||||||
elif mode == "rgb_array" or mode == "depth_array":
|
elif mode == "rgb_array" or mode == "depth_array":
|
||||||
self.viewer = mujoco_py.MjRenderContextOffscreen(self.sim, -1)
|
if self._mujoco_bindings.__name__ == "mujoco_py":
|
||||||
|
self.viewer = self._mujoco_bindings.MjRenderContextOffscreen(
|
||||||
|
self.sim, -1
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
from gym.envs.mujoco.mujoco_rendering import RenderContextOffscreen
|
||||||
|
|
||||||
|
self.viewer = RenderContextOffscreen(
|
||||||
|
width, height, self.model, self.data
|
||||||
|
)
|
||||||
|
|
||||||
self.viewer_setup()
|
self.viewer_setup()
|
||||||
self._viewers[mode] = self.viewer
|
self._viewers[mode] = self.viewer
|
||||||
return self.viewer
|
return self.viewer
|
||||||
|
|
||||||
def get_body_com(self, body_name):
|
def get_body_com(self, body_name):
|
||||||
return self.data.get_body_xpos(body_name)
|
if self._mujoco_bindings.__name__ == "mujoco_py":
|
||||||
|
return self.data.get_body_xpos(body_name)
|
||||||
|
else:
|
||||||
|
return self.data.body(body_name).xpos
|
||||||
|
|
||||||
def state_vector(self):
|
def state_vector(self):
|
||||||
return np.concatenate([self.sim.data.qpos.flat, self.sim.data.qvel.flat])
|
return np.concatenate([self.data.qpos.flat, self.data.qvel.flat])
|
||||||
|
554
gym/envs/mujoco/mujoco_rendering.py
Normal file
554
gym/envs/mujoco/mujoco_rendering.py
Normal file
@@ -0,0 +1,554 @@
|
|||||||
|
import collections
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from threading import Lock
|
||||||
|
|
||||||
|
import glfw
|
||||||
|
import imageio
|
||||||
|
import mujoco
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
def _import_egl(width, height):
|
||||||
|
from mujoco.egl import GLContext
|
||||||
|
|
||||||
|
return GLContext(width, height)
|
||||||
|
|
||||||
|
|
||||||
|
def _import_glfw(width, height):
|
||||||
|
from mujoco.glfw import GLContext
|
||||||
|
|
||||||
|
return GLContext(width, height)
|
||||||
|
|
||||||
|
|
||||||
|
def _import_osmesa(width, height):
|
||||||
|
from mujoco.osmesa import GLContext
|
||||||
|
|
||||||
|
return GLContext(width, height)
|
||||||
|
|
||||||
|
|
||||||
|
_ALL_RENDERERS = collections.OrderedDict(
|
||||||
|
[
|
||||||
|
("glfw", _import_glfw),
|
||||||
|
("egl", _import_egl),
|
||||||
|
("osmesa", _import_osmesa),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class RenderContext:
|
||||||
|
"""Render context superclass for offscreen and window rendering."""
|
||||||
|
|
||||||
|
def __init__(self, model, data, offscreen=True):
|
||||||
|
|
||||||
|
self.model = model
|
||||||
|
self.data = data
|
||||||
|
self.offscreen = offscreen
|
||||||
|
max_geom = 1000
|
||||||
|
|
||||||
|
mujoco.mj_forward(self.model, self.data)
|
||||||
|
|
||||||
|
self.scn = mujoco.MjvScene(self.model, max_geom)
|
||||||
|
self.cam = mujoco.MjvCamera()
|
||||||
|
self.vopt = mujoco.MjvOption()
|
||||||
|
self.pert = mujoco.MjvPerturb()
|
||||||
|
self.con = mujoco.MjrContext(self.model, mujoco.mjtFontScale.mjFONTSCALE_150)
|
||||||
|
|
||||||
|
self._markers = []
|
||||||
|
self._overlays = {}
|
||||||
|
|
||||||
|
self._init_camera()
|
||||||
|
self._set_mujoco_buffers()
|
||||||
|
|
||||||
|
def _set_mujoco_buffers(self):
|
||||||
|
if self.offscreen:
|
||||||
|
mujoco.mjr_setBuffer(mujoco.mjtFramebuffer.mjFB_OFFSCREEN, self.con)
|
||||||
|
if self.con.currentBuffer != mujoco.mjtFramebuffer.mjFB_OFFSCREEN:
|
||||||
|
raise RuntimeError("Offscreen rendering not supported")
|
||||||
|
else:
|
||||||
|
mujoco.mjr_setBuffer(mujoco.mjtFramebuffer.mjFB_WINDOW, self.con)
|
||||||
|
if self.con.currentBuffer != mujoco.mjtFramebuffer.mjFB_WINDOW:
|
||||||
|
raise RuntimeError("Window rendering not supported")
|
||||||
|
|
||||||
|
def update_offscreen_size(self, width, height):
|
||||||
|
if width != self.con.offWidth or height != self.con.offHeight:
|
||||||
|
self.model.vis.global_.offwidth = width
|
||||||
|
self.model.vis.global_.offheight = height
|
||||||
|
self.con.free()
|
||||||
|
self._set_mujoco_buffers()
|
||||||
|
|
||||||
|
def render(self, width, height, camera_id=None, segmentation=False):
|
||||||
|
rect = mujoco.MjrRect(left=0, bottom=0, width=width, height=height)
|
||||||
|
|
||||||
|
# Sometimes buffers are too small.
|
||||||
|
if width > self.con.offWidth or height > self.con.offHeight:
|
||||||
|
new_width = max(width, self.model.vis.global_.offwidth)
|
||||||
|
new_height = max(height, self.model.vis.global_.offheight)
|
||||||
|
self.update_offscreen_size(new_width, new_height)
|
||||||
|
|
||||||
|
if camera_id is not None:
|
||||||
|
if camera_id == -1:
|
||||||
|
self.cam.type = mujoco.mjtCamera.mjCAMERA_FREE
|
||||||
|
else:
|
||||||
|
self.cam.type = mujoco.mjtCamera.mjCAMERA_FIXED
|
||||||
|
self.cam.fixedcamid = camera_id
|
||||||
|
|
||||||
|
mujoco.mjv_updateScene(
|
||||||
|
self.model,
|
||||||
|
self.data,
|
||||||
|
self.vopt,
|
||||||
|
self.pert,
|
||||||
|
self.cam,
|
||||||
|
mujoco.mjtCatBit.mjCAT_ALL,
|
||||||
|
self.scn,
|
||||||
|
)
|
||||||
|
|
||||||
|
if segmentation:
|
||||||
|
self.scn.flags[mujoco.mjtRndFlag.mjRND_SEGMENT] = 1
|
||||||
|
self.scn.flags[mujoco.mjtRndFlag.mjRND_IDCOLOR] = 1
|
||||||
|
|
||||||
|
for marker_params in self._markers:
|
||||||
|
self._add_marker_to_scene(marker_params)
|
||||||
|
|
||||||
|
mujoco.mjr_render(rect, self.scn, self.con)
|
||||||
|
|
||||||
|
for gridpos, (text1, text2) in self._overlays.items():
|
||||||
|
mujoco.mjr_overlay(
|
||||||
|
mujoco.mjtFontScale.mjFONTSCALE_150,
|
||||||
|
gridpos,
|
||||||
|
rect,
|
||||||
|
text1.encode(),
|
||||||
|
text2.encode(),
|
||||||
|
self.con,
|
||||||
|
)
|
||||||
|
|
||||||
|
if segmentation:
|
||||||
|
self.scn.flags[mujoco.mjtRndFlag.mjRND_SEGMENT] = 0
|
||||||
|
self.scn.flags[mujoco.mjtRndFlag.mjRND_IDCOLOR] = 0
|
||||||
|
|
||||||
|
def read_pixels(self, width, height, depth=True, segmentation=False):
|
||||||
|
rect = mujoco.MjrRect(left=0, bottom=0, width=width, height=height)
|
||||||
|
|
||||||
|
rgb_arr = np.zeros(3 * rect.width * rect.height, dtype=np.uint8)
|
||||||
|
depth_arr = np.zeros(rect.width * rect.height, dtype=np.float32)
|
||||||
|
|
||||||
|
mujoco.mjr_readPixels(rgb_arr, depth_arr, rect, self.con)
|
||||||
|
rgb_img = rgb_arr.reshape(rect.height, rect.width, 3)
|
||||||
|
|
||||||
|
ret_img = rgb_img
|
||||||
|
if segmentation:
|
||||||
|
seg_img = (
|
||||||
|
rgb_img[:, :, 0]
|
||||||
|
+ rgb_img[:, :, 1] * (2**8)
|
||||||
|
+ rgb_img[:, :, 2] * (2**16)
|
||||||
|
)
|
||||||
|
seg_img[seg_img >= (self.scn.ngeom + 1)] = 0
|
||||||
|
seg_ids = np.full((self.scn.ngeom + 1, 2), fill_value=-1, dtype=np.int32)
|
||||||
|
|
||||||
|
for i in range(self.scn.ngeom):
|
||||||
|
geom = self.scn.geoms[i]
|
||||||
|
if geom.segid != -1:
|
||||||
|
seg_ids[geom.segid + 1, 0] = geom.objtype
|
||||||
|
seg_ids[geom.segid + 1, 1] = geom.objid
|
||||||
|
ret_img = seg_ids[seg_img]
|
||||||
|
|
||||||
|
if depth:
|
||||||
|
depth_img = depth_arr.reshape(rect.height, rect.width)
|
||||||
|
return (ret_img, depth_img)
|
||||||
|
else:
|
||||||
|
return ret_img
|
||||||
|
|
||||||
|
def _init_camera(self):
|
||||||
|
self.cam.type = mujoco.mjtCamera.mjCAMERA_FREE
|
||||||
|
self.cam.fixedcamid = -1
|
||||||
|
for i in range(3):
|
||||||
|
self.cam.lookat[i] = np.median(self.data.geom_xpos[:, i])
|
||||||
|
self.cam.distance = self.model.stat.extent
|
||||||
|
|
||||||
|
def add_overlay(self, gridpos: int, text1: str, text2: str):
|
||||||
|
"""Overlays text on the scene."""
|
||||||
|
if gridpos not in self._overlays:
|
||||||
|
self._overlays[gridpos] = ["", ""]
|
||||||
|
self._overlays[gridpos][0] += text1 + "\n"
|
||||||
|
self._overlays[gridpos][1] += text2 + "\n"
|
||||||
|
|
||||||
|
def add_marker(self, **marker_params):
|
||||||
|
self._markers.append(marker_params)
|
||||||
|
|
||||||
|
def _add_marker_to_scene(self, marker):
|
||||||
|
if self.scn.ngeom >= self.scn.maxgeom:
|
||||||
|
raise RuntimeError("Ran out of geoms. maxgeom: %d" % self.scn.maxgeom)
|
||||||
|
|
||||||
|
g = self.scn.geoms[self.scn.ngeom]
|
||||||
|
# default values.
|
||||||
|
g.dataid = -1
|
||||||
|
g.objtype = mujoco.mjtObj.mjOBJ_UNKNOWN
|
||||||
|
g.objid = -1
|
||||||
|
g.category = mujoco.mjtCatBit.mjCAT_DECOR
|
||||||
|
g.texid = -1
|
||||||
|
g.texuniform = 0
|
||||||
|
g.texrepeat[0] = 1
|
||||||
|
g.texrepeat[1] = 1
|
||||||
|
g.emission = 0
|
||||||
|
g.specular = 0.5
|
||||||
|
g.shininess = 0.5
|
||||||
|
g.reflectance = 0
|
||||||
|
g.type = mujoco.mjtGeom.mjGEOM_BOX
|
||||||
|
g.size[:] = np.ones(3) * 0.1
|
||||||
|
g.mat[:] = np.eye(3)
|
||||||
|
g.rgba[:] = np.ones(4)
|
||||||
|
|
||||||
|
for key, value in marker.items():
|
||||||
|
if isinstance(value, (int, float, mujoco._enums.mjtGeom)):
|
||||||
|
setattr(g, key, value)
|
||||||
|
elif isinstance(value, (tuple, list, np.ndarray)):
|
||||||
|
attr = getattr(g, key)
|
||||||
|
attr[:] = np.asarray(value).reshape(attr.shape)
|
||||||
|
elif isinstance(value, str):
|
||||||
|
assert key == "label", "Only label is a string in mjtGeom."
|
||||||
|
if value is None:
|
||||||
|
g.label[0] = 0
|
||||||
|
else:
|
||||||
|
g.label = value
|
||||||
|
elif hasattr(g, key):
|
||||||
|
raise ValueError(
|
||||||
|
"mjtGeom has attr {} but type {} is invalid".format(
|
||||||
|
key, type(value)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
raise ValueError("mjtGeom doesn't have field %s" % key)
|
||||||
|
|
||||||
|
self.scn.ngeom += 1
|
||||||
|
|
||||||
|
|
||||||
|
class RenderContextOffscreen(RenderContext):
    """Offscreen rendering class with opengl context."""

    def __init__(self, width, height, model, data):
        # Select and activate an OpenGL context before the base class
        # allocates the MuJoCo rendering resources that need it.
        self._get_opengl_backend(width, height)
        self.opengl_context.make_current()
        super().__init__(model, data, offscreen=True)

    def _get_opengl_backend(self, width, height):
        """Instantiate an OpenGL backend, honouring the MUJOCO_GL env var."""
        backend = os.environ.get("MUJOCO_GL")
        if backend is None:
            # No explicit choice: probe the known backends in registry order
            # and keep the first one that can be constructed.
            for candidate, make_context in _ALL_RENDERERS.items():
                try:
                    self.opengl_context = make_context(width, height)
                except:  # noqa:E722
                    continue
                backend = candidate
                break
            if backend is None:
                raise RuntimeError(
                    "No OpenGL backend could be imported. Attempting to create a "
                    "rendering context will result in a RuntimeError."
                )
        else:
            try:
                self.opengl_context = _ALL_RENDERERS[backend](width, height)
            except KeyError:
                raise RuntimeError(
                    "Environment variable {} must be one of {!r}: got {!r}.".format(
                        "MUJOCO_GL", _ALL_RENDERERS.keys(), backend
                    )
                )
class Viewer(RenderContext):
    """Class for window rendering in all MuJoCo environments."""

    def __init__(self, model, data):
        """Create a glfw window, install input callbacks and set up rendering."""
        # GUI state toggled by keyboard/mouse callbacks.
        self._gui_lock = Lock()  # guards camera/scene mutation vs. rendering
        self._button_left_pressed = False
        self._button_right_pressed = False
        self._last_mouse_x = 0
        self._last_mouse_y = 0
        self._paused = False
        self._transparent = False
        self._contacts = False
        self._render_every_frame = True
        self._image_idx = 0  # running index for screenshot filenames
        self._image_path = "/tmp/frame_%07d.png"
        self._time_per_render = 1 / 60.0  # EMA of wall-clock time per frame
        self._run_speed = 1.0  # simulation speed multiplier ([S]/[F] keys)
        self._loop_count = 0
        self._advance_by_one_step = False
        self._hide_menu = False

        # glfw init
        glfw.init()
        # Window is half the primary monitor's resolution.
        width, height = glfw.get_video_mode(glfw.get_primary_monitor()).size
        self.window = glfw.create_window(width // 2, height // 2, "mujoco", None, None)
        glfw.make_context_current(self.window)
        glfw.swap_interval(1)  # vsync

        # Framebuffer may be larger than the window on HiDPI displays; _scale
        # converts window (cursor) coordinates to framebuffer pixels.
        framebuffer_width, framebuffer_height = glfw.get_framebuffer_size(self.window)
        window_width, _ = glfw.get_window_size(self.window)
        self._scale = framebuffer_width * 1.0 / window_width

        # set callbacks
        glfw.set_cursor_pos_callback(self.window, self._cursor_pos_callback)
        glfw.set_mouse_button_callback(self.window, self._mouse_button_callback)
        glfw.set_scroll_callback(self.window, self._scroll_callback)
        glfw.set_key_callback(self.window, self._key_callback)

        # get viewport
        self.viewport = mujoco.MjrRect(0, 0, framebuffer_width, framebuffer_height)

        super().__init__(model, data, offscreen=False)

    def _key_callback(self, key, action):
        """Handle keyboard shortcuts; acts only on key release.

        NOTE(review): glfw key callbacks are invoked with
        (window, key, scancode, action, mods); this signature takes only
        (key, action) after ``self`` — verify against the glfw binding in use.
        """
        if action != glfw.RELEASE:
            return
        # Switch cameras
        elif key == glfw.KEY_TAB:
            self.cam.fixedcamid += 1
            self.cam.type = mujoco.mjtCamera.mjCAMERA_FIXED
            # Past the last fixed camera: wrap around to the free camera.
            if self.cam.fixedcamid >= self.model.ncam:
                self.cam.fixedcamid = -1
                self.cam.type = mujoco.mjtCamera.mjCAMERA_FREE
        # Pause simulation
        elif key == glfw.KEY_SPACE and self._paused is not None:
            self._paused = not self._paused
        # Advances simulation by one step.
        elif key == glfw.KEY_RIGHT and self._paused is not None:
            self._advance_by_one_step = True
            self._paused = True
        # Slows down simulation
        elif key == glfw.KEY_S:
            self._run_speed /= 2.0
        # Speeds up simulation
        elif key == glfw.KEY_F:
            self._run_speed *= 2.0
        # Turn off / turn on rendering every frame.
        elif key == glfw.KEY_D:
            self._render_every_frame = not self._render_every_frame
        # Capture screenshot
        elif key == glfw.KEY_T:
            # Allocate (height, width, 3) uint8 buffer for the framebuffer read.
            img = np.zeros(
                (
                    glfw.get_framebuffer_size(self.window)[1],
                    glfw.get_framebuffer_size(self.window)[0],
                    3,
                ),
                dtype=np.uint8,
            )
            mujoco.mjr_readPixels(img, None, self.viewport, self.con)
            # OpenGL rows are bottom-up; flip vertically before saving.
            imageio.imwrite(self._image_path % self._image_idx, np.flipud(img))
            self._image_idx += 1
        # Display contact forces
        elif key == glfw.KEY_C:
            self._contacts = not self._contacts
            self.vopt.flags[mujoco.mjtVisFlag.mjVIS_CONTACTPOINT] = self._contacts
            self.vopt.flags[mujoco.mjtVisFlag.mjVIS_CONTACTFORCE] = self._contacts
        # Display coordinate frames
        elif key == glfw.KEY_E:
            self.vopt.frame = 1 - self.vopt.frame
        # Hide overlay menu
        elif key == glfw.KEY_H:
            self._hide_menu = not self._hide_menu
        # Make transparent
        elif key == glfw.KEY_R:
            self._transparent = not self._transparent
            # Scale geom alpha in place; toggling twice restores (up to float
            # rounding) the original values.
            if self._transparent:
                self.model.geom_rgba[:, 3] /= 5.0
            else:
                self.model.geom_rgba[:, 3] *= 5.0
        # Geom group visibility
        elif key in (glfw.KEY_0, glfw.KEY_1, glfw.KEY_2, glfw.KEY_3, glfw.KEY_4):
            self.vopt.geomgroup[key - glfw.KEY_0] ^= 1
        # Quit
        if key == glfw.KEY_ESCAPE:
            print("Pressed ESC")
            print("Quitting.")
            glfw.terminate()
            sys.exit(0)

    def _cursor_pos_callback(self, window, xpos, ypos):
        """Rotate/move/zoom the camera while a mouse button is held."""
        if not (self._button_left_pressed or self._button_right_pressed):
            return

        mod_shift = (
            glfw.get_key(window, glfw.KEY_LEFT_SHIFT) == glfw.PRESS
            or glfw.get_key(window, glfw.KEY_RIGHT_SHIFT) == glfw.PRESS
        )
        # Right button moves the camera, left button rotates; shift switches
        # between the horizontal and vertical variant of each action.
        if self._button_right_pressed:
            action = (
                mujoco.mjtMouse.mjMOUSE_MOVE_H
                if mod_shift
                else mujoco.mjtMouse.mjMOUSE_MOVE_V
            )
        elif self._button_left_pressed:
            action = (
                mujoco.mjtMouse.mjMOUSE_ROTATE_H
                if mod_shift
                else mujoco.mjtMouse.mjMOUSE_ROTATE_V
            )
        else:
            action = mujoco.mjtMouse.mjMOUSE_ZOOM

        # Cursor deltas in framebuffer pixels (see _scale in __init__).
        dx = int(self._scale * xpos) - self._last_mouse_x
        dy = int(self._scale * ypos) - self._last_mouse_y
        width, height = glfw.get_framebuffer_size(window)

        with self._gui_lock:
            # Displacements are normalized by the framebuffer height only.
            mujoco.mjv_moveCamera(
                self.model, action, dx / height, dy / height, self.scn, self.cam
            )

        self._last_mouse_x = int(self._scale * xpos)
        self._last_mouse_y = int(self._scale * ypos)

    def _mouse_button_callback(self, window, button, act, mods):
        """Track button state and the press position for later drag deltas."""
        self._button_left_pressed = (
            glfw.get_mouse_button(window, glfw.MOUSE_BUTTON_LEFT) == glfw.PRESS
        )
        self._button_right_pressed = (
            glfw.get_mouse_button(window, glfw.MOUSE_BUTTON_RIGHT) == glfw.PRESS
        )

        x, y = glfw.get_cursor_pos(window)
        self._last_mouse_x = int(self._scale * x)
        self._last_mouse_y = int(self._scale * y)

    def _scroll_callback(self, window, x_offset, y_offset):
        """Zoom the camera with the scroll wheel (vertical offset only)."""
        with self._gui_lock:
            mujoco.mjv_moveCamera(
                self.model,
                mujoco.mjtMouse.mjMOUSE_ZOOM,
                0,
                -0.05 * y_offset,
                self.scn,
                self.cam,
            )

    def _create_overlay(self):
        """Populate the on-screen help/status overlay entries for this frame."""
        topleft = mujoco.mjtGridPos.mjGRID_TOPLEFT
        bottomleft = mujoco.mjtGridPos.mjGRID_BOTTOMLEFT

        if self._render_every_frame:
            self.add_overlay(topleft, "", "")
        else:
            self.add_overlay(
                topleft,
                "Run speed = %.3f x real time" % self._run_speed,
                "[S]lower, [F]aster",
            )
        self.add_overlay(
            topleft, "Ren[d]er every frame", "On" if self._render_every_frame else "Off"
        )
        self.add_overlay(
            topleft,
            "Switch camera (#cams = %d)" % (self.model.ncam + 1),
            "[Tab] (camera ID = %d)" % self.cam.fixedcamid,
        )
        self.add_overlay(topleft, "[C]ontact forces", "On" if self._contacts else "Off")
        self.add_overlay(topleft, "T[r]ansparent", "On" if self._transparent else "Off")
        if self._paused is not None:
            if not self._paused:
                self.add_overlay(topleft, "Stop", "[Space]")
            else:
                self.add_overlay(topleft, "Start", "[Space]")
                self.add_overlay(
                    topleft, "Advance simulation by one step", "[right arrow]"
                )
        self.add_overlay(
            topleft, "Referenc[e] frames", "On" if self.vopt.frame == 1 else "Off"
        )
        self.add_overlay(topleft, "[H]ide Menu", "")
        if self._image_idx > 0:
            fname = self._image_path % (self._image_idx - 1)
            self.add_overlay(topleft, "Cap[t]ure frame", "Saved as %s" % fname)
        else:
            self.add_overlay(topleft, "Cap[t]ure frame", "")
        self.add_overlay(topleft, "Toggle geomgroup visibility", "0-4")

        self.add_overlay(bottomleft, "FPS", "%d%s" % (1 / self._time_per_render, ""))
        self.add_overlay(
            bottomleft, "Solver iterations", str(self.data.solver_iter + 1)
        )
        self.add_overlay(
            bottomleft, "Step", str(round(self.data.time / self.model.opt.timestep))
        )
        self.add_overlay(bottomleft, "timestep", "%.5f" % self.model.opt.timestep)

    def render(self):
        """Render one wall-clock-paced batch of frames to the window.

        Repeats the inner ``update`` enough times to keep simulated time in
        step with real time at the current run speed; exits the process when
        the window is closed.
        """
        # mjv_updateScene, mjr_render, mjr_overlay
        def update():
            # fill overlay items
            self._create_overlay()

            render_start = time.time()
            if self.window is None:
                return
            elif glfw.window_should_close(self.window):
                glfw.terminate()
                sys.exit(0)
            self.viewport.width, self.viewport.height = glfw.get_framebuffer_size(
                self.window
            )
            with self._gui_lock:
                # update scene
                mujoco.mjv_updateScene(
                    self.model,
                    self.data,
                    self.vopt,
                    mujoco.MjvPerturb(),
                    self.cam,
                    mujoco.mjtCatBit.mjCAT_ALL.value,
                    self.scn,
                )
                # marker items
                for marker in self._markers:
                    self._add_marker_to_scene(marker)
                # render
                mujoco.mjr_render(self.viewport, self.scn, self.con)
                # overlay items
                if not self._hide_menu:
                    for gridpos, [t1, t2] in self._overlays.items():
                        mujoco.mjr_overlay(
                            mujoco.mjtFontScale.mjFONTSCALE_150,
                            gridpos,
                            self.viewport,
                            t1,
                            t2,
                            self.con,
                        )
                glfw.swap_buffers(self.window)
            glfw.poll_events()
            # Exponential moving average of the per-frame render time, used
            # both for the FPS overlay and for real-time pacing below.
            self._time_per_render = 0.9 * self._time_per_render + 0.1 * (
                time.time() - render_start
            )

        # clear overlay
        self._overlays.clear()

        if self._paused:
            # Keep redrawing (so the UI stays responsive) until unpaused or
            # single-stepped with the right-arrow key.
            while self._paused:
                update()
                if self._advance_by_one_step:
                    self._advance_by_one_step = False
                    break
        else:
            # Number of frames needed to keep sim time matched to real time.
            self._loop_count += self.model.opt.timestep / (
                self._time_per_render * self._run_speed
            )
            if self._render_every_frame:
                self._loop_count = 1
            while self._loop_count > 0:
                update()
                self._loop_count -= 1

        # clear markers
        self._markers[:] = []

    def close(self):
        """Destroy the glfw context and terminate the process."""
        glfw.terminate()
        sys.exit(0)
@@ -5,131 +5,11 @@ from gym.envs.mujoco import mujoco_env
|
|||||||
|
|
||||||
|
|
||||||
class PusherEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
class PusherEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
"""
|
|
||||||
### Description
|
|
||||||
"Pusher" is a multi-jointed robot arm which is very similar to that of a human.
|
|
||||||
The goal is to move a target cylinder (called *object*) to a goal position using the robot's end effector (called *fingertip*).
|
|
||||||
The robot consists of shoulder, elbow, forearm, and wrist joints.
|
|
||||||
|
|
||||||
### Action Space
|
|
||||||
The action space is a `Box(-2, 2, (7,), float32)`. An action `(a, b)` represents the torques applied at the hinge joints.
|
|
||||||
|
|
||||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
|
||||||
|-----|--------------------------------------------------------------------|-------------|-------------|----------------------------------|-------|--------------|
|
|
||||||
| 0 | Rotation of the panning the shoulder | -2 | 2 | r_shoulder_pan_joint | hinge | torque (N m) |
|
|
||||||
| 1 | Rotation of the shoulder lifting joint | -2 | 2 | r_shoulder_lift_joint | hinge | torque (N m) |
|
|
||||||
| 2 | Rotation of the shoulder rolling joint | -2 | 2 | r_upper_arm_roll_joint | hinge | torque (N m) |
|
|
||||||
| 3 | Rotation of hinge joint that flexed the elbow | -2 | 2 | r_elbow_flex_joint | hinge | torque (N m) |
|
|
||||||
| 4 | Rotation of hinge that rolls the forearm | -2 | 2 | r_forearm_roll_joint | hinge | torque (N m) |
|
|
||||||
| 5 | Rotation of flexing the wrist | -2 | 2 | r_wrist_flex_joint | hinge | torque (N m) |
|
|
||||||
| 6 | Rotation of rolling the wrist | -2 | 2 | r_wrist_roll_joint | hinge | torque (N m) |
|
|
||||||
|
|
||||||
### Observation Space
|
|
||||||
|
|
||||||
Observations consist of
|
|
||||||
|
|
||||||
- Angle of rotational joints on the pusher
|
|
||||||
- Angular velocities of rotational joints on the pusher
|
|
||||||
- The coordinates of the fingertip of the pusher
|
|
||||||
- The coordinates of the object to be moved
|
|
||||||
- The coordinates of the goal position
|
|
||||||
|
|
||||||
The observation is a `ndarray` with shape `(23,)` where the elements correspond to the table below.
|
|
||||||
An analogy can be drawn to a human arm in order to help understand the state space, with the words flex and roll meaning the
|
|
||||||
same as human joints.
|
|
||||||
|
|
||||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
|
||||||
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
|
||||||
| 0 | Rotation of the panning the shoulder | -Inf | Inf | r_shoulder_pan_joint | hinge | angle (rad) |
|
|
||||||
| 1 | Rotation of the shoulder lifting joint | -Inf | Inf | r_shoulder_lift_joint | hinge | angle (rad) |
|
|
||||||
| 2 | Rotation of the shoulder rolling joint | -Inf | Inf | r_upper_arm_roll_joint | hinge | angle (rad) |
|
|
||||||
| 3 | Rotation of hinge joint that flexed the elbow | -Inf | Inf | r_elbow_flex_joint | hinge | angle (rad) |
|
|
||||||
| 4 | Rotation of hinge that rolls the forearm | -Inf | Inf | r_forearm_roll_joint | hinge | angle (rad) |
|
|
||||||
| 5 | Rotation of flexing the wrist | -Inf | Inf | r_wrist_flex_joint | hinge | angle (rad) |
|
|
||||||
| 6 | Rotation of rolling the wrist | -Inf | Inf | r_wrist_roll_joint | hinge | angle (rad) |
|
|
||||||
| 7 | Rotational velocity of the panning the shoulder | -Inf | Inf | r_shoulder_pan_joint | hinge | angular velocity (rad/s) |
|
|
||||||
| 8 | Rotational velocity of the shoulder lifting joint | -Inf | Inf | r_shoulder_lift_joint | hinge | angular velocity (rad/s) |
|
|
||||||
| 9 | Rotational velocity of the shoulder rolling joint | -Inf | Inf | r_upper_arm_roll_joint | hinge | angular velocity (rad/s) |
|
|
||||||
| 10 | Rotational velocity of hinge joint that flexed the elbow | -Inf | Inf | r_elbow_flex_joint | hinge | angular velocity (rad/s) |
|
|
||||||
| 11 | Rotational velocity of hinge that rolls the forearm | -Inf | Inf | r_forearm_roll_joint | hinge | angular velocity (rad/s) |
|
|
||||||
| 12 | Rotational velocity of flexing the wrist | -Inf | Inf | r_wrist_flex_joint | hinge | angular velocity (rad/s) |
|
|
||||||
| 13 | Rotational velocity of rolling the wrist | -Inf | Inf | r_wrist_roll_joint | hinge | angular velocity (rad/s) |
|
|
||||||
| 14 | x-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) |
|
|
||||||
| 15 | y-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) |
|
|
||||||
| 16 | z-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) |
|
|
||||||
| 17 | x-coordinate of the object to be moved | -Inf | Inf | object (obj_slidex) | slide | position (m) |
|
|
||||||
| 18 | y-coordinate of the object to be moved | -Inf | Inf | object (obj_slidey) | slide | position (m) |
|
|
||||||
| 19 | z-coordinate of the object to be moved | -Inf | Inf | object | cylinder | position (m) |
|
|
||||||
| 20 | x-coordinate of the goal position of the object | -Inf | Inf | goal (goal_slidex) | slide | position (m) |
|
|
||||||
| 21 | y-coordinate of the goal position of the object | -Inf | Inf | goal (goal_slidey) | slide | position (m) |
|
|
||||||
| 22 | z-coordinate of the goal position of the object | -Inf | Inf | goal | sphere | position (m) |
|
|
||||||
|
|
||||||
|
|
||||||
### Rewards
|
|
||||||
The reward consists of two parts:
|
|
||||||
- *reward_near *: This reward is a measure of how far the *fingertip*
|
|
||||||
of the pusher (the unattached end) is from the object, with a more negative
|
|
||||||
value assigned for when the pusher's *fingertip* is further away from the
|
|
||||||
target. It is calculated as the negative vector norm of (position of
|
|
||||||
the fingertip - position of target), or *-norm("fingertip" - "target")*.
|
|
||||||
- *reward_dist *: This reward is a measure of how far the object is from
|
|
||||||
the target goal position, with a more negative value assigned for object is
|
|
||||||
further away from the target. It is calculated as the negative vector norm of
|
|
||||||
(position of the object - position of goal), or *-norm("object" - "target")*.
|
|
||||||
- *reward_control*: A negative reward for penalising the pusher if
|
|
||||||
it takes actions that are too large. It is measured as the negative squared
|
|
||||||
Euclidean norm of the action, i.e. as *- sum(action<sup>2</sup>)*.
|
|
||||||
|
|
||||||
The total reward returned is ***reward*** *=* *reward_dist + 0.1 * reward_ctrl + 0.5 * reward_near*
|
|
||||||
|
|
||||||
Unlike other environments, Pusher does not allow you to specify weights for the individual reward terms.
|
|
||||||
However, `info` does contain the keys *reward_dist* and *reward_ctrl*. Thus, if you'd like to weight the terms,
|
|
||||||
you should create a wrapper that computes the weighted reward from `info`.
|
|
||||||
|
|
||||||
|
|
||||||
### Starting State
|
|
||||||
All pusher (not including object and goal) states start in
|
|
||||||
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0). A uniform noise in the range
|
|
||||||
[-0.005, 0.005] is added to the velocity attributes only. The velocities of
|
|
||||||
the object and goal are permanently set to 0. The object's x-position is selected uniformly
|
|
||||||
between [-0.3, 0] while the y-position is selected uniformly between [-0.2, 0.2], and this
|
|
||||||
process is repeated until the vector norm between the object's (x,y) position and origin is not greater
|
|
||||||
than 0.17. The goal always has the same position of (0.45, -0.05, -0.323).
|
|
||||||
|
|
||||||
The default framerate is 5 with each frame lasting for 0.01, giving rise to a *dt = 5 * 0.01 = 0.05*
|
|
||||||
|
|
||||||
### Episode Termination
|
|
||||||
|
|
||||||
The episode terminates when any of the following happens:
|
|
||||||
|
|
||||||
1. The episode duration reaches a 100 timesteps.
|
|
||||||
2. Any of the state space values is no longer finite.
|
|
||||||
|
|
||||||
### Arguments
|
|
||||||
|
|
||||||
No additional arguments are currently supported (in v2 and lower),
|
|
||||||
but modifications can be made to the XML file in the assets folder
|
|
||||||
(or by changing the path to a modified XML file in another folder)..
|
|
||||||
|
|
||||||
```
|
|
||||||
env = gym.make('Pusher-v2')
|
|
||||||
```
|
|
||||||
|
|
||||||
There is no v3 for Pusher, unlike the robot environments where a v3 and
|
|
||||||
beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
|
||||||
|
|
||||||
|
|
||||||
### Version History
|
|
||||||
|
|
||||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
|
||||||
* v1: max_time_steps raised to 1000 for robot based tasks (not including reacher, which has a max_time_steps of 50). Added reward_threshold to environments.
|
|
||||||
* v0: Initial versions release (1.0.0)
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
utils.EzPickle.__init__(self)
|
utils.EzPickle.__init__(self)
|
||||||
mujoco_env.MujocoEnv.__init__(self, "pusher.xml", 5)
|
mujoco_env.MujocoEnv.__init__(
|
||||||
|
self, "pusher.xml", 5, mujoco_bindings="mujoco_py"
|
||||||
|
)
|
||||||
|
|
||||||
def step(self, a):
|
def step(self, a):
|
||||||
vec_1 = self.get_body_com("object") - self.get_body_com("tips_arm")
|
vec_1 = self.get_body_com("object") - self.get_body_com("tips_arm")
|
||||||
|
185
gym/envs/mujoco/pusher_v4.py
Normal file
185
gym/envs/mujoco/pusher_v4.py
Normal file
@@ -0,0 +1,185 @@
|
|||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from gym import utils
|
||||||
|
from gym.envs.mujoco import mujoco_env
|
||||||
|
|
||||||
|
|
||||||
|
class PusherEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
|
"""
|
||||||
|
### Description
|
||||||
|
"Pusher" is a multi-jointed robot arm which is very similar to that of a human.
|
||||||
|
The goal is to move a target cylinder (called *object*) to a goal position using the robot's end effector (called *fingertip*).
|
||||||
|
The robot consists of shoulder, elbow, forearm, and wrist joints.
|
||||||
|
|
||||||
|
### Action Space
|
||||||
|
The action space is a `Box(-2, 2, (7,), float32)`. An action `(a, b)` represents the torques applied at the hinge joints.
|
||||||
|
|
||||||
|
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||||
|
|-----|--------------------------------------------------------------------|-------------|-------------|----------------------------------|-------|--------------|
|
||||||
|
| 0 | Rotation of the panning the shoulder | -2 | 2 | r_shoulder_pan_joint | hinge | torque (N m) |
|
||||||
|
| 1 | Rotation of the shoulder lifting joint | -2 | 2 | r_shoulder_lift_joint | hinge | torque (N m) |
|
||||||
|
| 2 | Rotation of the shoulder rolling joint | -2 | 2 | r_upper_arm_roll_joint | hinge | torque (N m) |
|
||||||
|
| 3 | Rotation of hinge joint that flexed the elbow | -2 | 2 | r_elbow_flex_joint | hinge | torque (N m) |
|
||||||
|
| 4 | Rotation of hinge that rolls the forearm | -2 | 2 | r_forearm_roll_joint | hinge | torque (N m) |
|
||||||
|
| 5 | Rotation of flexing the wrist | -2 | 2 | r_wrist_flex_joint | hinge | torque (N m) |
|
||||||
|
| 6 | Rotation of rolling the wrist | -2 | 2 | r_wrist_roll_joint | hinge | torque (N m) |
|
||||||
|
|
||||||
|
### Observation Space
|
||||||
|
|
||||||
|
Observations consist of
|
||||||
|
|
||||||
|
- Angle of rotational joints on the pusher
|
||||||
|
- Anglular velocities of rotational joints on the pusher
|
||||||
|
- The coordinates of the fingertip of the pusher
|
||||||
|
- The coordinates of the object to be moved
|
||||||
|
- The coordinates of the goal position
|
||||||
|
|
||||||
|
The observation is a `ndarray` with shape `(23,)` where the elements correspond to the table below.
|
||||||
|
An analogy can be drawn to a human arm in order to help understand the state space, with the words flex and roll meaning the
|
||||||
|
same as human joints.
|
||||||
|
|
||||||
|
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
||||||
|
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
||||||
|
| 0 | Rotation of the panning the shoulder | -Inf | Inf | r_shoulder_pan_joint | hinge | angle (rad) |
|
||||||
|
| 1 | Rotation of the shoulder lifting joint | -Inf | Inf | r_shoulder_lift_joint | hinge | angle (rad) |
|
||||||
|
| 2 | Rotation of the shoulder rolling joint | -Inf | Inf | r_upper_arm_roll_joint | hinge | angle (rad) |
|
||||||
|
| 3 | Rotation of hinge joint that flexed the elbow | -Inf | Inf | r_elbow_flex_joint | hinge | angle (rad) |
|
||||||
|
| 4 | Rotation of hinge that rolls the forearm | -Inf | Inf | r_forearm_roll_joint | hinge | angle (rad) |
|
||||||
|
| 5 | Rotation of flexing the wrist | -Inf | Inf | r_wrist_flex_joint | hinge | angle (rad) |
|
||||||
|
| 6 | Rotation of rolling the wrist | -Inf | Inf | r_wrist_roll_joint | hinge | angle (rad) |
|
||||||
|
| 7 | Rotational velocity of the panning the shoulder | -Inf | Inf | r_shoulder_pan_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 8 | Rotational velocity of the shoulder lifting joint | -Inf | Inf | r_shoulder_lift_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 9 | Rotational velocity of the shoulder rolling joint | -Inf | Inf | r_upper_arm_roll_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 10 | Rotational velocity of hinge joint that flexed the elbow | -Inf | Inf | r_elbow_flex_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 11 | Rotational velocity of hinge that rolls the forearm | -Inf | Inf | r_forearm_roll_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 12 | Rotational velocity of flexing the wrist | -Inf | Inf | r_wrist_flex_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 13 | Rotational velocity of rolling the wrist | -Inf | Inf | r_wrist_roll_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 14 | x-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) |
|
||||||
|
| 15 | y-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) |
|
||||||
|
| 16 | z-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) |
|
||||||
|
| 17 | x-coordinate of the object to be moved | -Inf | Inf | object (obj_slidex) | slide | position (m) |
|
||||||
|
| 18 | y-coordinate of the object to be moved | -Inf | Inf | object (obj_slidey) | slide | position (m) |
|
||||||
|
| 19 | z-coordinate of the object to be moved | -Inf | Inf | object | cylinder | position (m) |
|
||||||
|
| 20 | x-coordinate of the goal position of the object | -Inf | Inf | goal (goal_slidex) | slide | position (m) |
|
||||||
|
| 21 | y-coordinate of the goal position of the object | -Inf | Inf | goal (goal_slidey) | slide | position (m) |
|
||||||
|
| 22 | z-coordinate of the goal position of the object | -Inf | Inf | goal | sphere | position (m) |
|
||||||
|
|
||||||
|
|
||||||
|
### Rewards
|
||||||
|
The reward consists of two parts:
|
||||||
|
- *reward_near *: This reward is a measure of how far the *fingertip*
|
||||||
|
of the pusher (the unattached end) is from the object, with a more negative
|
||||||
|
value assigned for when the pusher's *fingertip* is further away from the
|
||||||
|
target. It is calculated as the negative vector norm of (position of
|
||||||
|
the fingertip - position of target), or *-norm("fingertip" - "target")*.
|
||||||
|
- *reward_dist *: This reward is a measure of how far the object is from
|
||||||
|
the target goal position, with a more negative value assigned for object is
|
||||||
|
further away from the target. It is calculated as the negative vector norm of
|
||||||
|
(position of the object - position of goal), or *-norm("object" - "target")*.
|
||||||
|
- *reward_control*: A negative reward for penalising the pusher if
|
||||||
|
it takes actions that are too large. It is measured as the negative squared
|
||||||
|
Euclidean norm of the action, i.e. as *- sum(action<sup>2</sup>)*.
|
||||||
|
|
||||||
|
The total reward returned is ***reward*** *=* *reward_dist + 0.1 * reward_ctrl + 0.5 * reward_near*
|
||||||
|
|
||||||
|
Unlike other environments, Pusher does not allow you to specify weights for the individual reward terms.
|
||||||
|
However, `info` does contain the keys *reward_dist* and *reward_ctrl*. Thus, if you'd like to weight the terms,
|
||||||
|
you should create a wrapper that computes the weighted reward from `info`.
|
||||||
|
|
||||||
|
|
||||||
|
### Starting State
|
||||||
|
All pusher (not including object and goal) states start in
|
||||||
|
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0). A uniform noise in the range
|
||||||
|
[-0.005, 0.005] is added to the velocity attributes only. The velocities of
|
||||||
|
the object and goal are permanently set to 0. The object's x-position is selected uniformly
|
||||||
|
between [-0.3, 0] while the y-position is selected uniformly between [-0.2, 0.2], and this
|
||||||
|
process is repeated until the vector norm between the object's (x,y) position and origin is not greater
|
||||||
|
than 0.17. The goal always have the same position of (0.45, -0.05, -0.323).
|
||||||
|
|
||||||
|
The default framerate is 5 with each frame lasting for 0.01, giving rise to a *dt = 5 * 0.01 = 0.05*
|
||||||
|
|
||||||
|
### Episode Termination
|
||||||
|
|
||||||
|
The episode terminates when any of the following happens:
|
||||||
|
|
||||||
|
1. The episode duration reaches a 100 timesteps.
|
||||||
|
2. Any of the state space values is no longer finite.
|
||||||
|
|
||||||
|
### Arguments
|
||||||
|
|
||||||
|
No additional arguments are currently supported (in v2 and lower),
|
||||||
|
but modifications can be made to the XML file in the assets folder
|
||||||
|
(or by changing the path to a modified XML file in another folder)..
|
||||||
|
|
||||||
|
```
|
||||||
|
env = gym.make('Pusher-v2')
|
||||||
|
```
|
||||||
|
|
||||||
|
There is no v3 for Pusher, unlike the robot environments where a v3 and
|
||||||
|
beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||||
|
|
||||||
|
|
||||||
|
### Version History
|
||||||
|
|
||||||
|
* v4: all mujoco environments now use the mujoco binidings in mujoco>=2.1.3
|
||||||
|
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||||
|
* v1: max_time_steps raised to 1000 for robot based tasks (not including reacher, which has a max_time_steps of 50). Added reward_threshold to environments.
|
||||||
|
* v0: Initial versions release (1.0.0)
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
utils.EzPickle.__init__(self)
|
||||||
|
mujoco_env.MujocoEnv.__init__(self, "pusher.xml", 5)
|
||||||
|
|
||||||
|
def step(self, a):
|
||||||
|
vec_1 = self.get_body_com("object") - self.get_body_com("tips_arm")
|
||||||
|
vec_2 = self.get_body_com("object") - self.get_body_com("goal")
|
||||||
|
|
||||||
|
reward_near = -np.linalg.norm(vec_1)
|
||||||
|
reward_dist = -np.linalg.norm(vec_2)
|
||||||
|
reward_ctrl = -np.square(a).sum()
|
||||||
|
reward = reward_dist + 0.1 * reward_ctrl + 0.5 * reward_near
|
||||||
|
|
||||||
|
self.do_simulation(a, self.frame_skip)
|
||||||
|
ob = self._get_obs()
|
||||||
|
done = False
|
||||||
|
return ob, reward, done, dict(reward_dist=reward_dist, reward_ctrl=reward_ctrl)
|
||||||
|
|
||||||
|
    def viewer_setup(self):
        """Configure the render camera: free camera (no body tracking), distance 4."""
        # trackbodyid=-1 detaches the camera from any particular body.
        self.viewer.cam.trackbodyid = -1
        self.viewer.cam.distance = 4.0
|
||||||
|
|
||||||
|
def reset_model(self):
|
||||||
|
qpos = self.init_qpos
|
||||||
|
|
||||||
|
self.goal_pos = np.asarray([0, 0])
|
||||||
|
while True:
|
||||||
|
self.cylinder_pos = np.concatenate(
|
||||||
|
[
|
||||||
|
self.np_random.uniform(low=-0.3, high=0, size=1),
|
||||||
|
self.np_random.uniform(low=-0.2, high=0.2, size=1),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
if np.linalg.norm(self.cylinder_pos - self.goal_pos) > 0.17:
|
||||||
|
break
|
||||||
|
|
||||||
|
qpos[-4:-2] = self.cylinder_pos
|
||||||
|
qpos[-2:] = self.goal_pos
|
||||||
|
qvel = self.init_qvel + self.np_random.uniform(
|
||||||
|
low=-0.005, high=0.005, size=self.model.nv
|
||||||
|
)
|
||||||
|
qvel[-4:] = 0
|
||||||
|
self.set_state(qpos, qvel)
|
||||||
|
return self._get_obs()
|
||||||
|
|
||||||
|
def _get_obs(self):
|
||||||
|
return np.concatenate(
|
||||||
|
[
|
||||||
|
self.data.qpos.flat[:7],
|
||||||
|
self.data.qvel.flat[:7],
|
||||||
|
self.get_body_com("tips_arm"),
|
||||||
|
self.get_body_com("object"),
|
||||||
|
self.get_body_com("goal"),
|
||||||
|
]
|
||||||
|
)
|
@@ -5,121 +5,11 @@ from gym.envs.mujoco import mujoco_env
|
|||||||
|
|
||||||
|
|
||||||
class ReacherEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
class ReacherEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
"""
|
|
||||||
### Description
|
|
||||||
"Reacher" is a two-jointed robot arm. The goal is to move the robot's end effector (called *fingertip*) close to a
|
|
||||||
target that is spawned at a random position.
|
|
||||||
|
|
||||||
### Action Space
|
|
||||||
The action space is a `Box(-1, 1, (2,), float32)`. An action `(a, b)` represents the torques applied at the hinge joints.
|
|
||||||
|
|
||||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
|
||||||
|-----|---------------------------------------------------------------------------------|-------------|-------------|--------------------------|-------|------|
|
|
||||||
| 0 | Torque applied at the first hinge (connecting the link to the point of fixture) | -1 | 1 | joint0 | hinge | torque (N m) |
|
|
||||||
| 1 | Torque applied at the second hinge (connecting the two links) | -1 | 1 | joint1 | hinge | torque (N m) |
|
|
||||||
|
|
||||||
### Observation Space
|
|
||||||
|
|
||||||
Observations consist of
|
|
||||||
|
|
||||||
- The cosine of the angles of the two arms
|
|
||||||
- The sine of the angles of the two arms
|
|
||||||
- The coordinates of the target
|
|
||||||
- The angular velocities of the arms
|
|
||||||
- The vector between the target and the reacher's fingertip (3 dimensional with the last element being 0)
|
|
||||||
|
|
||||||
The observation is a `ndarray` with shape `(11,)` where the elements correspond to the following:
|
|
||||||
|
|
||||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
|
||||||
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
|
||||||
| 0 | cosine of the angle of the first arm | -Inf | Inf | cos(joint0) | hinge | unitless |
|
|
||||||
| 1 | cosine of the angle of the second arm | -Inf | Inf | cos(joint1) | hinge | unitless |
|
|
||||||
| 2 | sine of the angle of the first arm | -Inf | Inf | cos(joint0) | hinge | unitless |
|
|
||||||
| 3 | sine of the angle of the second arm | -Inf | Inf | cos(joint1) | hinge | unitless |
|
|
||||||
| 4 | x-coorddinate of the target | -Inf | Inf | target_x | slide | position (m) |
|
|
||||||
| 5 | y-coorddinate of the target | -Inf | Inf | target_y | slide | position (m) |
|
|
||||||
| 6 | angular velocity of the first arm | -Inf | Inf | joint0 | hinge | angular velocity (rad/s) |
|
|
||||||
| 7 | angular velocity of the second arm | -Inf | Inf | joint1 | hinge | angular velocity (rad/s) |
|
|
||||||
| 8 | x-value of position_fingertip - position_target | -Inf | Inf | NA | slide | position (m) |
|
|
||||||
| 9 | y-value of position_fingertip - position_target | -Inf | Inf | NA | slide | position (m) |
|
|
||||||
| 10 | z-value of position_fingertip - position_target (0 since reacher is 2d and z is same for both) | -Inf | Inf | NA | slide | position (m) |
|
|
||||||
|
|
||||||
|
|
||||||
Most Gym environments just return the positions and velocity of the
|
|
||||||
joints in the `.xml` file as the state of the environment. However, in
|
|
||||||
reacher the state is created by combining only certain elements of the
|
|
||||||
position and velocity, and performing some function transformations on them.
|
|
||||||
If one is to read the `.xml` for reacher then they will find 4 joints:
|
|
||||||
|
|
||||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|
|
||||||
|-----|-----------------------------|----------|----------|----------------------------------|-------|--------------------|
|
|
||||||
| 0 | angle of the first arm | -Inf | Inf | joint0 | hinge | angle (rad) |
|
|
||||||
| 1 | angle of the second arm | -Inf | Inf | joint1 | hinge | angle (rad) |
|
|
||||||
| 2 | x-coordinate of the target | -Inf | Inf | target_x | slide | position (m) |
|
|
||||||
| 3 | y-coordinate of the target | -Inf | Inf | target_y | slide | position (m) |
|
|
||||||
|
|
||||||
|
|
||||||
### Rewards
|
|
||||||
The reward consists of two parts:
|
|
||||||
- *reward_distance*: This reward is a measure of how far the *fingertip*
|
|
||||||
of the reacher (the unattached end) is from the target, with a more negative
|
|
||||||
value assigned for when the reacher's *fingertip* is further away from the
|
|
||||||
target. It is calculated as the negative vector norm of (position of
|
|
||||||
the fingertip - position of target), or *-norm("fingertip" - "target")*.
|
|
||||||
- *reward_control*: A negative reward for penalising the walker if
|
|
||||||
it takes actions that are too large. It is measured as the negative squared
|
|
||||||
Euclidean norm of the action, i.e. as *- sum(action<sup>2</sup>)*.
|
|
||||||
|
|
||||||
The total reward returned is ***reward*** *=* *reward_distance + reward_control*
|
|
||||||
|
|
||||||
Unlike other environments, Reacher does not allow you to specify weights for the individual reward terms.
|
|
||||||
However, `info` does contain the keys *reward_dist* and *reward_ctrl*. Thus, if you'd like to weight the terms,
|
|
||||||
you should create a wrapper that computes the weighted reward from `info`.
|
|
||||||
|
|
||||||
|
|
||||||
### Starting State
|
|
||||||
All observations start in state
|
|
||||||
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
|
|
||||||
with a noise added for stochasticity. A uniform noise in the range
|
|
||||||
[-0.1, 0.1] is added to the positional attributes, while the target position
|
|
||||||
is selected uniformly at random in a disk of radius 0.2 around the origin.
|
|
||||||
Independent, uniform noise in the
|
|
||||||
range of [-0.005, 0.005] is added to the velocities, and the last
|
|
||||||
element ("fingertip" - "target") is calculated at the end once everything
|
|
||||||
is set. The default setting has a framerate of 2 and a *dt = 2 * 0.01 = 0.02*
|
|
||||||
|
|
||||||
### Episode Termination
|
|
||||||
|
|
||||||
The episode terminates when any of the following happens:
|
|
||||||
|
|
||||||
1. The episode duration reaches a 50 timesteps (with a new random target popping up if the reacher's fingertip reaches it before 50 timesteps)
|
|
||||||
2. Any of the state space values is no longer finite.
|
|
||||||
|
|
||||||
### Arguments
|
|
||||||
|
|
||||||
No additional arguments are currently supported (in v2 and lower),
|
|
||||||
but modifications can be made to the XML file in the assets folder
|
|
||||||
(or by changing the path to a modified XML file in another folder)..
|
|
||||||
|
|
||||||
```
|
|
||||||
env = gym.make('Reacher-v2')
|
|
||||||
```
|
|
||||||
|
|
||||||
There is no v3 for Reacher, unlike the robot environments where a v3 and
|
|
||||||
beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
|
||||||
|
|
||||||
|
|
||||||
### Version History
|
|
||||||
|
|
||||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
|
||||||
* v1: max_time_steps raised to 1000 for robot based tasks (not including reacher, which has a max_time_steps of 50). Added reward_threshold to environments.
|
|
||||||
* v0: Initial versions release (1.0.0)
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
utils.EzPickle.__init__(self)
|
utils.EzPickle.__init__(self)
|
||||||
mujoco_env.MujocoEnv.__init__(self, "reacher.xml", 2)
|
mujoco_env.MujocoEnv.__init__(
|
||||||
|
self, "reacher.xml", 2, mujoco_bindings="mujoco_py"
|
||||||
|
)
|
||||||
|
|
||||||
def step(self, a):
|
def step(self, a):
|
||||||
vec = self.get_body_com("fingertip") - self.get_body_com("target")
|
vec = self.get_body_com("fingertip") - self.get_body_com("target")
|
||||||
|
165
gym/envs/mujoco/reacher_v4.py
Normal file
165
gym/envs/mujoco/reacher_v4.py
Normal file
@@ -0,0 +1,165 @@
|
|||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from gym import utils
|
||||||
|
from gym.envs.mujoco import mujoco_env
|
||||||
|
|
||||||
|
|
||||||
|
class ReacherEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
|
"""
|
||||||
|
### Description
|
||||||
|
"Reacher" is a two-jointed robot arm. The goal is to move the robot's end effector (called *fingertip*) close to a
|
||||||
|
target that is spawned at a random position.
|
||||||
|
|
||||||
|
### Action Space
|
||||||
|
The action space is a `Box(-1, 1, (2,), float32)`. An action `(a, b)` represents the torques applied at the hinge joints.
|
||||||
|
|
||||||
|
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||||
|
|-----|---------------------------------------------------------------------------------|-------------|-------------|--------------------------|-------|------|
|
||||||
|
| 0 | Torque applied at the first hinge (connecting the link to the point of fixture) | -1 | 1 | joint0 | hinge | torque (N m) |
|
||||||
|
| 1 | Torque applied at the second hinge (connecting the two links) | -1 | 1 | joint1 | hinge | torque (N m) |
|
||||||
|
|
||||||
|
### Observation Space
|
||||||
|
|
||||||
|
Observations consist of
|
||||||
|
|
||||||
|
- The cosine of the angles of the two arms
|
||||||
|
- The sine of the angles of the two arms
|
||||||
|
- The coordinates of the target
|
||||||
|
- The angular velocities of the arms
|
||||||
|
- The vector between the target and the reacher's fingertip (3 dimensional with the last element being 0)
|
||||||
|
|
||||||
|
The observation is a `ndarray` with shape `(11,)` where the elements correspond to the following:
|
||||||
|
|
||||||
|
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
||||||
|
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
||||||
|
| 0 | cosine of the angle of the first arm | -Inf | Inf | cos(joint0) | hinge | unitless |
|
||||||
|
| 1 | cosine of the angle of the second arm | -Inf | Inf | cos(joint1) | hinge | unitless |
|
||||||
|
| 2 | sine of the angle of the first arm | -Inf | Inf | cos(joint0) | hinge | unitless |
|
||||||
|
| 3 | sine of the angle of the second arm | -Inf | Inf | cos(joint1) | hinge | unitless |
|
||||||
|
    | 4   | x-coordinate of the target | -Inf | Inf | target_x | slide | position (m) |
|
||||||
|
    | 5   | y-coordinate of the target | -Inf | Inf | target_y | slide | position (m) |
|
||||||
|
| 6 | angular velocity of the first arm | -Inf | Inf | joint0 | hinge | angular velocity (rad/s) |
|
||||||
|
| 7 | angular velocity of the second arm | -Inf | Inf | joint1 | hinge | angular velocity (rad/s) |
|
||||||
|
| 8 | x-value of position_fingertip - position_target | -Inf | Inf | NA | slide | position (m) |
|
||||||
|
| 9 | y-value of position_fingertip - position_target | -Inf | Inf | NA | slide | position (m) |
|
||||||
|
| 10 | z-value of position_fingertip - position_target (0 since reacher is 2d and z is same for both) | -Inf | Inf | NA | slide | position (m) |
|
||||||
|
|
||||||
|
|
||||||
|
Most Gym environments just return the positions and velocity of the
|
||||||
|
joints in the `.xml` file as the state of the environment. However, in
|
||||||
|
reacher the state is created by combining only certain elements of the
|
||||||
|
position and velocity, and performing some function transformations on them.
|
||||||
|
If one is to read the `.xml` for reacher then they will find 4 joints:
|
||||||
|
|
||||||
|
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|
||||||
|
|-----|-----------------------------|----------|----------|----------------------------------|-------|--------------------|
|
||||||
|
| 0 | angle of the first arm | -Inf | Inf | joint0 | hinge | angle (rad) |
|
||||||
|
| 1 | angle of the second arm | -Inf | Inf | joint1 | hinge | angle (rad) |
|
||||||
|
| 2 | x-coordinate of the target | -Inf | Inf | target_x | slide | position (m) |
|
||||||
|
| 3 | y-coordinate of the target | -Inf | Inf | target_y | slide | position (m) |
|
||||||
|
|
||||||
|
|
||||||
|
### Rewards
|
||||||
|
The reward consists of two parts:
|
||||||
|
- *reward_distance*: This reward is a measure of how far the *fingertip*
|
||||||
|
of the reacher (the unattached end) is from the target, with a more negative
|
||||||
|
value assigned for when the reacher's *fingertip* is further away from the
|
||||||
|
target. It is calculated as the negative vector norm of (position of
|
||||||
|
the fingertip - position of target), or *-norm("fingertip" - "target")*.
|
||||||
|
- *reward_control*: A negative reward for penalising the walker if
|
||||||
|
it takes actions that are too large. It is measured as the negative squared
|
||||||
|
Euclidean norm of the action, i.e. as *- sum(action<sup>2</sup>)*.
|
||||||
|
|
||||||
|
The total reward returned is ***reward*** *=* *reward_distance + reward_control*
|
||||||
|
|
||||||
|
Unlike other environments, Reacher does not allow you to specify weights for the individual reward terms.
|
||||||
|
However, `info` does contain the keys *reward_dist* and *reward_ctrl*. Thus, if you'd like to weight the terms,
|
||||||
|
you should create a wrapper that computes the weighted reward from `info`.
|
||||||
|
|
||||||
|
|
||||||
|
### Starting State
|
||||||
|
All observations start in state
|
||||||
|
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
|
||||||
|
with a noise added for stochasticity. A uniform noise in the range
|
||||||
|
[-0.1, 0.1] is added to the positional attributes, while the target position
|
||||||
|
is selected uniformly at random in a disk of radius 0.2 around the origin.
|
||||||
|
Independent, uniform noise in the
|
||||||
|
range of [-0.005, 0.005] is added to the velocities, and the last
|
||||||
|
element ("fingertip" - "target") is calculated at the end once everything
|
||||||
|
is set. The default setting has a framerate of 2 and a *dt = 2 * 0.01 = 0.02*
|
||||||
|
|
||||||
|
### Episode Termination
|
||||||
|
|
||||||
|
The episode terminates when any of the following happens:
|
||||||
|
|
||||||
|
1. The episode duration reaches 50 timesteps (with a new random target popping up if the reacher's fingertip reaches it before 50 timesteps)
|
||||||
|
2. Any of the state space values is no longer finite.
|
||||||
|
|
||||||
|
### Arguments
|
||||||
|
|
||||||
|
No additional arguments are currently supported (in v2 and lower),
|
||||||
|
but modifications can be made to the XML file in the assets folder
|
||||||
|
(or by changing the path to a modified XML file in another folder)..
|
||||||
|
|
||||||
|
```
|
||||||
|
env = gym.make('Reacher-v2')
|
||||||
|
```
|
||||||
|
|
||||||
|
There is no v3 for Reacher, unlike the robot environments where a v3 and
|
||||||
|
beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||||
|
|
||||||
|
|
||||||
|
### Version History
|
||||||
|
|
||||||
|
* v4: all mujoco environments now use the mujoco bindings in mujoco>=2.1.3
|
||||||
|
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||||
|
* v1: max_time_steps raised to 1000 for robot based tasks (not including reacher, which has a max_time_steps of 50). Added reward_threshold to environments.
|
||||||
|
* v0: Initial versions release (1.0.0)
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
    def __init__(self):
        """Create the Reacher environment from ``reacher.xml`` with a frame skip of 2."""
        utils.EzPickle.__init__(self)
        # frame_skip=2: each env step advances the MuJoCo simulation by 2 frames.
        mujoco_env.MujocoEnv.__init__(self, "reacher.xml", 2)
|
||||||
|
|
||||||
|
def step(self, a):
|
||||||
|
vec = self.get_body_com("fingertip") - self.get_body_com("target")
|
||||||
|
reward_dist = -np.linalg.norm(vec)
|
||||||
|
reward_ctrl = -np.square(a).sum()
|
||||||
|
reward = reward_dist + reward_ctrl
|
||||||
|
self.do_simulation(a, self.frame_skip)
|
||||||
|
ob = self._get_obs()
|
||||||
|
done = False
|
||||||
|
return ob, reward, done, dict(reward_dist=reward_dist, reward_ctrl=reward_ctrl)
|
||||||
|
|
||||||
|
    def viewer_setup(self):
        """Make the render camera track body id 0."""
        self.viewer.cam.trackbodyid = 0
|
||||||
|
|
||||||
|
    def reset_model(self):
        """Randomise the arm pose and target location; return the first observation.

        NOTE: the sequence of ``np_random`` calls below is part of the seeded
        reproducibility of the environment — do not reorder the sampling.
        """
        # Arm joints: initial pose plus uniform noise in [-0.1, 0.1].
        qpos = (
            self.np_random.uniform(low=-0.1, high=0.1, size=self.model.nq)
            + self.init_qpos
        )
        # Rejection-sample the target uniformly from the disk of radius 0.2.
        while True:
            self.goal = self.np_random.uniform(low=-0.2, high=0.2, size=2)
            if np.linalg.norm(self.goal) < 0.2:
                break
        # Last two qpos entries are the target's (x, y) slide joints.
        qpos[-2:] = self.goal
        qvel = self.init_qvel + self.np_random.uniform(
            low=-0.005, high=0.005, size=self.model.nv
        )
        # The target does not move.
        qvel[-2:] = 0
        self.set_state(qpos, qvel)
        return self._get_obs()
|
||||||
|
|
||||||
|
def _get_obs(self):
|
||||||
|
theta = self.data.qpos.flat[:2]
|
||||||
|
return np.concatenate(
|
||||||
|
[
|
||||||
|
np.cos(theta),
|
||||||
|
np.sin(theta),
|
||||||
|
self.data.qpos.flat[2:],
|
||||||
|
self.data.qvel.flat[:2],
|
||||||
|
self.get_body_com("fingertip") - self.get_body_com("target"),
|
||||||
|
]
|
||||||
|
)
|
@@ -6,7 +6,9 @@ from gym.envs.mujoco import mujoco_env
|
|||||||
|
|
||||||
class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
mujoco_env.MujocoEnv.__init__(self, "swimmer.xml", 4)
|
mujoco_env.MujocoEnv.__init__(
|
||||||
|
self, "swimmer.xml", 4, mujoco_bindings="mujoco_py"
|
||||||
|
)
|
||||||
utils.EzPickle.__init__(self)
|
utils.EzPickle.__init__(self)
|
||||||
|
|
||||||
def step(self, a):
|
def step(self, a):
|
||||||
|
@@ -9,119 +9,6 @@ DEFAULT_CAMERA_CONFIG = {}
|
|||||||
|
|
||||||
|
|
||||||
class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
"""
|
|
||||||
### Description
|
|
||||||
|
|
||||||
This environment corresponds to the Swimmer environment described in Rémi Coulom's PhD thesis
|
|
||||||
["Reinforcement Learning Using Neural Networks, with Applications to Motor Control"](https://tel.archives-ouvertes.fr/tel-00003985/document).
|
|
||||||
The environment aims to increase the number of independent state and control
|
|
||||||
variables as compared to the classic control environments. The swimmers
|
|
||||||
consist of three or more segments ('***links***') and one less articulation
|
|
||||||
joints ('***rotors***') - one rotor joint connecting exactly two links to
|
|
||||||
form a linear chain. The swimmer is suspended in a two dimensional pool and
|
|
||||||
always starts in the same position (subject to some deviation drawn from an
|
|
||||||
uniform distribution), and the goal is to move as fast as possible towards
|
|
||||||
the right by applying torque on the rotors and using the fluids friction.
|
|
||||||
|
|
||||||
### Notes
|
|
||||||
|
|
||||||
The problem parameters are:
|
|
||||||
Problem parameters:
|
|
||||||
* *n*: number of body parts
|
|
||||||
* *m<sub>i</sub>*: mass of part *i* (*i* ∈ {1...n})
|
|
||||||
* *l<sub>i</sub>*: length of part *i* (*i* ∈ {1...n})
|
|
||||||
* *k*: viscous-friction coefficient
|
|
||||||
|
|
||||||
While the default environment has *n* = 3, *l<sub>i</sub>* = 0.1,
|
|
||||||
and *k* = 0.1. It is possible to pass a custom MuJoCo XML file during construction to increase the
|
|
||||||
number of links, or to tweak any of the parameters.
|
|
||||||
|
|
||||||
### Action Space
|
|
||||||
The action space is a `Box(-1, 1, (2,), float32)`. An action represents the torques applied between *links*
|
|
||||||
|
|
||||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
|
||||||
|-----|------------------------------------|-------------|-------------|----------------------------------|-------|--------------|
|
|
||||||
| 0 | Torque applied on the first rotor | -1 | 1 | rot2 | hinge | torque (N m) |
|
|
||||||
| 1 | Torque applied on the second rotor | -1 | 1 | rot3 | hinge | torque (N m) |
|
|
||||||
|
|
||||||
### Observation Space
|
|
||||||
|
|
||||||
By default, observations consists of:
|
|
||||||
* θ<sub>i</sub>: angle of part *i* with respect to the *x* axis
|
|
||||||
* θ<sub>i</sub>': its derivative with respect to time (angular velocity)
|
|
||||||
|
|
||||||
In the default case, observations do not include the x- and y-coordinates of the front tip. These may
|
|
||||||
be included by passing `exclude_current_positions_from_observation=False` during construction.
|
|
||||||
Then, the observation space will have 10 dimensions where the first two dimensions
|
|
||||||
represent the x- and y-coordinates of the front tip.
|
|
||||||
Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates
|
|
||||||
will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively.
|
|
||||||
|
|
||||||
By default, the observation is a `ndarray` with shape `(8,)` where the elements correspond to the following:
|
|
||||||
|
|
||||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
|
||||||
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
|
||||||
| 0 | angle of the front tip | -Inf | Inf | rot | hinge | angle (rad) |
|
|
||||||
| 1 | angle of the second rotor | -Inf | Inf | rot2 | hinge | angle (rad) |
|
|
||||||
| 2 | angle of the second rotor | -Inf | Inf | rot3 | hinge | angle (rad) |
|
|
||||||
| 3 | velocity of the tip along the x-axis | -Inf | Inf | slider1 | slide | velocity (m/s) |
|
|
||||||
| 4 | velocity of the tip along the y-axis | -Inf | Inf | slider2 | slide | velocity (m/s) |
|
|
||||||
| 5 | angular velocity of front tip | -Inf | Inf | rot | hinge | angular velocity (rad/s) |
|
|
||||||
| 6 | angular velocity of second rotor | -Inf | Inf | rot2 | hinge | angular velocity (rad/s) |
|
|
||||||
| 7 | angular velocity of third rotor | -Inf | Inf | rot3 | hinge | angular velocity (rad/s) |
|
|
||||||
|
|
||||||
### Rewards
|
|
||||||
The reward consists of two parts:
|
|
||||||
- *forward_reward*: A reward of moving forward which is measured
|
|
||||||
as *`forward_reward_weight` * (x-coordinate before action - x-coordinate after action)/dt*. *dt* is
|
|
||||||
the time between actions and is dependent on the frame_skip parameter
|
|
||||||
(default is 4), where the frametime is 0.01 - making the
|
|
||||||
default *dt = 4 * 0.01 = 0.04*. This reward would be positive if the swimmer
|
|
||||||
swims right as desired.
|
|
||||||
- *ctrl_cost*: A cost for penalising the swimmer if it takes
|
|
||||||
actions that are too large. It is measured as *`ctrl_cost_weight` *
|
|
||||||
sum(action<sup>2</sup>)* where *`ctrl_cost_weight`* is a parameter set for the
|
|
||||||
control and has a default value of 1e-4
|
|
||||||
|
|
||||||
The total reward returned is ***reward*** *=* *forward_reward - ctrl_cost* and `info` will also contain the individual reward terms
|
|
||||||
|
|
||||||
### Starting State
|
|
||||||
All observations start in state (0,0,0,0,0,0,0,0) with a Uniform noise in the range of [-`reset_noise_scale`, `reset_noise_scale`] is added to the initial state for stochasticity.
|
|
||||||
|
|
||||||
### Episode Termination
|
|
||||||
The episode terminates when the episode length is greater than 1000.
|
|
||||||
|
|
||||||
### Arguments
|
|
||||||
|
|
||||||
No additional arguments are currently supported in v2 and lower.
|
|
||||||
|
|
||||||
```
|
|
||||||
gym.make('Swimmer-v2')
|
|
||||||
```
|
|
||||||
|
|
||||||
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
|
||||||
|
|
||||||
```
|
|
||||||
env = gym.make('Swimmer-v3', ctrl_cost_weight=0.1, ....)
|
|
||||||
```
|
|
||||||
|
|
||||||
| Parameter | Type | Default |Description |
|
|
||||||
|-------------------------|------------|--------------|-------------------------------|
|
|
||||||
| `xml_file` | **str** | `"swimmer.xml"`| Path to a MuJoCo model |
|
|
||||||
| `forward_reward_weight` | **float** | `1.0` | Weight for *forward_reward* term (see section on reward) |
|
|
||||||
| `ctrl_cost_weight` | **float** | `1e-4` | Weight for *ctrl_cost* term (see section on reward) |
|
|
||||||
| `reset_noise_scale` | **float** | `0.1` | Scale of random perturbations of initial position and velocity (see section on Starting State) |
|
|
||||||
| `exclude_current_positions_from_observation`| **bool** | `True`| Whether or not to omit the x- and y-coordinates from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies |
|
|
||||||
|
|
||||||
|
|
||||||
### Version History
|
|
||||||
|
|
||||||
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
|
||||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
|
||||||
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
|
||||||
* v0: Initial versions release (1.0.0)
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
xml_file="swimmer.xml",
|
xml_file="swimmer.xml",
|
||||||
@@ -141,7 +28,7 @@ class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
|||||||
exclude_current_positions_from_observation
|
exclude_current_positions_from_observation
|
||||||
)
|
)
|
||||||
|
|
||||||
mujoco_env.MujocoEnv.__init__(self, xml_file, 4)
|
mujoco_env.MujocoEnv.__init__(self, xml_file, 4, mujoco_bindings="mujoco_py")
|
||||||
|
|
||||||
def control_cost(self, action):
|
def control_cost(self, action):
|
||||||
control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
|
control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
|
||||||
|
217
gym/envs/mujoco/swimmer_v4.py
Normal file
217
gym/envs/mujoco/swimmer_v4.py
Normal file
@@ -0,0 +1,217 @@
|
|||||||
|
__credits__ = ["Rushiv Arora"]
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from gym import utils
|
||||||
|
from gym.envs.mujoco import mujoco_env
|
||||||
|
|
||||||
|
DEFAULT_CAMERA_CONFIG = {}
|
||||||
|
|
||||||
|
|
||||||
|
class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
|
"""
|
||||||
|
### Description
|
||||||
|
|
||||||
|
This environment corresponds to the Swimmer environment described in Rémi Coulom's PhD thesis
|
||||||
|
["Reinforcement Learning Using Neural Networks, with Applications to Motor Control"](https://tel.archives-ouvertes.fr/tel-00003985/document).
|
||||||
|
The environment aims to increase the number of independent state and control
|
||||||
|
variables as compared to the classic control environments. The swimmers
|
||||||
|
consist of three or more segments ('***links***') and one less articulation
|
||||||
|
joints ('***rotors***') - one rotor joint connecting exactly two links to
|
||||||
|
form a linear chain. The swimmer is suspended in a two dimensional pool and
|
||||||
|
always starts in the same position (subject to some deviation drawn from an
|
||||||
|
uniform distribution), and the goal is to move as fast as possible towards
|
||||||
|
the right by applying torque on the rotors and using the fluids friction.
|
||||||
|
|
||||||
|
### Notes
|
||||||
|
|
||||||
|
The problem parameters are:
|
||||||
|
Problem parameters:
|
||||||
|
* *n*: number of body parts
|
||||||
|
* *m<sub>i</sub>*: mass of part *i* (*i* ∈ {1...n})
|
||||||
|
* *l<sub>i</sub>*: length of part *i* (*i* ∈ {1...n})
|
||||||
|
* *k*: viscous-friction coefficient
|
||||||
|
|
||||||
|
While the default environment has *n* = 3, *l<sub>i</sub>* = 0.1,
|
||||||
|
and *k* = 0.1. It is possible to tweak the MuJoCo XML files to increase the
|
||||||
|
number of links, or to tweak any of the parameters.
|
||||||
|
|
||||||
|
### Action Space
|
||||||
|
    The agent takes a 2-element vector for actions.
|
||||||
|
The action space is a continuous `(action, action)` in `[-1, 1]`, where
|
||||||
|
`action` represents the numerical torques applied between *links*
|
||||||
|
|
||||||
|
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||||
|
|-----|------------------------------------|-------------|-------------|----------------------------------|-------|--------------|
|
||||||
|
| 0 | Torque applied on the first rotor | -1 | 1 | rot2 | hinge | torque (N m) |
|
||||||
|
| 1 | Torque applied on the second rotor | -1 | 1 | rot3 | hinge | torque (N m) |
|
||||||
|
|
||||||
|
### Observation Space
|
||||||
|
|
||||||
|
The state space consists of:
|
||||||
|
* A<sub>0</sub>: position of first point
|
||||||
|
* θ<sub>i</sub>: angle of part *i* with respect to the *x* axis
|
||||||
|
* A<sub>0</sub>, θ<sub>i</sub>: their derivatives with respect to time (velocity and angular velocity)
|
||||||
|
|
||||||
|
The observation is a `ndarray` with shape `(8,)` where the elements correspond to the following:
|
||||||
|
|
||||||
|
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
||||||
|
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
||||||
|
| 0 | x-coordinate of the front tip | -Inf | Inf | slider1 | slide | position (m) |
|
||||||
|
| 1 | y-coordinate of the front tip | -Inf | Inf | slider2 | slide | position (m) |
|
||||||
|
| 2 | angle of the front tip | -Inf | Inf | rot | hinge | angle (rad) |
|
||||||
|
| 3 | angle of the second rotor | -Inf | Inf | rot2 | hinge | angle (rad) |
|
||||||
|
| 4 | angle of the second rotor | -Inf | Inf | rot3 | hinge | angle (rad) |
|
||||||
|
| 5 | velocity of the tip along the x-axis | -Inf | Inf | slider1 | slide | velocity (m/s) |
|
||||||
|
| 6 | velocity of the tip along the y-axis | -Inf | Inf | slider2 | slide | velocity (m/s) |
|
||||||
|
| 7 | angular velocity of front tip | -Inf | Inf | rot | hinge | angular velocity (rad/s) |
|
||||||
|
| 8 | angular velocity of second rotor | -Inf | Inf | rot2 | hinge | angular velocity (rad/s) |
|
||||||
|
| 9 | angular velocity of third rotor | -Inf | Inf | rot3 | hinge | angular velocity (rad/s) |
|
||||||
|
|
||||||
|
**Note:**
|
||||||
|
In practice (and Gym implementation), the first two positional elements are
|
||||||
|
omitted from the state space since the reward function is calculated based
|
||||||
|
on those values. Therefore, observation space has shape `(8,)` and looks like:
|
||||||
|
|
||||||
|
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
||||||
|
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
||||||
|
| 0 | angle of the front tip | -Inf | Inf | rot | hinge | angle (rad) |
|
||||||
|
| 1 | angle of the second rotor | -Inf | Inf | rot2 | hinge | angle (rad) |
|
||||||
|
| 2 | angle of the second rotor | -Inf | Inf | rot3 | hinge | angle (rad) |
|
||||||
|
| 3 | velocity of the tip along the x-axis | -Inf | Inf | slider1 | slide | velocity (m/s) |
|
||||||
|
| 4 | velocity of the tip along the y-axis | -Inf | Inf | slider2 | slide | velocity (m/s) |
|
||||||
|
| 5 | angular velocity of front tip | -Inf | Inf | rot | hinge | angular velocity (rad/s) |
|
||||||
|
| 6 | angular velocity of second rotor | -Inf | Inf | rot2 | hinge | angular velocity (rad/s) |
|
||||||
|
| 7 | angular velocity of third rotor | -Inf | Inf | rot3 | hinge | angular velocity (rad/s) |
|
||||||
|
|
||||||
|
### Rewards
|
||||||
|
The reward consists of two parts:
|
||||||
|
- *reward_front*: A reward of moving forward which is measured
|
||||||
|
as *(x-coordinate before action - x-coordinate after action)/dt*. *dt* is
|
||||||
|
    the time between actions and is dependent on the frame_skip parameter
|
||||||
|
(default is 4), where the *dt* for one frame is 0.01 - making the
|
||||||
|
default *dt = 4 * 0.01 = 0.04*. This reward would be positive if the swimmer
|
||||||
|
swims right as desired.
|
||||||
|
- *reward_control*: A negative reward for penalising the swimmer if it takes
|
||||||
|
actions that are too large. It is measured as *-coefficient x
|
||||||
|
sum(action<sup>2</sup>)* where *coefficient* is a parameter set for the
|
||||||
|
control and has a default value of 0.0001
|
||||||
|
|
||||||
|
The total reward returned is ***reward*** *=* *reward_front + reward_control*
|
||||||
|
|
||||||
|
### Starting State
|
||||||
|
All observations start in state (0,0,0,0,0,0,0,0) with a Uniform noise in the range of [-0.1, 0.1] is added to the initial state for stochasticity.
|
||||||
|
|
||||||
|
### Episode Termination
|
||||||
|
The episode terminates when the episode length is greater than 1000.
|
||||||
|
|
||||||
|
### Arguments
|
||||||
|
|
||||||
|
No additional arguments are currently supported (in v2 and lower), but
|
||||||
|
modifications can be made to the XML file in the assets folder
|
||||||
|
(or by changing the path to a modified XML file in another folder).
|
||||||
|
|
||||||
|
```
|
||||||
|
gym.make('Swimmer-v2')
|
||||||
|
```
|
||||||
|
|
||||||
|
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||||
|
|
||||||
|
```
|
||||||
|
env = gym.make('Swimmer-v3', ctrl_cost_weight=0.1, ....)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Version History
|
||||||
|
|
||||||
|
    * v4: all mujoco environments now use the mujoco bindings in mujoco>=2.1.3
|
||||||
|
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
||||||
|
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||||
|
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
||||||
|
* v0: Initial versions release (1.0.0)
|
||||||
|
"""
|
||||||
|
|
||||||
|
    def __init__(
        self,
        xml_file="swimmer.xml",
        forward_reward_weight=1.0,
        ctrl_cost_weight=1e-4,
        reset_noise_scale=0.1,
        exclude_current_positions_from_observation=True,
    ):
        """Create a Swimmer environment.

        Args:
            xml_file: Path to the MuJoCo model XML loaded for the simulation.
            forward_reward_weight: Weight of the forward (x-velocity) reward term.
            ctrl_cost_weight: Weight of the quadratic control-cost penalty.
            reset_noise_scale: Half-width of the uniform noise added to the
                initial qpos/qvel on reset.
            exclude_current_positions_from_observation: If True (default), the
                x/y coordinates are dropped from the observation vector.
        """
        # NOTE: EzPickle captures the constructor arguments via locals(); this
        # call must stay first, before any local variable is introduced.
        utils.EzPickle.__init__(**locals())

        self._forward_reward_weight = forward_reward_weight
        self._ctrl_cost_weight = ctrl_cost_weight

        self._reset_noise_scale = reset_noise_scale

        self._exclude_current_positions_from_observation = (
            exclude_current_positions_from_observation
        )

        # frame_skip=4: each env step advances the simulation by 4 frames.
        mujoco_env.MujocoEnv.__init__(self, xml_file, 4)
|
||||||
|
|
||||||
|
def control_cost(self, action):
|
||||||
|
control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
|
||||||
|
return control_cost
|
||||||
|
|
||||||
|
def step(self, action):
|
||||||
|
xy_position_before = self.data.qpos[0:2].copy()
|
||||||
|
self.do_simulation(action, self.frame_skip)
|
||||||
|
xy_position_after = self.data.qpos[0:2].copy()
|
||||||
|
|
||||||
|
xy_velocity = (xy_position_after - xy_position_before) / self.dt
|
||||||
|
x_velocity, y_velocity = xy_velocity
|
||||||
|
|
||||||
|
forward_reward = self._forward_reward_weight * x_velocity
|
||||||
|
|
||||||
|
ctrl_cost = self.control_cost(action)
|
||||||
|
|
||||||
|
observation = self._get_obs()
|
||||||
|
reward = forward_reward - ctrl_cost
|
||||||
|
done = False
|
||||||
|
info = {
|
||||||
|
"reward_fwd": forward_reward,
|
||||||
|
"reward_ctrl": -ctrl_cost,
|
||||||
|
"x_position": xy_position_after[0],
|
||||||
|
"y_position": xy_position_after[1],
|
||||||
|
"distance_from_origin": np.linalg.norm(xy_position_after, ord=2),
|
||||||
|
"x_velocity": x_velocity,
|
||||||
|
"y_velocity": y_velocity,
|
||||||
|
"forward_reward": forward_reward,
|
||||||
|
}
|
||||||
|
|
||||||
|
return observation, reward, done, info
|
||||||
|
|
||||||
|
def _get_obs(self):
|
||||||
|
position = self.data.qpos.flat.copy()
|
||||||
|
velocity = self.data.qvel.flat.copy()
|
||||||
|
|
||||||
|
if self._exclude_current_positions_from_observation:
|
||||||
|
position = position[2:]
|
||||||
|
|
||||||
|
observation = np.concatenate([position, velocity]).ravel()
|
||||||
|
return observation
|
||||||
|
|
||||||
|
def reset_model(self):
|
||||||
|
noise_low = -self._reset_noise_scale
|
||||||
|
noise_high = self._reset_noise_scale
|
||||||
|
|
||||||
|
qpos = self.init_qpos + self.np_random.uniform(
|
||||||
|
low=noise_low, high=noise_high, size=self.model.nq
|
||||||
|
)
|
||||||
|
qvel = self.init_qvel + self.np_random.uniform(
|
||||||
|
low=noise_low, high=noise_high, size=self.model.nv
|
||||||
|
)
|
||||||
|
|
||||||
|
self.set_state(qpos, qvel)
|
||||||
|
|
||||||
|
observation = self._get_obs()
|
||||||
|
return observation
|
||||||
|
|
||||||
|
def viewer_setup(self):
|
||||||
|
for key, value in DEFAULT_CAMERA_CONFIG.items():
|
||||||
|
if isinstance(value, np.ndarray):
|
||||||
|
getattr(self.viewer.cam, key)[:] = value
|
||||||
|
else:
|
||||||
|
setattr(self.viewer.cam, key, value)
|
@@ -6,7 +6,9 @@ from gym.envs.mujoco import mujoco_env
|
|||||||
|
|
||||||
class Walker2dEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
class Walker2dEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
mujoco_env.MujocoEnv.__init__(self, "walker2d.xml", 4)
|
mujoco_env.MujocoEnv.__init__(
|
||||||
|
self, "walker2d.xml", 4, mujoco_bindings="mujoco_py"
|
||||||
|
)
|
||||||
utils.EzPickle.__init__(self)
|
utils.EzPickle.__init__(self)
|
||||||
|
|
||||||
def step(self, a):
|
def step(self, a):
|
||||||
|
@@ -12,137 +12,6 @@ DEFAULT_CAMERA_CONFIG = {
|
|||||||
|
|
||||||
|
|
||||||
class Walker2dEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
class Walker2dEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
"""
|
|
||||||
### Description
|
|
||||||
|
|
||||||
This environment builds on the hopper environment based on the work done by Erez, Tassa, and Todorov
|
|
||||||
in ["Infinite Horizon Model Predictive Control for Nonlinear Periodic Tasks"](http://www.roboticsproceedings.org/rss07/p10.pdf)
|
|
||||||
by adding another set of legs making it possible for the robot to walk forward instead of
|
|
||||||
hop. Like other Mujoco environments, this environment aims to increase the number of independent state
|
|
||||||
and control variables as compared to the classic control environments. The walker is a
|
|
||||||
two-dimensional two-legged figure that consist of four main body parts - a single torso at the top
|
|
||||||
(with the two legs splitting after the torso), two thighs in the middle below the torso, two legs
|
|
||||||
in the bottom below the thighs, and two feet attached to the legs on which the entire body rests.
|
|
||||||
The goal is to make coordinate both sets of feet, legs, and thighs to move in the forward (right)
|
|
||||||
direction by applying torques on the six hinges connecting the six body parts.
|
|
||||||
|
|
||||||
### Action Space
|
|
||||||
The action space is a `Box(-1, 1, (6,), float32)`. An action represents the torques applied at the hinge joints.
|
|
||||||
|
|
||||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
|
||||||
|-----|----------------------------------------|-------------|-------------|----------------------------------|-------|--------------|
|
|
||||||
| 0 | Torque applied on the thigh rotor | -1 | 1 | thigh_joint | hinge | torque (N m) |
|
|
||||||
| 1 | Torque applied on the leg rotor | -1 | 1 | leg_joint | hinge | torque (N m) |
|
|
||||||
| 2 | Torque applied on the foot rotor | -1 | 1 | foot_joint | hinge | torque (N m) |
|
|
||||||
| 3 | Torque applied on the left thigh rotor | -1 | 1 | thigh_left_joint | hinge | torque (N m) |
|
|
||||||
| 4 | Torque applied on the left leg rotor | -1 | 1 | leg_left_joint | hinge | torque (N m) |
|
|
||||||
| 5 | Torque applied on the left foot rotor | -1 | 1 | foot_left_joint | hinge | torque (N m) |
|
|
||||||
|
|
||||||
### Observation Space
|
|
||||||
|
|
||||||
Observations consist of positional values of different body parts of the walker,
|
|
||||||
followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities.
|
|
||||||
|
|
||||||
By default, observations do not include the x-coordinate of the top. It may
|
|
||||||
be included by passing `exclude_current_positions_from_observation=False` during construction.
|
|
||||||
In that case, the observation space will have 18 dimensions where the first dimension
|
|
||||||
represent the x-coordinates of the top of the walker.
|
|
||||||
Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x-coordinate
|
|
||||||
of the top will be returned in `info` with key `"x_position"`.
|
|
||||||
|
|
||||||
By default, observation is a `ndarray` with shape `(17,)` where the elements correspond to the following:
|
|
||||||
|
|
||||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|
|
||||||
|-----|--------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
|
|
||||||
| 0 | z-coordinate of the top (height of hopper) | -Inf | Inf | rootz (torso) | slide | position (m) |
|
|
||||||
| 1 | angle of the top | -Inf | Inf | rooty (torso) | hinge | angle (rad) |
|
|
||||||
| 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) |
|
|
||||||
| 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) |
|
|
||||||
| 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) |
|
|
||||||
| 5 | angle of the left thigh joint | -Inf | Inf | thigh_left_joint | hinge | angle (rad) |
|
|
||||||
| 6 | angle of the left leg joint | -Inf | Inf | leg_left_joint | hinge | angle (rad) |
|
|
||||||
| 7 | angle of the left foot joint | -Inf | Inf | foot_left_joint | hinge | angle (rad) |
|
|
||||||
| 8 | velocity of the x-coordinate of the top | -Inf | Inf | rootx | slide | velocity (m/s) |
|
|
||||||
| 9 | velocity of the z-coordinate (height) of the top | -Inf | Inf | rootz | slide | velocity (m/s) |
|
|
||||||
| 10 | angular velocity of the angle of the top | -Inf | Inf | rooty | hinge | angular velocity (rad/s) |
|
|
||||||
| 11 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) |
|
|
||||||
| 12 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) |
|
|
||||||
| 13 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) |
|
|
||||||
| 14 | angular velocity of the thigh hinge | -Inf | Inf | thigh_left_joint | hinge | angular velocity (rad/s) |
|
|
||||||
| 15 | angular velocity of the leg hinge | -Inf | Inf | leg_left_joint | hinge | angular velocity (rad/s) |
|
|
||||||
| 16 | angular velocity of the foot hinge | -Inf | Inf | foot_left_joint | hinge | angular velocity (rad/s) |
|
|
||||||
|
|
||||||
### Rewards
|
|
||||||
The reward consists of three parts:
|
|
||||||
- *healthy_reward*: Every timestep that the walker is alive, it receives a fixed reward of value `healthy_reward`,
|
|
||||||
- *forward_reward*: A reward of walking forward which is measured as
|
|
||||||
*`forward_reward_weight` * (x-coordinate before action - x-coordinate after action)/dt*.
|
|
||||||
*dt* is the time between actions and is dependent on the frame_skip parameter
|
|
||||||
(default is 4), where the frametime is 0.002 - making the default
|
|
||||||
*dt = 4 * 0.002 = 0.008*. This reward would be positive if the walker walks forward (right) as desired.
|
|
||||||
- *ctrl_cost*: A cost for penalising the walker if it
|
|
||||||
takes actions that are too large. It is measured as
|
|
||||||
*`ctrl_cost_weight` * sum(action<sup>2</sup>)* where *`ctrl_cost_weight`* is
|
|
||||||
a parameter set for the control and has a default value of 0.001
|
|
||||||
|
|
||||||
The total reward returned is ***reward*** *=* *healthy_reward bonus + forward_reward - ctrl_cost* and `info` will also contain the individual reward terms
|
|
||||||
|
|
||||||
### Starting State
|
|
||||||
All observations start in state
|
|
||||||
(0.0, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
|
|
||||||
with a uniform noise in the range of [-`reset_noise_scale`, `reset_noise_scale`] added to the values for stochasticity.
|
|
||||||
|
|
||||||
### Episode Termination
|
|
||||||
The walker is said to be unhealthy if any of the following happens:
|
|
||||||
|
|
||||||
1. Any of the state space values is no longer finite
|
|
||||||
2. The height of the walker is ***not*** in the closed interval specified by `healthy_z_range`
|
|
||||||
3. The absolute value of the angle (`observation[1]` if `exclude_current_positions_from_observation=False`, else `observation[2]`) is ***not*** in the closed interval specified by `healthy_angle_range`
|
|
||||||
|
|
||||||
If `terminate_when_unhealthy=True` is passed during construction (which is the default),
|
|
||||||
the episode terminates when any of the following happens:
|
|
||||||
|
|
||||||
1. The episode duration reaches a 1000 timesteps
|
|
||||||
2. The walker is unhealthy
|
|
||||||
|
|
||||||
If `terminate_when_unhealthy=False` is passed, the episode is terminated only when 1000 timesteps are exceeded.
|
|
||||||
|
|
||||||
### Arguments
|
|
||||||
|
|
||||||
No additional arguments are currently supported in v2 and lower.
|
|
||||||
|
|
||||||
```
|
|
||||||
env = gym.make('Walker2d-v2')
|
|
||||||
```
|
|
||||||
|
|
||||||
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
|
||||||
|
|
||||||
```
|
|
||||||
env = gym.make('Walker2d-v3', ctrl_cost_weight=0.1, ....)
|
|
||||||
```
|
|
||||||
|
|
||||||
| Parameter | Type | Default |Description |
|
|
||||||
|-------------------------|------------|--------------|-------------------------------|
|
|
||||||
| `xml_file` | **str** | `"walker2d.xml"` | Path to a MuJoCo model |
|
|
||||||
| `forward_reward_weight` | **float** | `1.0` | Weight for *forward_reward* term (see section on reward) |
|
|
||||||
| `ctrl_cost_weight` | **float** | `1e-3` | Weight for *ctr_cost* term (see section on reward) |
|
|
||||||
| `healthy_reward` | **float** | `1.0` | Constant reward given if the ant is "healthy" after timestep |
|
|
||||||
| `terminate_when_unhealthy` | **bool**| `True` | If true, issue a done signal if the z-coordinate of the walker is no longer healthy |
|
|
||||||
| `healthy_z_range` | **tuple** | `(0.8, 2)` | The z-coordinate of the top of the walker must be in this range to be considered healthy |
|
|
||||||
| `healthy_angle_range` | **tuple** | `(-1, 1)` | The angle must be in this range to be considered healthy|
|
|
||||||
| `reset_noise_scale` | **float** | `5e-3` | Scale of random perturbations of initial position and velocity (see section on Starting State) |
|
|
||||||
| `exclude_current_positions_from_observation`| **bool** | `True`| Whether or not to omit the x-coordinate from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies |
|
|
||||||
|
|
||||||
|
|
||||||
### Version History
|
|
||||||
|
|
||||||
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
|
||||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
|
||||||
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
|
||||||
* v0: Initial versions release (1.0.0)
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
xml_file="walker2d.xml",
|
xml_file="walker2d.xml",
|
||||||
@@ -172,7 +41,7 @@ class Walker2dEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
|||||||
exclude_current_positions_from_observation
|
exclude_current_positions_from_observation
|
||||||
)
|
)
|
||||||
|
|
||||||
mujoco_env.MujocoEnv.__init__(self, xml_file, 4)
|
mujoco_env.MujocoEnv.__init__(self, xml_file, 4, mujoco_bindings="mujoco_py")
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def healthy_reward(self):
|
def healthy_reward(self):
|
||||||
|
276
gym/envs/mujoco/walker2d_v4.py
Normal file
276
gym/envs/mujoco/walker2d_v4.py
Normal file
@@ -0,0 +1,276 @@
|
|||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from gym import utils
|
||||||
|
from gym.envs.mujoco import mujoco_env
|
||||||
|
|
||||||
|
DEFAULT_CAMERA_CONFIG = {
|
||||||
|
"trackbodyid": 2,
|
||||||
|
"distance": 4.0,
|
||||||
|
"lookat": np.array((0.0, 0.0, 1.15)),
|
||||||
|
"elevation": -20.0,
|
||||||
|
}
|
||||||
|
|
||||||
|
DEFAULT_CAMERA_CONFIG = {
|
||||||
|
"trackbodyid": 2,
|
||||||
|
"distance": 4.0,
|
||||||
|
"lookat": np.array((0.0, 0.0, 1.15)),
|
||||||
|
"elevation": -20.0,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class Walker2dEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
|
"""
|
||||||
|
### Description
|
||||||
|
|
||||||
|
This environment builds on the hopper environment based on the work done by Erez, Tassa, and Todorov
|
||||||
|
in ["Infinite Horizon Model Predictive Control for Nonlinear Periodic Tasks"](http://www.roboticsproceedings.org/rss07/p10.pdf)
|
||||||
|
by adding another set of legs making it possible for the robot to walk forward instead of
|
||||||
|
hop. Like other Mujoco environments, this environment aims to increase the number of independent state
|
||||||
|
and control variables as compared to the classic control environments. The walker is a
|
||||||
|
two-dimensional two-legged figure that consist of four main body parts - a single torso at the top
|
||||||
|
(with the two legs splitting after the torso), two thighs in the middle below the torso, two legs
|
||||||
|
in the bottom below the thighs, and two feet attached to the legs on which the entire body rests.
|
||||||
|
The goal is to make coordinate both sets of feet, legs, and thighs to move in the forward (right)
|
||||||
|
direction by applying torques on the six hinges connecting the six body parts.
|
||||||
|
|
||||||
|
### Action Space
|
||||||
|
The agent take a 6-element vector for actions.
|
||||||
|
The action space is a continuous `(action, action, action, action, action, action)`
|
||||||
|
all in `[-1, 1]`, where `action` represents the numerical torques applied at the hinge joints.
|
||||||
|
|
||||||
|
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||||
|
|-----|----------------------------------------|-------------|-------------|----------------------------------|-------|--------------|
|
||||||
|
| 0 | Torque applied on the thigh rotor | -1 | 1 | thigh_joint | hinge | torque (N m) |
|
||||||
|
| 1 | Torque applied on the leg rotor | -1 | 1 | leg_joint | hinge | torque (N m) |
|
||||||
|
| 2 | Torque applied on the foot rotor | -1 | 1 | foot_joint | hinge | torque (N m) |
|
||||||
|
| 3 | Torque applied on the left thigh rotor | -1 | 1 | thigh_left_joint | hinge | torque (N m) |
|
||||||
|
| 4 | Torque applied on the left leg rotor | -1 | 1 | leg_left_joint | hinge | torque (N m) |
|
||||||
|
| 5 | Torque applied on the left foot rotor | -1 | 1 | foot_left_joint | hinge | torque (N m) |
|
||||||
|
|
||||||
|
### Observation Space
|
||||||
|
|
||||||
|
The state space consists of positional values of different body parts of the walker,
|
||||||
|
followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities.
|
||||||
|
|
||||||
|
The observation is a `ndarray` with shape `(17,)` where the elements correspond to the following:
|
||||||
|
|
||||||
|
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|
||||||
|
|-----|--------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
|
||||||
|
| 0 | x-coordinate of the top | -Inf | Inf | rootx (torso) | slide | position (m) |
|
||||||
|
| 1 | z-coordinate of the top (height of hopper) | -Inf | Inf | rootz (torso) | slide | position (m) |
|
||||||
|
| 2 | angle of the top | -Inf | Inf | rooty (torso) | hinge | angle (rad) |
|
||||||
|
| 3 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) |
|
||||||
|
| 4 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) |
|
||||||
|
| 5 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) |
|
||||||
|
| 6 | angle of the left thigh joint | -Inf | Inf | thigh_left_joint | hinge | angle (rad) |
|
||||||
|
| 7 | angle of the left leg joint | -Inf | Inf | leg_left_joint | hinge | angle (rad) |
|
||||||
|
| 8 | angle of the left foot joint | -Inf | Inf | foot_left_joint | hinge | angle (rad) |
|
||||||
|
| 9 | velocity of the x-coordinate of the top | -Inf | Inf | rootx | slide | velocity (m/s) |
|
||||||
|
| 10 | velocity of the z-coordinate (height) of the top | -Inf | Inf | rootz | slide | velocity (m/s) |
|
||||||
|
| 11 | angular velocity of the angle of the top | -Inf | Inf | rooty | hinge | angular velocity (rad/s) |
|
||||||
|
| 12 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 13 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 14 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 15 | angular velocity of the thigh hinge | -Inf | Inf | thigh_left_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 16 | angular velocity of the leg hinge | -Inf | Inf | leg_left_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 17 | angular velocity of the foot hinge | -Inf | Inf | foot_left_joint | hinge | angular velocity (rad/s) |
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
**Note:**
|
||||||
|
In practice (and Gym implementation), the first positional element is omitted from the
|
||||||
|
state space since the reward function is calculated based on that value. This value is
|
||||||
|
hidden from the algorithm, which in turn has to develop an abstract understanding of it
|
||||||
|
from the observed rewards. Therefore, observation space has shape `(17,)`
|
||||||
|
instead of `(18,)` and looks like:
|
||||||
|
|
||||||
|
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|
||||||
|
|-----|--------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
|
||||||
|
| 0 | z-coordinate of the top (height of hopper) | -Inf | Inf | rootz (torso) | slide | position (m) |
|
||||||
|
| 1 | angle of the top | -Inf | Inf | rooty (torso) | hinge | angle (rad) |
|
||||||
|
| 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) |
|
||||||
|
| 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) |
|
||||||
|
| 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) |
|
||||||
|
| 5 | angle of the left thigh joint | -Inf | Inf | thigh_left_joint | hinge | angle (rad) |
|
||||||
|
| 6 | angle of the left leg joint | -Inf | Inf | leg_left_joint | hinge | angle (rad) |
|
||||||
|
| 7 | angle of the left foot joint | -Inf | Inf | foot_left_joint | hinge | angle (rad) |
|
||||||
|
| 8 | velocity of the x-coordinate of the top | -Inf | Inf | rootx | slide | velocity (m/s) |
|
||||||
|
| 9 | velocity of the z-coordinate (height) of the top | -Inf | Inf | rootz | slide | velocity (m/s) |
|
||||||
|
| 10 | angular velocity of the angle of the top | -Inf | Inf | rooty | hinge | angular velocity (rad/s) |
|
||||||
|
| 11 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 12 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 13 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 14 | angular velocity of the thigh hinge | -Inf | Inf | thigh_left_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 15 | angular velocity of the leg hinge | -Inf | Inf | leg_left_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 16 | angular velocity of the foot hinge | -Inf | Inf | foot_left_joint | hinge | angular velocity (rad/s) |
|
||||||
|
|
||||||
|
### Rewards
|
||||||
|
The reward consists of three parts:
|
||||||
|
- *alive bonus*: Every timestep that the walker is alive, it gets a reward of 1,
|
||||||
|
- *reward_forward*: A reward of walking forward which is measured as
|
||||||
|
*(x-coordinate before action - x-coordinate after action)/dt*.
|
||||||
|
*dt* is the time between actions and is dependent on the frame_skip parameter
|
||||||
|
(default is 4), where the *dt* for one frame is 0.002 - making the default
|
||||||
|
*dt = 4 * 0.002 = 0.008*. This reward would be positive if the walker walks forward (right) as desired.
|
||||||
|
- *reward_control*: A negative reward for penalising the walker if it
|
||||||
|
takes actions that are too large. It is measured as
|
||||||
|
*-coefficient **x** sum(action<sup>2</sup>)* where *coefficient* is
|
||||||
|
a parameter set for the control and has a default value of 0.001
|
||||||
|
|
||||||
|
The total reward returned is ***reward*** *=* *alive bonus + reward_forward + reward_control*
|
||||||
|
|
||||||
|
### Starting State
|
||||||
|
All observations start in state
|
||||||
|
(0.0, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
|
||||||
|
with a uniform noise in the range of [-0.005, 0.005] added to the values for stochasticity.
|
||||||
|
|
||||||
|
### Episode Termination
|
||||||
|
The episode terminates when any of the following happens:
|
||||||
|
|
||||||
|
1. The episode duration reaches a 1000 timesteps
|
||||||
|
2. Any of the state space values is no longer finite
|
||||||
|
3. The height of the walker (index 1) is ***not*** in the range `[0.8, 2]`
|
||||||
|
4. The absolute value of the angle (index 2) is ***not*** in the range `[-1, 1]`
|
||||||
|
|
||||||
|
### Arguments
|
||||||
|
|
||||||
|
No additional arguments are currently supported (in v2 and lower),
|
||||||
|
but modifications can be made to the XML file in the assets folder
|
||||||
|
(or by changing the path to a modified XML file in another folder).
|
||||||
|
|
||||||
|
```
|
||||||
|
env = gym.make('Walker2d-v2')
|
||||||
|
```
|
||||||
|
|
||||||
|
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||||
|
|
||||||
|
```
|
||||||
|
env = gym.make('Walker2d-v3', ctrl_cost_weight=0.1, ....)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Version History
|
||||||
|
|
||||||
|
* v4: all MuJoCo environments now use the MuJoCo bindings in mujoco>=2.1.3
|
||||||
|
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
||||||
|
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||||
|
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
||||||
|
* v0: Initial versions release (1.0.0)
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
    def __init__(
        self,
        xml_file="walker2d.xml",
        forward_reward_weight=1.0,
        ctrl_cost_weight=1e-3,
        healthy_reward=1.0,
        terminate_when_unhealthy=True,
        healthy_z_range=(0.8, 2.0),
        healthy_angle_range=(-1.0, 1.0),
        reset_noise_scale=5e-3,
        exclude_current_positions_from_observation=True,
    ):
        """Create a Walker2d environment.

        Args:
            xml_file: Path to the MuJoCo model XML loaded for the simulation.
            forward_reward_weight: Weight of the forward (x-velocity) reward term.
            ctrl_cost_weight: Weight of the quadratic control-cost penalty.
            healthy_reward: Constant alive bonus granted per step while healthy.
            terminate_when_unhealthy: If True, end the episode when the walker
                leaves the healthy z/angle ranges.
            healthy_z_range: (min, max) torso height considered healthy.
            healthy_angle_range: (min, max) torso angle considered healthy.
            reset_noise_scale: Half-width of the uniform noise added to the
                initial qpos/qvel on reset.
            exclude_current_positions_from_observation: If True (default), the
                x coordinate is dropped from the observation vector.
        """
        # NOTE: EzPickle captures the constructor arguments via locals(); this
        # call must stay first, before any local variable is introduced.
        utils.EzPickle.__init__(**locals())

        self._forward_reward_weight = forward_reward_weight
        self._ctrl_cost_weight = ctrl_cost_weight

        self._healthy_reward = healthy_reward
        self._terminate_when_unhealthy = terminate_when_unhealthy

        self._healthy_z_range = healthy_z_range
        self._healthy_angle_range = healthy_angle_range

        self._reset_noise_scale = reset_noise_scale

        self._exclude_current_positions_from_observation = (
            exclude_current_positions_from_observation
        )

        # frame_skip=4: each env step advances the simulation by 4 frames.
        mujoco_env.MujocoEnv.__init__(self, xml_file, 4)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def healthy_reward(self):
|
||||||
|
return (
|
||||||
|
float(self.is_healthy or self._terminate_when_unhealthy)
|
||||||
|
* self._healthy_reward
|
||||||
|
)
|
||||||
|
|
||||||
|
def control_cost(self, action):
|
||||||
|
control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
|
||||||
|
return control_cost
|
||||||
|
|
||||||
|
@property
|
||||||
|
def is_healthy(self):
|
||||||
|
z, angle = self.data.qpos[1:3]
|
||||||
|
|
||||||
|
min_z, max_z = self._healthy_z_range
|
||||||
|
min_angle, max_angle = self._healthy_angle_range
|
||||||
|
|
||||||
|
healthy_z = min_z < z < max_z
|
||||||
|
healthy_angle = min_angle < angle < max_angle
|
||||||
|
is_healthy = healthy_z and healthy_angle
|
||||||
|
|
||||||
|
return is_healthy
|
||||||
|
|
||||||
|
@property
|
||||||
|
def done(self):
|
||||||
|
done = not self.is_healthy if self._terminate_when_unhealthy else False
|
||||||
|
return done
|
||||||
|
|
||||||
|
def _get_obs(self):
|
||||||
|
position = self.data.qpos.flat.copy()
|
||||||
|
velocity = np.clip(self.data.qvel.flat.copy(), -10, 10)
|
||||||
|
|
||||||
|
if self._exclude_current_positions_from_observation:
|
||||||
|
position = position[1:]
|
||||||
|
|
||||||
|
observation = np.concatenate((position, velocity)).ravel()
|
||||||
|
return observation
|
||||||
|
|
||||||
|
def step(self, action):
|
||||||
|
x_position_before = self.data.qpos[0]
|
||||||
|
self.do_simulation(action, self.frame_skip)
|
||||||
|
x_position_after = self.data.qpos[0]
|
||||||
|
x_velocity = (x_position_after - x_position_before) / self.dt
|
||||||
|
|
||||||
|
ctrl_cost = self.control_cost(action)
|
||||||
|
|
||||||
|
forward_reward = self._forward_reward_weight * x_velocity
|
||||||
|
healthy_reward = self.healthy_reward
|
||||||
|
|
||||||
|
rewards = forward_reward + healthy_reward
|
||||||
|
costs = ctrl_cost
|
||||||
|
|
||||||
|
observation = self._get_obs()
|
||||||
|
reward = rewards - costs
|
||||||
|
done = self.done
|
||||||
|
info = {
|
||||||
|
"x_position": x_position_after,
|
||||||
|
"x_velocity": x_velocity,
|
||||||
|
}
|
||||||
|
|
||||||
|
return observation, reward, done, info
|
||||||
|
|
||||||
|
def reset_model(self):
|
||||||
|
noise_low = -self._reset_noise_scale
|
||||||
|
noise_high = self._reset_noise_scale
|
||||||
|
|
||||||
|
qpos = self.init_qpos + self.np_random.uniform(
|
||||||
|
low=noise_low, high=noise_high, size=self.model.nq
|
||||||
|
)
|
||||||
|
qvel = self.init_qvel + self.np_random.uniform(
|
||||||
|
low=noise_low, high=noise_high, size=self.model.nv
|
||||||
|
)
|
||||||
|
|
||||||
|
self.set_state(qpos, qvel)
|
||||||
|
|
||||||
|
observation = self._get_obs()
|
||||||
|
return observation
|
||||||
|
|
||||||
|
def viewer_setup(self):
|
||||||
|
for key, value in DEFAULT_CAMERA_CONFIG.items():
|
||||||
|
if isinstance(value, np.ndarray):
|
||||||
|
getattr(self.viewer.cam, key)[:] = value
|
||||||
|
else:
|
||||||
|
setattr(self.viewer.cam, key, value)
|
@@ -6,15 +6,14 @@ RUN apt-get -y update && apt-get install -y unzip libglu1-mesa-dev libgl1-mesa-d
|
|||||||
# Download mujoco
|
# Download mujoco
|
||||||
RUN mkdir /root/.mujoco && \
|
RUN mkdir /root/.mujoco && \
|
||||||
cd /root/.mujoco && \
|
cd /root/.mujoco && \
|
||||||
curl -O https://www.roboti.us/download/mjpro150_linux.zip && \
|
wget https://github.com/deepmind/mujoco/releases/download/2.1.0/mujoco210-linux-x86_64.tar.gz &&\
|
||||||
unzip mjpro150_linux.zip && \
|
tar -xf mujoco210-linux-x86_64.tar.gz
|
||||||
echo DUMMY_KEY > /root/.mujoco/mjkey.txt
|
|
||||||
|
|
||||||
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/root/.mujoco/mjpro150/bin
|
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/root/.mujoco/mujoco210/bin
|
||||||
|
|
||||||
COPY . /usr/local/gym/
|
COPY . /usr/local/gym/
|
||||||
WORKDIR /usr/local/gym/
|
WORKDIR /usr/local/gym/
|
||||||
|
|
||||||
RUN pip install .[nomujoco] && pip install -r test_requirements.txt
|
RUN pip install .[noatari] && pip install -r test_requirements.txt
|
||||||
|
|
||||||
ENTRYPOINT ["/usr/local/gym/bin/docker_entrypoint"]
|
ENTRYPOINT ["/usr/local/gym/bin/docker_entrypoint"]
|
||||||
|
@@ -1,7 +1,6 @@
|
|||||||
ale-py~=0.7
|
ale-py~=0.7
|
||||||
opencv-python>=3.0
|
opencv-python>=3.0
|
||||||
box2d-py==2.3.5
|
box2d-py==2.3.5
|
||||||
mujoco_py>=1.50, <2.0
|
|
||||||
scipy>=1.4.1
|
scipy>=1.4.1
|
||||||
numpy>=1.18.0
|
numpy>=1.18.0
|
||||||
pyglet>=1.4.0
|
pyglet>=1.4.0
|
||||||
|
10
setup.py
10
setup.py
@@ -15,19 +15,25 @@ extras = {
|
|||||||
"accept-rom-license": ["autorom[accept-rom-license]~=0.4.2"],
|
"accept-rom-license": ["autorom[accept-rom-license]~=0.4.2"],
|
||||||
"box2d": ["box2d-py==2.3.5", "pygame==2.1.0"],
|
"box2d": ["box2d-py==2.3.5", "pygame==2.1.0"],
|
||||||
"classic_control": ["pygame==2.1.0"],
|
"classic_control": ["pygame==2.1.0"],
|
||||||
"mujoco": ["mujoco_py>=1.50, <2.0"],
|
"mujoco_py": ["mujoco_py<2.2,>=2.1"],
|
||||||
|
"mujoco": ["mujoco==2.1.5", "imageio>=2.14.1"],
|
||||||
"toy_text": ["pygame==2.1.0", "scipy>=1.4.1"],
|
"toy_text": ["pygame==2.1.0", "scipy>=1.4.1"],
|
||||||
"other": ["lz4>=3.1.0", "opencv-python>=3.0", "matplotlib>=3.0"],
|
"other": ["lz4>=3.1.0", "opencv-python>=3.0", "matplotlib>=3.0"],
|
||||||
}
|
}
|
||||||
|
|
||||||
# Meta dependency groups.
|
# Meta dependency groups.
|
||||||
nomujoco_blacklist = {"mujoco", "accept-rom-license", "atari"}
|
nomujoco_blacklist = {"mujoco_py", "mujoco", "accept-rom-license", "atari"}
|
||||||
nomujoco_groups = set(extras.keys()) - nomujoco_blacklist
|
nomujoco_groups = set(extras.keys()) - nomujoco_blacklist
|
||||||
|
|
||||||
extras["nomujoco"] = list(
|
extras["nomujoco"] = list(
|
||||||
itertools.chain.from_iterable(map(lambda group: extras[group], nomujoco_groups))
|
itertools.chain.from_iterable(map(lambda group: extras[group], nomujoco_groups))
|
||||||
)
|
)
|
||||||
|
|
||||||
|
noatari_blacklist = {"accept-rom-license", "atari"}
|
||||||
|
noatari_groups = set(extras.keys()) - noatari_blacklist
|
||||||
|
extras["noatari"] = list(
|
||||||
|
itertools.chain.from_iterable(map(lambda group: extras[group], noatari_groups))
|
||||||
|
)
|
||||||
|
|
||||||
all_blacklist = {"accept-rom-license"}
|
all_blacklist = {"accept-rom-license"}
|
||||||
all_groups = set(extras.keys()) - all_blacklist
|
all_groups = set(extras.keys()) - all_blacklist
|
||||||
|
@@ -1,19 +1,14 @@
|
|||||||
import os
|
|
||||||
|
|
||||||
from gym import envs, logger
|
from gym import envs, logger
|
||||||
|
|
||||||
SKIP_MUJOCO_WARNING_MESSAGE = (
|
SKIP_MUJOCO_V3_WARNING_MESSAGE = (
|
||||||
"Cannot run mujoco test (either license key not found or mujoco not"
|
"Cannot run mujoco test because mujoco-py is not installed"
|
||||||
"installed properly)."
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
skip_mujoco_v3 = False
|
||||||
skip_mujoco = not (os.environ.get("MUJOCO_KEY"))
|
try:
|
||||||
if not skip_mujoco:
|
import mujoco_py # noqa:F401
|
||||||
try:
|
except ImportError:
|
||||||
import mujoco_py # noqa:F401
|
skip_mujoco_v3 = True
|
||||||
except ImportError:
|
|
||||||
skip_mujoco = True
|
|
||||||
|
|
||||||
|
|
||||||
def should_skip_env_spec_for_tests(spec):
|
def should_skip_env_spec_for_tests(spec):
|
||||||
@@ -21,7 +16,7 @@ def should_skip_env_spec_for_tests(spec):
|
|||||||
# troublesome to run frequently
|
# troublesome to run frequently
|
||||||
ep = spec.entry_point
|
ep = spec.entry_point
|
||||||
# Skip mujoco tests for pull request CI
|
# Skip mujoco tests for pull request CI
|
||||||
if skip_mujoco and ep.startswith("gym.envs.mujoco"):
|
if skip_mujoco_v3 and ep.startswith("gym.envs.mujoco"):
|
||||||
return True
|
return True
|
||||||
try:
|
try:
|
||||||
import gym.envs.atari # noqa:F401
|
import gym.envs.atari # noqa:F401
|
||||||
@@ -48,8 +43,19 @@ def should_skip_env_spec_for_tests(spec):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def skip_mujoco_py_env_for_test(spec):
|
||||||
|
ep = spec.entry_point
|
||||||
|
version = spec.version
|
||||||
|
if ep.startswith("gym.envs.mujoco") and version < 4:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
spec_list = [
|
spec_list = [
|
||||||
spec
|
spec
|
||||||
for spec in sorted(envs.registry.values(), key=lambda x: x.id)
|
for spec in sorted(envs.registry.values(), key=lambda x: x.id)
|
||||||
if spec.entry_point is not None and not should_skip_env_spec_for_tests(spec)
|
if spec.entry_point is not None and not should_skip_env_spec_for_tests(spec)
|
||||||
]
|
]
|
||||||
|
spec_list_no_mujoco_py = [
|
||||||
|
spec for spec in spec_list if not skip_mujoco_py_env_for_test(spec)
|
||||||
|
]
|
||||||
|
@@ -8,17 +8,19 @@ from gym.envs.registration import EnvSpec
|
|||||||
from gym.spaces.box import Box
|
from gym.spaces.box import Box
|
||||||
from gym.spaces.discrete import Discrete
|
from gym.spaces.discrete import Discrete
|
||||||
from gym.spaces.space import Space
|
from gym.spaces.space import Space
|
||||||
from tests.envs.spec_list import SKIP_MUJOCO_WARNING_MESSAGE, skip_mujoco, spec_list
|
from tests.envs.spec_list import (
|
||||||
|
SKIP_MUJOCO_V3_WARNING_MESSAGE,
|
||||||
|
skip_mujoco_v3,
|
||||||
|
spec_list,
|
||||||
|
)
|
||||||
|
|
||||||
ENVIRONMENT_IDS = ("HalfCheetah-v2",)
|
ENVIRONMENT_IDS = ("HalfCheetah-v2",)
|
||||||
|
|
||||||
|
|
||||||
def make_envs_by_action_space_type(spec_list: List[EnvSpec], action_space: Space):
|
def make_envs_by_action_space_type(spec_list: List[EnvSpec], action_space: Space):
|
||||||
"""Make environments of specific action_space type.
|
"""Make environments of specific action_space type.
|
||||||
|
|
||||||
This function returns a filtered list of environment from the
|
This function returns a filtered list of environment from the
|
||||||
spec_list that matches the action_space type.
|
spec_list that matches the action_space type.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
spec_list (list): list of registered environments' specification
|
spec_list (list): list of registered environments' specification
|
||||||
action_space (gym.spaces.Space): action_space type
|
action_space (gym.spaces.Space): action_space type
|
||||||
@@ -31,7 +33,7 @@ def make_envs_by_action_space_type(spec_list: List[EnvSpec], action_space: Space
|
|||||||
return filtered_envs
|
return filtered_envs
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(skip_mujoco, reason=SKIP_MUJOCO_WARNING_MESSAGE)
|
@pytest.mark.skipif(skip_mujoco_v3, reason=SKIP_MUJOCO_V3_WARNING_MESSAGE)
|
||||||
@pytest.mark.parametrize("environment_id", ENVIRONMENT_IDS)
|
@pytest.mark.parametrize("environment_id", ENVIRONMENT_IDS)
|
||||||
def test_serialize_deserialize(environment_id):
|
def test_serialize_deserialize(environment_id):
|
||||||
env = envs.make(environment_id)
|
env = envs.make(environment_id)
|
||||||
@@ -47,10 +49,8 @@ def test_serialize_deserialize(environment_id):
|
|||||||
@pytest.mark.parametrize("env", make_envs_by_action_space_type(spec_list, Discrete))
|
@pytest.mark.parametrize("env", make_envs_by_action_space_type(spec_list, Discrete))
|
||||||
def test_discrete_actions_out_of_bound(env):
|
def test_discrete_actions_out_of_bound(env):
|
||||||
"""Test out of bound actions in Discrete action_space.
|
"""Test out of bound actions in Discrete action_space.
|
||||||
|
|
||||||
In discrete action_space environments, `out-of-bound`
|
In discrete action_space environments, `out-of-bound`
|
||||||
actions are not allowed and should raise an exception.
|
actions are not allowed and should raise an exception.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
env (gym.Env): the gym environment
|
env (gym.Env): the gym environment
|
||||||
"""
|
"""
|
||||||
@@ -69,11 +69,9 @@ def test_discrete_actions_out_of_bound(env):
|
|||||||
)
|
)
|
||||||
def test_box_actions_out_of_bound(env, seed):
|
def test_box_actions_out_of_bound(env, seed):
|
||||||
"""Test out of bound actions in Box action_space.
|
"""Test out of bound actions in Box action_space.
|
||||||
|
|
||||||
Environments with Box actions spaces perform clipping inside `step`.
|
Environments with Box actions spaces perform clipping inside `step`.
|
||||||
The expected behaviour is that an action `out-of-bound` has the same effect
|
The expected behaviour is that an action `out-of-bound` has the same effect
|
||||||
of an action with value exactly at the upper (or lower) bound.
|
of an action with value exactly at the upper (or lower) bound.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
env (gym.Env): the gym environment
|
env (gym.Env): the gym environment
|
||||||
seed (int): seed value for determinism
|
seed (int): seed value for determinism
|
||||||
|
@@ -4,7 +4,7 @@ import pytest
|
|||||||
from gym import envs
|
from gym import envs
|
||||||
from gym.spaces import Box
|
from gym.spaces import Box
|
||||||
from gym.utils.env_checker import check_env
|
from gym.utils.env_checker import check_env
|
||||||
from tests.envs.spec_list import spec_list
|
from tests.envs.spec_list import spec_list, spec_list_no_mujoco_py
|
||||||
|
|
||||||
|
|
||||||
# This runs a smoketest on each official registered env. We may want
|
# This runs a smoketest on each official registered env. We may want
|
||||||
@@ -13,7 +13,9 @@ from tests.envs.spec_list import spec_list
|
|||||||
@pytest.mark.filterwarnings(
|
@pytest.mark.filterwarnings(
|
||||||
"ignore:.*We recommend you to use a symmetric and normalized Box action space.*"
|
"ignore:.*We recommend you to use a symmetric and normalized Box action space.*"
|
||||||
)
|
)
|
||||||
@pytest.mark.parametrize("spec", spec_list, ids=[spec.id for spec in spec_list])
|
@pytest.mark.parametrize(
|
||||||
|
"spec", spec_list_no_mujoco_py, ids=[spec.id for spec in spec_list_no_mujoco_py]
|
||||||
|
)
|
||||||
def test_env(spec):
|
def test_env(spec):
|
||||||
# Capture warnings
|
# Capture warnings
|
||||||
with pytest.warns(None) as warnings:
|
with pytest.warns(None) as warnings:
|
||||||
@@ -47,13 +49,15 @@ def test_env(spec):
|
|||||||
assert (
|
assert (
|
||||||
observation.dtype == ob_space.dtype
|
observation.dtype == ob_space.dtype
|
||||||
), f"Step observation dtype: {ob.dtype}, expected: {ob_space.dtype}"
|
), f"Step observation dtype: {ob.dtype}, expected: {ob_space.dtype}"
|
||||||
|
|
||||||
for mode in env.metadata.get("render_modes", []):
|
for mode in env.metadata.get("render_modes", []):
|
||||||
env.render(mode=mode)
|
if not (mode == "human" and spec.entry_point.startswith("gym.envs.mujoco")):
|
||||||
|
env.render(mode=mode)
|
||||||
|
|
||||||
# Make sure we can render the environment after close.
|
# Make sure we can render the environment after close.
|
||||||
for mode in env.metadata.get("render_modes", []):
|
for mode in env.metadata.get("render_modes", []):
|
||||||
env.render(mode=mode)
|
if not (mode == "human" and spec.entry_point.startswith("gym.envs.mujoco")):
|
||||||
|
|
||||||
|
env.render(mode=mode)
|
||||||
|
|
||||||
env.close()
|
env.close()
|
||||||
|
|
||||||
|
@@ -3,7 +3,7 @@ import unittest
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from gym import envs
|
from gym import envs
|
||||||
from tests.envs.spec_list import SKIP_MUJOCO_WARNING_MESSAGE, skip_mujoco
|
from tests.envs.spec_list import SKIP_MUJOCO_V3_WARNING_MESSAGE, skip_mujoco_v3
|
||||||
|
|
||||||
|
|
||||||
def verify_environments_match(
|
def verify_environments_match(
|
||||||
@@ -31,7 +31,7 @@ def verify_environments_match(
|
|||||||
np.testing.assert_allclose(old_info[key], new_info[key], atol=eps)
|
np.testing.assert_allclose(old_info[key], new_info[key], atol=eps)
|
||||||
|
|
||||||
|
|
||||||
@unittest.skipIf(skip_mujoco, SKIP_MUJOCO_WARNING_MESSAGE)
|
@unittest.skipIf(skip_mujoco_v3, SKIP_MUJOCO_V3_WARNING_MESSAGE)
|
||||||
class Mujocov2Tov3ConversionTest(unittest.TestCase):
|
class Mujocov2Tov3ConversionTest(unittest.TestCase):
|
||||||
def test_environments_match(self):
|
def test_environments_match(self):
|
||||||
test_cases = (
|
test_cases = (
|
||||||
|
Reference in New Issue
Block a user