Add new MuJoCo bindings (#2762)
Committed via GitHub. Parent: 8f9b62f6a6. Commit: 3e006f3ea5.

1 .github/workflows/build.yml (vendored)
@@ -11,7 +11,6 @@ jobs:
      - uses: actions/checkout@v2
      - run: |
          docker build -f py.Dockerfile \
            --build-arg MUJOCO_KEY=$MUJOCO_KEY \
            --build-arg PYTHON_VERSION=${{ matrix.python-version }} \
            --tag gym-docker .
      - name: Run tests
@@ -9,7 +9,7 @@ repos:
      hooks:
        - id: codespell
          args:
            - --ignore-words-list=nd,reacher,thist,ths
            - --ignore-words-list=nd,reacher,thist,ths, ure, referenc
  - repo: https://gitlab.com/PyCQA/flake8
    rev: 4.0.1
    hooks:
@@ -46,6 +46,11 @@ env.close()

Gym keeps strict versioning for reproducibility reasons. All environments end in a suffix like "\_v0". When changes are made to environments that might impact learning results, the number is increased by one to prevent potential confusion.

## MuJoCo Environments

The latest "\_v4" and future versions of the MuJoCo environments will no longer depend on `mujoco-py`. Instead, `mujoco` will be the required dependency for future Gym MuJoCo environment versions. Old Gym MuJoCo environment versions that depend on `mujoco-py` will remain available but will no longer be maintained.
To install the dependencies for the latest Gym MuJoCo environments, use `pip install gym[mujoco]`. Dependencies for old MuJoCo environments can still be installed with `pip install gym[mujoco_py]`.

## Citation

A whitepaper from when Gym first came out is available at https://arxiv.org/pdf/1606.01540, and can be cited with the following BibTeX entry:
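For orientation, a minimal usage sketch (my addition, not part of the README diff): after `pip install gym[mujoco]`, the new "\_v4" ids are created and stepped exactly like the older versions.

```
import gym

env = gym.make("Ant-v4")  # backed by the new `mujoco` bindings
observation = env.reset()
for _ in range(10):
    observation, reward, done, info = env.step(env.action_space.sample())
    if done:
        observation = env.reset()
env.close()
```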
@@ -162,6 +162,13 @@ register(
|
||||
reward_threshold=-3.75,
|
||||
)
|
||||
|
||||
register(
|
||||
id="Reacher-v4",
|
||||
entry_point="gym.envs.mujoco.reacher_v4:ReacherEnv",
|
||||
max_episode_steps=50,
|
||||
reward_threshold=-3.75,
|
||||
)
|
||||
|
||||
register(
|
||||
id="Pusher-v2",
|
||||
entry_point="gym.envs.mujoco:PusherEnv",
|
||||
@@ -169,6 +176,13 @@ register(
|
||||
reward_threshold=0.0,
|
||||
)
|
||||
|
||||
register(
|
||||
id="Pusher-v4",
|
||||
entry_point="gym.envs.mujoco.pusher_v4:PusherEnv",
|
||||
max_episode_steps=100,
|
||||
reward_threshold=0.0,
|
||||
)
|
||||
|
||||
register(
|
||||
id="InvertedPendulum-v2",
|
||||
entry_point="gym.envs.mujoco:InvertedPendulumEnv",
|
||||
@@ -176,6 +190,13 @@ register(
|
||||
reward_threshold=950.0,
|
||||
)
|
||||
|
||||
register(
|
||||
id="InvertedPendulum-v4",
|
||||
entry_point="gym.envs.mujoco.inverted_pendulum_v4:InvertedPendulumEnv",
|
||||
max_episode_steps=1000,
|
||||
reward_threshold=950.0,
|
||||
)
|
||||
|
||||
register(
|
||||
id="InvertedDoublePendulum-v2",
|
||||
entry_point="gym.envs.mujoco:InvertedDoublePendulumEnv",
|
||||
@@ -183,6 +204,13 @@ register(
|
||||
reward_threshold=9100.0,
|
||||
)
|
||||
|
||||
register(
|
||||
id="InvertedDoublePendulum-v4",
|
||||
entry_point="gym.envs.mujoco.inverted_double_pendulum_v4:InvertedDoublePendulumEnv",
|
||||
max_episode_steps=1000,
|
||||
reward_threshold=9100.0,
|
||||
)
|
||||
|
||||
register(
|
||||
id="HalfCheetah-v2",
|
||||
entry_point="gym.envs.mujoco:HalfCheetahEnv",
|
||||
@@ -197,6 +225,13 @@ register(
|
||||
reward_threshold=4800.0,
|
||||
)
|
||||
|
||||
register(
|
||||
id="HalfCheetah-v4",
|
||||
entry_point="gym.envs.mujoco.half_cheetah_v4:HalfCheetahEnv",
|
||||
max_episode_steps=1000,
|
||||
reward_threshold=4800.0,
|
||||
)
|
||||
|
||||
register(
|
||||
id="Hopper-v2",
|
||||
entry_point="gym.envs.mujoco:HopperEnv",
|
||||
@@ -211,6 +246,13 @@ register(
|
||||
reward_threshold=3800.0,
|
||||
)
|
||||
|
||||
register(
|
||||
id="Hopper-v4",
|
||||
entry_point="gym.envs.mujoco.hopper_v4:HopperEnv",
|
||||
max_episode_steps=1000,
|
||||
reward_threshold=3800.0,
|
||||
)
|
||||
|
||||
register(
|
||||
id="Swimmer-v2",
|
||||
entry_point="gym.envs.mujoco:SwimmerEnv",
|
||||
@@ -225,6 +267,13 @@ register(
|
||||
reward_threshold=360.0,
|
||||
)
|
||||
|
||||
register(
|
||||
id="Swimmer-v4",
|
||||
entry_point="gym.envs.mujoco.swimmer_v4:SwimmerEnv",
|
||||
max_episode_steps=1000,
|
||||
reward_threshold=360.0,
|
||||
)
|
||||
|
||||
register(
|
||||
id="Walker2d-v2",
|
||||
max_episode_steps=1000,
|
||||
@@ -237,6 +286,12 @@ register(
|
||||
entry_point="gym.envs.mujoco.walker2d_v3:Walker2dEnv",
|
||||
)
|
||||
|
||||
register(
|
||||
id="Walker2d-v4",
|
||||
max_episode_steps=1000,
|
||||
entry_point="gym.envs.mujoco.walker2d_v4:Walker2dEnv",
|
||||
)
|
||||
|
||||
register(
|
||||
id="Ant-v2",
|
||||
entry_point="gym.envs.mujoco:AntEnv",
|
||||
@@ -251,6 +306,13 @@ register(
|
||||
reward_threshold=6000.0,
|
||||
)
|
||||
|
||||
register(
|
||||
id="Ant-v4",
|
||||
entry_point="gym.envs.mujoco.ant_v4:AntEnv",
|
||||
max_episode_steps=1000,
|
||||
reward_threshold=6000.0,
|
||||
)
|
||||
|
||||
register(
|
||||
id="Humanoid-v2",
|
||||
entry_point="gym.envs.mujoco:HumanoidEnv",
|
||||
@@ -263,8 +325,20 @@ register(
|
||||
max_episode_steps=1000,
|
||||
)
|
||||
|
||||
register(
|
||||
id="Humanoid-v4",
|
||||
entry_point="gym.envs.mujoco.humanoid_v4:HumanoidEnv",
|
||||
max_episode_steps=1000,
|
||||
)
|
||||
|
||||
register(
|
||||
id="HumanoidStandup-v2",
|
||||
entry_point="gym.envs.mujoco:HumanoidStandupEnv",
|
||||
max_episode_steps=1000,
|
||||
)
|
||||
|
||||
register(
|
||||
id="HumanoidStandup-v4",
|
||||
entry_point="gym.envs.mujoco.humanoidstandup_v4:HumanoidStandupEnv",
|
||||
max_episode_steps=1000,
|
||||
)
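The pattern above is uniform: every MuJoCo task gains a `-v4` id whose entry point lives in a new `*_v4` module. As a hedged illustration (the custom id below is hypothetical), this is how such a registration is consumed:

```
import gym
from gym.envs.registration import register

# Hypothetical custom registration, mirroring the v4 entries added above.
register(
    id="MyAnt-v0",
    entry_point="gym.envs.mujoco.ant_v4:AntEnv",
    max_episode_steps=1000,
)

env = gym.make("MyAnt-v0")  # gym.make resolves the id through the registry
```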
|
||||
|
@@ -9,6 +9,7 @@ from gym.envs.mujoco.humanoid import HumanoidEnv
|
||||
from gym.envs.mujoco.humanoidstandup import HumanoidStandupEnv
|
||||
from gym.envs.mujoco.inverted_double_pendulum import InvertedDoublePendulumEnv
|
||||
from gym.envs.mujoco.inverted_pendulum import InvertedPendulumEnv
|
||||
from gym.envs.mujoco.mujoco_rendering import RenderContextOffscreen, Viewer
|
||||
from gym.envs.mujoco.pusher import PusherEnv
|
||||
from gym.envs.mujoco.reacher import ReacherEnv
|
||||
from gym.envs.mujoco.swimmer import SwimmerEnv
|
||||
|
@@ -6,7 +6,7 @@ from gym.envs.mujoco import mujoco_env
|
||||
|
||||
class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
def __init__(self):
|
||||
mujoco_env.MujocoEnv.__init__(self, "ant.xml", 5)
|
||||
mujoco_env.MujocoEnv.__init__(self, "ant.xml", 5, mujoco_bindings="mujoco_py")
|
||||
utils.EzPickle.__init__(self)
|
||||
|
||||
def step(self, a):
|
||||
|
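The `mujoco_bindings="mujoco_py"` argument added above is how the legacy v2/v3 environments keep using `mujoco-py`, while the new v4 modules default to the official `mujoco` bindings. A rough sketch of the dispatch this implies (the real logic lives in `gym/envs/mujoco/mujoco_env.py`, which is not shown in this excerpt):

```
# Sketch only -- illustrating the intent of the `mujoco_bindings` parameter.
def load_bindings(mujoco_bindings="mujoco"):
    if mujoco_bindings == "mujoco_py":
        import mujoco_py  # legacy bindings, used by the v2/v3 environments
        return mujoco_py
    else:
        import mujoco  # new official bindings, used by the v4 environments
        return mujoco
```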
@@ -9,167 +9,6 @@ DEFAULT_CAMERA_CONFIG = {
|
||||
|
||||
|
||||
class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
"""
|
||||
### Description
|
||||
|
||||
This environment is based on the environment introduced by Schulman,
|
||||
Moritz, Levine, Jordan and Abbeel in ["High-Dimensional Continuous Control
|
||||
Using Generalized Advantage Estimation"](https://arxiv.org/abs/1506.02438).
|
||||
The ant is a 3D robot consisting of one torso (free rotational body) with
|
||||
four legs attached to it with each leg having two links. The goal is to
|
||||
coordinate the four legs to move in the forward (right) direction by applying
|
||||
torques on the eight hinges connecting the two links of each leg and the torso
|
||||
(nine parts and eight hinges).
|
||||
|
||||
### Action Space
|
||||
The action space is a `Box(-1, 1, (8,), float32)`. An action represents the torques applied at the hinge joints.
|
||||
|
||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||
|-----|----------------------|---------------|----------------|---------------------------------------|-------|------|
|
||||
| 0 | Torque applied on the rotor between the torso and front left hip | -1 | 1 | hip_1 (front_left_leg) | hinge | torque (N m) |
|
||||
| 1 | Torque applied on the rotor between the front left two links | -1 | 1 | angle_1 (front_left_leg) | hinge | torque (N m) |
|
||||
| 2 | Torque applied on the rotor between the torso and front right hip | -1 | 1 | hip_2 (front_right_leg) | hinge | torque (N m) |
|
||||
| 3 | Torque applied on the rotor between the front right two links | -1 | 1 | angle_2 (front_right_leg) | hinge | torque (N m) |
|
||||
| 4 | Torque applied on the rotor between the torso and back left hip | -1 | 1 | hip_3 (back_leg) | hinge | torque (N m) |
|
||||
| 5 | Torque applied on the rotor between the back left two links | -1 | 1 | angle_3 (back_leg) | hinge | torque (N m) |
|
||||
| 6 | Torque applied on the rotor between the torso and back right hip | -1 | 1 | hip_4 (right_back_leg) | hinge | torque (N m) |
|
||||
| 7 | Torque applied on the rotor between the back right two links | -1 | 1 | angle_4 (right_back_leg) | hinge | torque (N m) |
|
||||
|
||||
### Observation Space
|
||||
|
||||
Observations consist of positional values of different body parts of the ant,
|
||||
followed by the velocities of those individual parts (their derivatives) with all
|
||||
the positions ordered before all the velocities.
|
||||
|
||||
By default, observations do not include the x- and y-coordinates of the ant's torso. These may
|
||||
be included by passing `exclude_current_positions_from_observation=False` during construction.
|
||||
In that case, the observation space will have 113 dimensions where the first two dimensions
|
||||
represent the x- and y- coordinates of the ant's torso.
|
||||
Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates
|
||||
of the torso will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively.
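For example (a minimal sketch using the v3 keyword arguments documented further below):

```
import gym

env = gym.make("Ant-v3", exclude_current_positions_from_observation=False)
obs = env.reset()
print(obs.shape)  # (113,): obs[0] and obs[1] are now the torso x and y coordinates
obs, reward, done, info = env.step(env.action_space.sample())
print(info["x_position"], info["y_position"])  # reported regardless of the flag
```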
|
||||
|
||||
However, by default, an observation is a `ndarray` with shape `(111,)`
|
||||
where the elements correspond to the following:
|
||||
|
||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|
||||
|-----|-------------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
|
||||
| 0 | z-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) |
|
||||
| 1 | x-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
|
||||
| 2 | y-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
|
||||
| 3 | z-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
|
||||
| 4 | w-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
|
||||
| 5 | angle between torso and first link on front left | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) |
|
||||
| 6 | angle between the two links on the front left | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) |
|
||||
| 7 | angle between torso and first link on front right | -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) |
|
||||
| 8 | angle between the two links on the front right | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) |
|
||||
| 9 | angle between torso and first link on back left | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) |
|
||||
| 10 | angle between the two links on the back left | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) |
|
||||
| 11 | angle between torso and first link on back right | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) |
|
||||
| 12 | angle between the two links on the back right | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) |
|
||||
| 13 | x-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) |
|
||||
| 14 | y-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) |
|
||||
| 15 | z-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) |
|
||||
| 16 | x-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) |
|
||||
| 17 | y-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) |
|
||||
| 18 | z-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) |
|
||||
| 19 | angular velocity of angle between torso and front left link | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) |
|
||||
| 20 | angular velocity of the angle between front left links | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) |
|
||||
| 21 | angular velocity of angle between torso and front right link| -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) |
|
||||
| 22 | angular velocity of the angle between front right links | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) |
|
||||
| 23 | angular velocity of angle between torso and back left link | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) |
|
||||
| 24 | angular velocity of the angle between back left links | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) |
|
||||
| 25 | angular velocity of angle between torso and back right link | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) |
|
||||
| 26 |angular velocity of the angle between back right links | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) |
|
||||
|
||||
|
||||
The remaining 14*6 = 84 elements of the observation are contact forces
(external forces - force x, y, z and torque x, y, z) applied to the
center of mass of each of the links. The 14 links are: the ground link,
the torso link, and 3 links for each of the four legs (1 + 1 + 4*3 = 14), each contributing the 6 external force/torque values.
|
||||
|
||||
The (x,y,z) coordinates are translational DOFs while the orientations are rotational
|
||||
DOFs expressed as quaternions. One can read more about free joints on the [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html).
|
||||
|
||||
|
||||
**Note:** There have been reported issues that using a Mujoco-Py version > 2.0 results
in the contact forces always being 0. As such we recommend using a Mujoco-Py version < 2.0
when using the Ant environment if you would like to report results with contact forces (if
contact forces are not used in your experiments, you can use version > 2.0).
|
||||
|
||||
### Rewards
|
||||
The reward consists of three parts:
|
||||
- *healthy_reward*: Every timestep that the ant is healthy (see definition in section "Episode Termination"), it gets a reward of fixed value `healthy_reward`
|
||||
- *forward_reward*: A reward for moving forward, measured as
*(x-coordinate after action - x-coordinate before action)/dt*. *dt* is the time
between actions and is dependent on the `frame_skip` parameter (default is 5),
where the frametime is 0.01 - making the default *dt = 5 * 0.01 = 0.05*.
This reward is positive if the ant moves forward (in the positive x direction).
|
||||
- *ctrl_cost*: A negative reward for penalising the ant if it takes actions
that are too large. It is measured as *`ctrl_cost_weight` * sum(action<sup>2</sup>)*
where *`ctrl_cost_weight`* is a parameter set for the control and has a default value of 0.5.
|
||||
- *contact_cost*: A negative reward for penalising the ant if the external contact
|
||||
force is too large. It is calculated *`contact_cost_weight` * sum(clip(external contact
|
||||
force to `contact_force_range`)<sup>2</sup>)*.
|
||||
|
||||
The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost - contact_cost* and `info` will also contain the individual reward terms.
|
||||
|
||||
### Starting State
|
||||
All observations start in state
|
||||
(0.0, 0.0, 0.75, 1.0, 0.0 ... 0.0) with a uniform noise in the range
|
||||
of [-`reset_noise_scale`, `reset_noise_scale`] added to the positional values and standard normal noise
|
||||
with mean 0 and standard deviation `reset_noise_scale` added to the velocity values for
|
||||
stochasticity. Note that the initial z coordinate is intentionally selected
|
||||
to be slightly high, thereby indicating a standing up ant. The initial orientation
|
||||
is designed to make it face forward as well.
|
||||
|
||||
### Episode Termination
|
||||
The ant is said to be unhealthy if any of the following happens:
|
||||
|
||||
1. Any of the state space values is no longer finite
|
||||
2. The z-coordinate of the torso is **not** in the closed interval given by `healthy_z_range` (defaults to [0.2, 1.0])
|
||||
|
||||
If `terminate_when_unhealthy=True` is passed during construction (which is the default),
|
||||
the episode terminates when any of the following happens:
|
||||
|
||||
1. The episode duration reaches 1000 timesteps
|
||||
2. The ant is unhealthy
|
||||
|
||||
If `terminate_when_unhealthy=False` is passed, the episode is terminated only when 1000 timesteps are exceeded.
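For instance (a minimal sketch):

```
import gym

# Disable the health-based termination; only the 1000-step time limit ends the episode.
env = gym.make("Ant-v3", terminate_when_unhealthy=False)
```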
|
||||
|
||||
### Arguments
|
||||
|
||||
No additional arguments are currently supported in v2 and lower.
|
||||
|
||||
```
|
||||
env = gym.make('Ant-v2')
|
||||
```
|
||||
|
||||
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||
|
||||
```
|
||||
env = gym.make('Ant-v3', ctrl_cost_weight=0.1, ...)
|
||||
```
|
||||
|
||||
| Parameter | Type | Default |Description |
|
||||
|-------------------------|------------|--------------|-------------------------------|
|
||||
| `xml_file` | **str** | `"ant.xml"` | Path to a MuJoCo model |
|
||||
| `ctrl_cost_weight` | **float** | `0.5` | Weight for *ctrl_cost* term (see section on reward) |
|
||||
| `contact_cost_weight` | **float** | `5e-4` | Weight for *contact_cost* term (see section on reward) |
|
||||
| `healthy_reward` | **float** | `1` | Constant reward given if the ant is "healthy" after timestep |
|
||||
| `terminate_when_unhealthy` | **bool**| `True` | If true, issue a done signal if the z-coordinate of the torso is no longer in the `healthy_z_range` |
|
||||
| `healthy_z_range` | **tuple** | `(0.2, 1)` | The ant is considered healthy if the z-coordinate of the torso is in this range |
|
||||
| `contact_force_range` | **tuple** | `(-1, 1)` | Contact forces are clipped to this range in the computation of *contact_cost* |
|
||||
| `reset_noise_scale` | **float** | `0.1` | Scale of random perturbations of initial position and velocity (see section on Starting State) |
|
||||
| `exclude_current_positions_from_observation`| **bool** | `True`| Whether or not to omit the x- and y-coordinates from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies |
|
||||
|
||||
### Version History
|
||||
|
||||
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
||||
* v0: Initial versions release (1.0.0)
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
xml_file="ant.xml",
|
||||
@@ -199,7 +38,7 @@ class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
exclude_current_positions_from_observation
|
||||
)
|
||||
|
||||
mujoco_env.MujocoEnv.__init__(self, xml_file, 5)
|
||||
mujoco_env.MujocoEnv.__init__(self, xml_file, 5, mujoco_bindings="mujoco_py")
|
||||
|
||||
@property
|
||||
def healthy_reward(self):
|
||||
|
308
gym/envs/mujoco/ant_v4.py
Normal file
308
gym/envs/mujoco/ant_v4.py
Normal file
@@ -0,0 +1,308 @@
|
||||
import numpy as np
|
||||
|
||||
from gym import utils
|
||||
from gym.envs.mujoco import mujoco_env
|
||||
|
||||
DEFAULT_CAMERA_CONFIG = {
|
||||
"distance": 4.0,
|
||||
}
|
||||
|
||||
|
||||
class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
"""
|
||||
### Description
|
||||
|
||||
This environment is based on the environment introduced by Schulman,
|
||||
Moritz, Levine, Jordan and Abbeel in ["High-Dimensional Continuous Control
|
||||
Using Generalized Advantage Estimation"](https://arxiv.org/abs/1506.02438).
|
||||
The ant is a 3D robot consisting of one torso (free rotational body) with
|
||||
four legs attached to it with each leg having two links. The goal is to
|
||||
coordinate the four legs to move in the forward (right) direction by applying
|
||||
torques on the eight hinges connecting the two links of each leg and the torso
|
||||
(nine parts and eight hinges).
|
||||
|
||||
### Action Space
|
||||
The agent takes an 8-element vector for actions.

The action space is a continuous space in which each of the 8 components lies in `[-1, 1]` and represents the numerical torque
applied at the corresponding hinge joint.
|
||||
|
||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||
|-----|----------------------|---------------|----------------|---------------------------------------|-------|------|
|
||||
| 0 | Torque applied on the rotor between the torso and front left hip | -1 | 1 | hip_1 (front_left_leg) | hinge | torque (N m) |
|
||||
| 1 | Torque applied on the rotor between the front left two links | -1 | 1 | angle_1 (front_left_leg) | hinge | torque (N m) |
|
||||
| 2 | Torque applied on the rotor between the torso and front right hip | -1 | 1 | hip_2 (front_right_leg) | hinge | torque (N m) |
|
||||
| 3 | Torque applied on the rotor between the front right two links | -1 | 1 | angle_2 (front_right_leg) | hinge | torque (N m) |
|
||||
| 4 | Torque applied on the rotor between the torso and back left hip | -1 | 1 | hip_3 (back_leg) | hinge | torque (N m) |
|
||||
| 5 | Torque applied on the rotor between the back left two links | -1 | 1 | angle_3 (back_leg) | hinge | torque (N m) |
|
||||
| 6 | Torque applied on the rotor between the torso and back right hip | -1 | 1 | hip_4 (right_back_leg) | hinge | torque (N m) |
|
||||
| 7 | Torque applied on the rotor between the back right two links | -1 | 1 | angle_4 (right_back_leg) | hinge | torque (N m) |
|
||||
|
||||
### Observation Space
|
||||
|
||||
The state space consists of positional values of different body parts of the ant,
|
||||
followed by the velocities of those individual parts (their derivatives) with all
|
||||
the positions ordered before all the velocities.
|
||||
|
||||
The observation is a `ndarray` with shape `(111,)` where the elements correspond to the following:
|
||||
|
||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|
||||
|-----|-------------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
|
||||
| 0 | x-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) |
|
||||
| 1 | y-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) |
|
||||
| 2 | z-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) |
|
||||
| 3 | x-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
|
||||
| 4 | y-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
|
||||
| 5 | z-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
|
||||
| 6 | w-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
|
||||
| 7 | angle between torso and first link on front left | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) |
|
||||
| 8 | angle between the two links on the front left | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) |
|
||||
| 9 | angle between torso and first link on front right | -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) |
|
||||
| 10 | angle between the two links on the front right | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) |
|
||||
| 11 | angle between torso and first link on back left | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) |
|
||||
| 12 | angle between the two links on the back left | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) |
|
||||
| 13 | angle between torso and first link on back right | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) |
|
||||
| 14 | angle between the two links on the back right | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) |
|
||||
| 15 | x-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) |
|
||||
| 16 | y-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) |
|
||||
| 17 | z-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) |
|
||||
| 18 | x-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) |
|
||||
| 19 | y-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) |
|
||||
| 20 | z-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) |
|
||||
| 21 | angular velocity of angle between torso and front left link | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) |
|
||||
| 22 | angular velocity of the angle between front left links | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) |
|
||||
| 23 | angular velocity of angle between torso and front right link| -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) |
|
||||
| 24 | angular velocity of the angle between front right links | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) |
|
||||
| 25 | angular velocity of angle between torso and back left link | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) |
|
||||
| 26 | angular velocity of the angle between back left links | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) |
|
||||
| 27 | angular velocity of angle between torso and back right link | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) |
|
||||
| 28 |angular velocity of the angle between back right links | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) |
|
||||
|
||||
|
||||
The remaining 14*6 = 84 elements in the state are contact forces
(external forces - force x, y, z and torque x, y, z) applied to the
center of mass of each of the links. The 14 links are: the ground link,
the torso link, and 3 links for each of the four legs (1 + 1 + 4*3 = 14), each contributing the 6 external force/torque values.
|
||||
|
||||
The (x,y,z) coordinates are translational DOFs while the orientations are rotational
|
||||
DOFs expressed as quaternions. One can read more about free joints on the [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html).
|
||||
|
||||
**Note:** There are 29 elements in the table above; together with the 84 contact-force values this gives a
`(113,)`-element state. In practice (and in the Gym implementation), the first two positional
elements are omitted from the observation since the reward function is calculated based
on the x-coordinate value. This value is hidden from the algorithm, which in turn has to
develop an abstract understanding of it from the observed rewards. Therefore, with contact forces included the observation
has shape `(111,)` instead of `(113,)` (and `(27,)` with the default `use_contact_forces=False`), and the table should not have the first two rows.
|
||||
|
||||
**Note:** The Ant-v4 environment no longer has the contact forces issue described below.
In Ant versions prior to v4, there have been reported issues that using a Mujoco-Py version > 2.0 results
in the contact forces always being 0. As such we recommend using a Mujoco-Py version < 2.0
with those environments if you would like to report results with contact forces (if
contact forces are not used in your experiments, you can use version > 2.0).

**Note:** Ant-v4 has the option of including contact forces in the observation space. To add contact forces, set the argument
`use_contact_forces` to True. The default value is False. Also note that training with contact forces included can perform worse
than training without them, as shown in https://github.com/openai/gym/pull/2762.
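A minimal sketch of toggling this option (my illustration; the shapes follow from the observation description above):

```
import gym

env = gym.make("Ant-v4")                              # default: contact forces excluded
env_cf = gym.make("Ant-v4", use_contact_forces=True)  # appends the 84 clipped contact-force values
print(env.observation_space.shape)     # expected (27,)
print(env_cf.observation_space.shape)  # expected (111,)
```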
|
||||
|
||||
### Rewards
|
||||
The reward consists of three parts:
|
||||
- *survive_reward*: Every timestep that the ant is alive, it gets a reward of 1.
|
||||
- *forward_reward*: A reward for moving forward, measured as
*(x-coordinate after action - x-coordinate before action)/dt*. *dt* is the time
between actions and is dependent on the frame_skip parameter (default is 5),
where the *dt* for one frame is 0.01 - making the default *dt = 5 * 0.01 = 0.05*.
This reward is positive if the ant moves forward (in the positive x direction), as desired.
|
||||
- *ctrl_cost*: A negative reward for penalising the ant if it takes actions
|
||||
that are too large. It is measured as *coefficient **x** sum(action<sup>2</sup>)*
|
||||
where *coefficient* is a parameter set for the control and has a default value of 0.5.
|
||||
- *contact_cost*: A negative reward for penalising the ant if the external contact
|
||||
force is too large. It is calculated *0.5 * 0.001 * sum(clip(external contact
|
||||
force to [-1,1])<sup>2</sup>)*.
|
||||
|
||||
The total reward returned is ***reward*** *=* *survive_reward + forward_reward - ctrl_cost - contact_cost*
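The individual terms are also exposed through `info` by the `step()` implementation further down in this file; a minimal sketch of inspecting them:

```
import gym

env = gym.make("Ant-v4")
env.reset()
obs, reward, done, info = env.step(env.action_space.sample())
# With the default `use_contact_forces=False` there is no contact_cost term, so:
# reward == info["reward_survive"] + info["reward_forward"] + info["reward_ctrl"]
print(reward, info["reward_survive"], info["reward_forward"], info["reward_ctrl"])
```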
|
||||
|
||||
### Starting State
|
||||
All observations start in state
|
||||
(0.0, 0.0, 0.75, 1.0, 0.0 ... 0.0) with a uniform noise in the range
|
||||
of [-0.1, 0.1] added to the positional values and standard normal noise
|
||||
with 0 mean and 0.1 standard deviation added to the velocity values for
|
||||
stochasticity. Note that the initial z coordinate is intentionally selected
|
||||
to be slightly high, thereby indicating a standing up ant. The initial orientation
|
||||
is designed to make it face forward as well.
|
||||
|
||||
### Episode Termination
|
||||
The episode terminates when any of the following happens:
|
||||
|
||||
1. The episode duration reaches 1000 timesteps
2. Any of the state space values is no longer finite
3. The z-coordinate of the torso (element 2 of the full state) is **not** in the closed interval `[0.2, 1.0]`
|
||||
|
||||
### Arguments
|
||||
|
||||
No additional arguments are currently supported (in v2 and lower), but modifications
|
||||
can be made to the XML file in the assets folder (or by changing the path to a modified
|
||||
XML file in another folder).
|
||||
|
||||
```
|
||||
env = gym.make('Ant-v2')
|
||||
```
|
||||
|
||||
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||
|
||||
```
|
||||
env = gym.make('Ant-v3', ctrl_cost_weight=0.1, ...)
|
||||
```
|
||||
|
||||
### Version History
|
||||
* v4: all mujoco environments now use the mujoco bindings in mujoco>=2.1.3
|
||||
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
||||
* v0: Initial versions release (1.0.0)
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
xml_file="ant.xml",
|
||||
ctrl_cost_weight=0.5,
|
||||
use_contact_forces=False,
|
||||
contact_cost_weight=5e-4,
|
||||
healthy_reward=1.0,
|
||||
terminate_when_unhealthy=True,
|
||||
healthy_z_range=(0.2, 1.0),
|
||||
contact_force_range=(-1.0, 1.0),
|
||||
reset_noise_scale=0.1,
|
||||
exclude_current_positions_from_observation=True,
|
||||
):
|
||||
utils.EzPickle.__init__(**locals())
|
||||
|
||||
self._ctrl_cost_weight = ctrl_cost_weight
|
||||
self._contact_cost_weight = contact_cost_weight
|
||||
|
||||
self._healthy_reward = healthy_reward
|
||||
self._terminate_when_unhealthy = terminate_when_unhealthy
|
||||
self._healthy_z_range = healthy_z_range
|
||||
|
||||
self._contact_force_range = contact_force_range
|
||||
|
||||
self._reset_noise_scale = reset_noise_scale
|
||||
|
||||
self._use_contact_forces = use_contact_forces
|
||||
|
||||
self._exclude_current_positions_from_observation = (
|
||||
exclude_current_positions_from_observation
|
||||
)
|
||||
|
||||
mujoco_env.MujocoEnv.__init__(self, xml_file, 5)
|
||||
|
||||
@property
|
||||
def healthy_reward(self):
|
||||
return (
|
||||
float(self.is_healthy or self._terminate_when_unhealthy)
|
||||
* self._healthy_reward
|
||||
)
|
||||
|
||||
def control_cost(self, action):
|
||||
control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
|
||||
return control_cost
|
||||
|
||||
@property
|
||||
def contact_forces(self):
|
||||
raw_contact_forces = self.data.cfrc_ext
|
||||
min_value, max_value = self._contact_force_range
|
||||
contact_forces = np.clip(raw_contact_forces, min_value, max_value)
|
||||
return contact_forces
|
||||
|
||||
@property
|
||||
def contact_cost(self):
|
||||
contact_cost = self._contact_cost_weight * np.sum(
|
||||
np.square(self.contact_forces)
|
||||
)
|
||||
return contact_cost
|
||||
|
||||
@property
|
||||
def is_healthy(self):
|
||||
state = self.state_vector()
|
||||
min_z, max_z = self._healthy_z_range
|
||||
is_healthy = np.isfinite(state).all() and min_z <= state[2] <= max_z
|
||||
return is_healthy
|
||||
|
||||
@property
|
||||
def done(self):
|
||||
done = not self.is_healthy if self._terminate_when_unhealthy else False
|
||||
return done
|
||||
|
||||
def step(self, action):
|
||||
xy_position_before = self.get_body_com("torso")[:2].copy()
|
||||
self.do_simulation(action, self.frame_skip)
|
||||
xy_position_after = self.get_body_com("torso")[:2].copy()
|
||||
|
||||
xy_velocity = (xy_position_after - xy_position_before) / self.dt
|
||||
x_velocity, y_velocity = xy_velocity
|
||||
|
||||
forward_reward = x_velocity
|
||||
healthy_reward = self.healthy_reward
|
||||
|
||||
rewards = forward_reward + healthy_reward
|
||||
|
||||
costs = ctrl_cost = self.control_cost(action)
|
||||
|
||||
done = self.done
|
||||
observation = self._get_obs()
|
||||
info = {
|
||||
"reward_forward": forward_reward,
|
||||
"reward_ctrl": -ctrl_cost,
|
||||
"reward_survive": healthy_reward,
|
||||
"x_position": xy_position_after[0],
|
||||
"y_position": xy_position_after[1],
|
||||
"distance_from_origin": np.linalg.norm(xy_position_after, ord=2),
|
||||
"x_velocity": x_velocity,
|
||||
"y_velocity": y_velocity,
|
||||
"forward_reward": forward_reward,
|
||||
}
|
||||
if self._use_contact_forces:
|
||||
contact_cost = self.contact_cost
|
||||
costs += contact_cost
|
||||
info["reward_ctrl"] = -contact_cost
|
||||
|
||||
reward = rewards - costs
|
||||
|
||||
return observation, reward, done, info
|
||||
|
||||
def _get_obs(self):
|
||||
position = self.data.qpos.flat.copy()
|
||||
velocity = self.data.qvel.flat.copy()
|
||||
|
||||
if self._exclude_current_positions_from_observation:
|
||||
position = position[2:]
|
||||
|
||||
if self._use_contact_forces:
|
||||
contact_force = self.contact_forces.flat.copy()
|
||||
return np.concatenate((position, velocity, contact_force))
|
||||
else:
|
||||
return np.concatenate((position, velocity))
|
||||
|
||||
def reset_model(self):
|
||||
noise_low = -self._reset_noise_scale
|
||||
noise_high = self._reset_noise_scale
|
||||
|
||||
qpos = self.init_qpos + self.np_random.uniform(
|
||||
low=noise_low, high=noise_high, size=self.model.nq
|
||||
)
|
||||
qvel = (
|
||||
self.init_qvel
|
||||
+ self._reset_noise_scale * self.np_random.standard_normal(self.model.nv)
|
||||
)
|
||||
self.set_state(qpos, qvel)
|
||||
|
||||
observation = self._get_obs()
|
||||
|
||||
return observation
|
||||
|
||||
def viewer_setup(self):
|
||||
for key, value in DEFAULT_CAMERA_CONFIG.items():
|
||||
if isinstance(value, np.ndarray):
|
||||
getattr(self.viewer.cam, key)[:] = value
|
||||
else:
|
||||
setattr(self.viewer.cam, key, value)
|
@@ -22,6 +22,6 @@
|
||||
</body>
|
||||
</worldbody>
|
||||
<actuator>
|
||||
<motor gear="100" joint="slider" name="slide"/>
|
||||
<motor ctrllimited="true" ctrlrange="-3 3" gear="100" joint="slider" name="slide"/>
|
||||
</actuator>
|
||||
</mujoco>
|
@@ -6,7 +6,9 @@ from gym.envs.mujoco import mujoco_env
|
||||
|
||||
class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
def __init__(self):
|
||||
mujoco_env.MujocoEnv.__init__(self, "half_cheetah.xml", 5)
|
||||
mujoco_env.MujocoEnv.__init__(
|
||||
self, "half_cheetah.xml", 5, mujoco_bindings="mujoco_py"
|
||||
)
|
||||
utils.EzPickle.__init__(self)
|
||||
|
||||
def step(self, action):
|
||||
|
@@ -10,125 +10,6 @@ DEFAULT_CAMERA_CONFIG = {
|
||||
|
||||
|
||||
class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
"""
|
||||
### Description
|
||||
|
||||
This environment is based on the work by P. Wawrzyński in
|
||||
["A Cat-Like Robot Real-Time Learning to Run"](http://staff.elka.pw.edu.pl/~pwawrzyn/pub-s/0812_LSCLRR.pdf).
|
||||
The HalfCheetah is a 2-dimensional robot consisting of 9 links and 8
|
||||
joints connecting them (including two paws). The goal is to apply a torque
|
||||
on the joints to make the cheetah run forward (right) as fast as possible,
|
||||
with a positive reward allocated based on the distance moved forward and a
|
||||
negative reward allocated for moving backward. The torso and head of the
|
||||
cheetah are fixed, and the torque can only be applied on the other 6 joints
|
||||
over the front and back thighs (connecting to the torso), shins
|
||||
(connecting to the thighs) and feet (connecting to the shins).
|
||||
|
||||
### Action Space
|
||||
The action space is a `Box(-1, 1, (6,), float32)`. An action represents the torques applied between *links*.
|
||||
|
||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||
|-------|--------------------------------------|---------------|----------------|---------------------------------------|-------|------|
|
||||
| 0 | Torque applied on the back thigh rotor | -1 | 1 | bthigh | hinge | torque (N m) |
|
||||
| 1 | Torque applied on the back shin rotor | -1 | 1 | bshin | hinge | torque (N m) |
|
||||
| 2 | Torque applied on the back foot rotor | -1 | 1 | bfoot | hinge | torque (N m) |
|
||||
| 3 | Torque applied on the front thigh rotor| -1 | 1 | fthigh | hinge | torque (N m) |
|
||||
| 4 | Torque applied on the front shin rotor | -1 | 1 | fshin | hinge | torque (N m) |
|
||||
| 5 | Torque applied on the front foot rotor | -1 | 1 | ffoot | hinge | torque (N m) |
|
||||
|
||||
### Observation Space
|
||||
|
||||
Observations consist of positional values of different body parts of the
|
||||
cheetah, followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities.
|
||||
|
||||
By default, observations do not include the x-coordinate of the cheetah's center of mass. It may
|
||||
be included by passing `exclude_current_positions_from_observation=False` during construction.
|
||||
In that case, the observation space will have 18 dimensions where the first dimension
|
||||
represents the x-coordinate of the cheetah's center of mass.
|
||||
Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x-coordinate
|
||||
will be returned in `info` with key `"x_position"`.
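For example (a minimal sketch using the v3 keyword arguments documented below):

```
import gym

env = gym.make("HalfCheetah-v3", exclude_current_positions_from_observation=False)
obs = env.reset()
print(obs.shape)  # (18,): obs[0] is now the x-coordinate; the default shape is (17,)
obs, reward, done, info = env.step(env.action_space.sample())
print(info["x_position"])  # reported regardless of the flag
```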
|
||||
|
||||
However, by default, the observation is a `ndarray` with shape `(17,)` where the elements correspond to the following:
|
||||
|
||||
|
||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
||||
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
||||
| 0 | z-coordinate of the front tip | -Inf | Inf | rootz | slide | position (m) |
|
||||
| 1 | angle of the front tip | -Inf | Inf | rooty | hinge | angle (rad) |
|
||||
| 2 | angle of the second rotor | -Inf | Inf | bthigh | hinge | angle (rad) |
|
||||
| 3 | angle of the second rotor | -Inf | Inf | bshin | hinge | angle (rad) |
|
||||
| 4 | velocity of the tip along the x-axis | -Inf | Inf | bfoot | hinge | angle (rad) |
|
||||
| 5 | velocity of the tip along the y-axis | -Inf | Inf | fthigh | hinge | angle (rad) |
|
||||
| 6 | angular velocity of front tip | -Inf | Inf | fshin | hinge | angle (rad) |
|
||||
| 7 | angular velocity of second rotor | -Inf | Inf | ffoot | hinge | angle (rad) |
|
||||
| 8 | x-coordinate of the front tip | -Inf | Inf | rootx | slide | velocity (m/s) |
|
||||
| 9 | y-coordinate of the front tip | -Inf | Inf | rootz | slide | velocity (m/s) |
|
||||
| 10 | angle of the front tip | -Inf | Inf | rooty | hinge | angular velocity (rad/s) |
|
||||
| 11 | angle of the second rotor | -Inf | Inf | bthigh | hinge | angular velocity (rad/s) |
|
||||
| 12 | angle of the second rotor | -Inf | Inf | bshin | hinge | angular velocity (rad/s) |
|
||||
| 13 | velocity of the tip along the x-axis | -Inf | Inf | bfoot | hinge | angular velocity (rad/s) |
|
||||
| 14 | velocity of the tip along the y-axis | -Inf | Inf | fthigh | hinge |angular velocity (rad/s) |
|
||||
| 15 | angular velocity of front tip | -Inf | Inf | fshin | hinge | angular velocity (rad/s) |
|
||||
| 16 | angular velocity of second rotor | -Inf | Inf | ffoot | hinge | angular velocity (rad/s) |
|
||||
|
||||
### Rewards
|
||||
The reward consists of two parts:
|
||||
- *forward_reward*: A reward for moving forward, measured
as *`forward_reward_weight` * (x-coordinate after action - x-coordinate before action)/dt*. *dt* is
the time between actions and is dependent on the frame_skip parameter
(fixed to 5), where the frametime is 0.01 - making the
default *dt = 5 * 0.01 = 0.05*. This reward is positive if the cheetah
runs forward (to the right).
|
||||
- *ctrl_cost*: A cost for penalising the cheetah if it takes
|
||||
actions that are too large. It is measured as *`ctrl_cost_weight` *
|
||||
sum(action<sup>2</sup>)* where *`ctrl_cost_weight`* is a parameter set for the
|
||||
control and has a default value of 0.1
|
||||
|
||||
The total reward returned is ***reward*** *=* *forward_reward - ctrl_cost* and `info` will also contain the individual reward terms
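A condensed restatement of this computation (a sketch that mirrors the `step()` method of the new `half_cheetah_v4.py` later in this diff; the helper name is my own):

```
import numpy as np

def half_cheetah_reward(x_before, x_after, dt, action,
                        forward_reward_weight=1.0, ctrl_cost_weight=0.1):
    """Reward described above: forward progress minus a quadratic control penalty."""
    forward_reward = forward_reward_weight * (x_after - x_before) / dt
    ctrl_cost = ctrl_cost_weight * float(np.sum(np.square(action)))
    return forward_reward - ctrl_cost

# e.g. moving 0.05 m during dt = 0.05 s with a moderate 6-dim action:
print(half_cheetah_reward(0.0, 0.05, 0.05, np.full(6, 0.2)))
```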
|
||||
|
||||
### Starting State
|
||||
All observations start in state (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
|
||||
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,) with a noise added to the
|
||||
initial state for stochasticity. As seen before, the first 8 values in the
|
||||
state are positional and the last 9 values are velocity. A uniform noise in
|
||||
the range of [-`reset_noise_scale`, `reset_noise_scale`] is added to the positional values while a standard
|
||||
normal noise with a mean of 0 and standard deviation of `reset_noise_scale` is added to the
|
||||
initial velocity values of all zeros.
|
||||
|
||||
### Episode Termination
|
||||
The episode terminates when the episode length is greater than 1000.
|
||||
|
||||
### Arguments
|
||||
|
||||
No additional arguments are currently supported in v2 and lower.
|
||||
|
||||
```
|
||||
env = gym.make('HalfCheetah-v2')
|
||||
```
|
||||
|
||||
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||
|
||||
```
|
||||
env = gym.make('HalfCheetah-v3', ctrl_cost_weight=0.1, ....)
|
||||
```
|
||||
|
||||
| Parameter | Type | Default |Description |
|
||||
|-------------------------|------------|----------------------|-------------------------------|
|
||||
| `xml_file` | **str** | `"half_cheetah.xml"` | Path to a MuJoCo model |
|
||||
| `forward_reward_weight` | **float** | `1.0` | Weight for *forward_reward* term (see section on reward) |
|
||||
| `ctrl_cost_weight` | **float** | `0.1` | Weight for *ctrl_cost* weight (see section on reward) |
|
||||
| `reset_noise_scale` | **float** | `0.1` | Scale of random perturbations of initial position and velocity (see section on Starting State) |
|
||||
| `exclude_current_positions_from_observation`| **bool** | `True` | Whether or not to omit the x-coordinate from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies |
|
||||
|
||||
|
||||
### Version History
|
||||
|
||||
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
||||
* v0: Initial versions release (1.0.0)
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
xml_file="half_cheetah.xml",
|
||||
@@ -149,7 +30,7 @@ class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
exclude_current_positions_from_observation
|
||||
)
|
||||
|
||||
mujoco_env.MujocoEnv.__init__(self, xml_file, 5)
|
||||
mujoco_env.MujocoEnv.__init__(self, xml_file, 5, mujoco_bindings="mujoco_py")
|
||||
|
||||
def control_cost(self, action):
|
||||
control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
|
||||
|
230
gym/envs/mujoco/half_cheetah_v4.py
Normal file
230
gym/envs/mujoco/half_cheetah_v4.py
Normal file
@@ -0,0 +1,230 @@
|
||||
__credits__ = ["Rushiv Arora"]
|
||||
import numpy as np
|
||||
|
||||
from gym import utils
|
||||
from gym.envs.mujoco import mujoco_env
|
||||
|
||||
DEFAULT_CAMERA_CONFIG = {
|
||||
"distance": 4.0,
|
||||
}
|
||||
|
||||
|
||||
class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
"""
|
||||
### Description
|
||||
|
||||
This environment is based on the work by P. Wawrzyński in
|
||||
["A Cat-Like Robot Real-Time Learning to Run"](http://staff.elka.pw.edu.pl/~pwawrzyn/pub-s/0812_LSCLRR.pdf).
|
||||
The HalfCheetah is a 2-dimensional robot consisting of 9 links and 8
|
||||
joints connecting them (including two paws). The goal is to apply a torque
|
||||
on the joints to make the cheetah run forward (right) as fast as possible,
|
||||
with a positive reward allocated based on the distance moved forward and a
|
||||
negative reward allocated for moving backward. The torso and head of the
|
||||
cheetah are fixed, and the torque can only be applied on the other 6 joints
|
||||
over the front and back thighs (connecting to the torso), shins
|
||||
(connecting to the thighs) and feet (connecting to the shins).
|
||||
|
||||
### Action Space
|
||||
The agent take a 6-element vector for actions.
|
||||
The action space is a continuous `(action, action, action, action, action, action)` all in `[-1.0, 1.0]`, where `action` represents the numerical torques applied between *links*
|
||||
|
||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||
|-------|--------------------------------------|---------------|----------------|---------------------------------------|-------|------|
|
||||
| 0 | Torque applied on the back thigh rotor | -1 | 1 | bthigh | hinge | torque (N m) |
|
||||
| 1 | Torque applied on the back shin rotor | -1 | 1 | bshin | hinge | torque (N m) |
|
||||
| 2 | Torque applied on the back foot rotor | -1 | 1 | bfoot | hinge | torque (N m) |
|
||||
| 3 | Torque applied on the front thigh rotor| -1 | 1 | fthigh | hinge | torque (N m) |
|
||||
| 4 | Torque applied on the front shin rotor | -1 | 1 | fshin | hinge | torque (N m) |
|
||||
| 5 | Torque applied on the front foot rotor | -1 | 1 | ffoot | hinge | torque (N m) |
|
||||
|
||||
### Observation Space
|
||||
|
||||
The state space consists of positional values of different body parts of the
|
||||
cheetah, followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities.
|
||||
|
||||
The full state is a `ndarray` with shape `(18,)` where the elements correspond to the following:

| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
| 0 | x-coordinate of the center of mass | -Inf | Inf | rootx | slide | position (m) |
| 1 | y-coordinate of the center of mass | -Inf | Inf | rootz | slide | position (m) |
| 2 | angle of the front tip | -Inf | Inf | rooty | hinge | angle (rad) |
| 3 | angle of the back thigh rotor | -Inf | Inf | bthigh | hinge | angle (rad) |
| 4 | angle of the back shin rotor | -Inf | Inf | bshin | hinge | angle (rad) |
| 5 | angle of the back foot rotor | -Inf | Inf | bfoot | hinge | angle (rad) |
| 6 | angle of the front thigh rotor | -Inf | Inf | fthigh | hinge | angle (rad) |
| 7 | angle of the front shin rotor | -Inf | Inf | fshin | hinge | angle (rad) |
| 8 | angle of the front foot rotor | -Inf | Inf | ffoot | hinge | angle (rad) |
| 9 | x-coordinate velocity of the center of mass | -Inf | Inf | rootx | slide | velocity (m/s) |
| 10 | y-coordinate velocity of the center of mass | -Inf | Inf | rootz | slide | velocity (m/s) |
| 11 | angular velocity of the front tip | -Inf | Inf | rooty | hinge | angular velocity (rad/s) |
| 12 | angular velocity of the back thigh rotor | -Inf | Inf | bthigh | hinge | angular velocity (rad/s) |
| 13 | angular velocity of the back shin rotor | -Inf | Inf | bshin | hinge | angular velocity (rad/s) |
| 14 | angular velocity of the back foot rotor | -Inf | Inf | bfoot | hinge | angular velocity (rad/s) |
| 15 | angular velocity of the front thigh rotor | -Inf | Inf | fthigh | hinge | angular velocity (rad/s) |
| 16 | angular velocity of the front shin rotor | -Inf | Inf | fshin | hinge | angular velocity (rad/s) |
| 17 | angular velocity of the front foot rotor | -Inf | Inf | ffoot | hinge | angular velocity (rad/s) |
|
||||
|
||||
|
||||
**Note:**
|
||||
In practice (and Gym implementation), the first positional element is
|
||||
omitted from the state space since the reward function is calculated based
|
||||
on that value. This value is hidden from the algorithm, which in turn has
|
||||
to develop an abstract understanding of it from the observed rewards.
|
||||
Therefore, the observation space has shape `(17,)` and looks like:
|
||||
|
||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
||||
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
||||
| 0 | y-coordinate of the front tip | -Inf | Inf | rootz | slide | position (m) |
|
||||
| 1 | angle of the front tip | -Inf | Inf | rooty | hinge | angle (rad) |
|
||||
| 2 | angle of the second rotor | -Inf | Inf | bthigh | hinge | angle (rad) |
|
||||
| 3 | angle of the second rotor | -Inf | Inf | bshin | hinge | angle (rad) |
|
||||
| 4 | velocity of the tip along the x-axis | -Inf | Inf | bfoot | hinge | angle (rad) |
|
||||
| 5 | velocity of the tip along the y-axis | -Inf | Inf | fthigh | hinge | angle (rad) |
|
||||
| 6 | angular velocity of front tip | -Inf | Inf | fshin | hinge | angle (rad) |
|
||||
| 7 | angular velocity of second rotor | -Inf | Inf | ffoot | hinge | angle (rad) |
|
||||
| 8 | x-coordinate of the front tip | -Inf | Inf | rootx | slide | velocity (m/s) |
|
||||
| 9 | y-coordinate of the front tip | -Inf | Inf | rootz | slide | velocity (m/s) |
|
||||
| 10 | angle of the front tip | -Inf | Inf | rooty | hinge | angular velocity (rad/s) |
|
||||
| 11 | angle of the second rotor | -Inf | Inf | bthigh | hinge | angular velocity (rad/s) |
|
||||
| 12 | angle of the second rotor | -Inf | Inf | bshin | hinge | angular velocity (rad/s) |
|
||||
| 13 | velocity of the tip along the x-axis | -Inf | Inf | bfoot | hinge | angular velocity (rad/s) |
|
||||
| 14 | velocity of the tip along the y-axis | -Inf | Inf | fthigh | hinge |angular velocity (rad/s) |
|
||||
| 15 | angular velocity of front tip | -Inf | Inf | fshin | hinge | angular velocity (rad/s) |
|
||||
| 16 | angular velocity of second rotor | -Inf | Inf | ffoot | hinge | angular velocity (rad/s) |
|
||||
|
||||
### Rewards
|
||||
The reward consists of two parts:
|
||||
- *reward_run*: A reward for moving forward, measured
as *(x-coordinate after action - x-coordinate before action)/dt*. *dt* is
the time between actions and is dependent on the frame_skip parameter
(default is 5), where the *dt* for one frame is 0.01 - making the
default *dt = 5*0.01 = 0.05*. This reward is positive if the cheetah
runs forward (to the right), as desired.
|
||||
- *reward_control*: A negative reward for penalising the cheetah if it takes
|
||||
actions that are too large. It is measured as *-coefficient x
|
||||
sum(action<sup>2</sup>)* where *coefficient* is a parameter set for the
|
||||
control and has a default value of 0.1
|
||||
|
||||
The total reward returned is ***reward*** *=* *reward_run + reward_control*
|
||||
|
||||
### Starting State
|
||||
All observations start in state (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
|
||||
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,) with a noise added to the
|
||||
initial state for stochasticity. As seen before, the first 8 values in the
|
||||
state are positional and the last 9 values are velocity. A uniform noise in
|
||||
the range of [-0.1, 0.1] is added to the positional values while a standard
|
||||
normal noise with a mean of 0 and standard deviation of 0.1 is added to the
|
||||
initial velocity values of all zeros.
|
||||
|
||||
### Episode Termination
|
||||
The episode terminates when the episode length is greater than 1000.
|
||||
|
||||
### Arguments
|
||||
|
||||
No additional arguments are currently supported (in v2 and lower), but
|
||||
modifications can be made to the XML file in the assets folder at
|
||||
`gym/envs/mujoco/assets/half_cheetah.xml` (or by changing the path to a
|
||||
modified XML file in another folder).
|
||||
|
||||
```
|
||||
env = gym.make('HalfCheetah-v2')
|
||||
```
|
||||
|
||||
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||
|
||||
```
|
||||
env = gym.make('HalfCheetah-v3', ctrl_cost_weight=0.1, ....)
|
||||
```
|
||||
|
||||
|
||||
### Version History
|
||||
|
||||
* v4: all mujoco environments now use the mujoco bindings in mujoco>=2.1.3
|
||||
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
||||
* v0: Initial versions release (1.0.0)
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
xml_file="half_cheetah.xml",
|
||||
forward_reward_weight=1.0,
|
||||
ctrl_cost_weight=0.1,
|
||||
reset_noise_scale=0.1,
|
||||
exclude_current_positions_from_observation=True,
|
||||
):
|
||||
utils.EzPickle.__init__(**locals())
|
||||
|
||||
self._forward_reward_weight = forward_reward_weight
|
||||
|
||||
self._ctrl_cost_weight = ctrl_cost_weight
|
||||
|
||||
self._reset_noise_scale = reset_noise_scale
|
||||
|
||||
self._exclude_current_positions_from_observation = (
|
||||
exclude_current_positions_from_observation
|
||||
)
|
||||
|
||||
mujoco_env.MujocoEnv.__init__(self, xml_file, 5)
|
||||
|
||||
def control_cost(self, action):
|
||||
control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
|
||||
return control_cost
|
||||
|
||||
def step(self, action):
|
||||
x_position_before = self.data.qpos[0]
|
||||
self.do_simulation(action, self.frame_skip)
|
||||
x_position_after = self.data.qpos[0]
|
||||
x_velocity = (x_position_after - x_position_before) / self.dt
|
||||
|
||||
ctrl_cost = self.control_cost(action)
|
||||
|
||||
forward_reward = self._forward_reward_weight * x_velocity
|
||||
|
||||
observation = self._get_obs()
|
||||
reward = forward_reward - ctrl_cost
|
||||
done = False
|
||||
info = {
|
||||
"x_position": x_position_after,
|
||||
"x_velocity": x_velocity,
|
||||
"reward_run": forward_reward,
|
||||
"reward_ctrl": -ctrl_cost,
|
||||
}
|
||||
|
||||
return observation, reward, done, info
|
||||
|
||||
def _get_obs(self):
|
||||
position = self.data.qpos.flat.copy()
|
||||
velocity = self.data.qvel.flat.copy()
|
||||
|
||||
if self._exclude_current_positions_from_observation:
|
||||
position = position[1:]
|
||||
|
||||
observation = np.concatenate((position, velocity)).ravel()
|
||||
return observation
|
||||
|
||||
def reset_model(self):
|
||||
noise_low = -self._reset_noise_scale
|
||||
noise_high = self._reset_noise_scale
|
||||
|
||||
qpos = self.init_qpos + self.np_random.uniform(
|
||||
low=noise_low, high=noise_high, size=self.model.nq
|
||||
)
|
||||
qvel = (
|
||||
self.init_qvel
|
||||
+ self._reset_noise_scale * self.np_random.standard_normal(self.model.nv)
|
||||
)
|
||||
|
||||
self.set_state(qpos, qvel)
|
||||
|
||||
observation = self._get_obs()
|
||||
return observation
|
||||
|
||||
def viewer_setup(self):
|
||||
for key, value in DEFAULT_CAMERA_CONFIG.items():
|
||||
if isinstance(value, np.ndarray):
|
||||
getattr(self.viewer.cam, key)[:] = value
|
||||
else:
|
||||
setattr(self.viewer.cam, key, value)
|
@@ -6,7 +6,9 @@ from gym.envs.mujoco import mujoco_env
|
||||
|
||||
class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
def __init__(self):
|
||||
mujoco_env.MujocoEnv.__init__(self, "hopper.xml", 4)
|
||||
mujoco_env.MujocoEnv.__init__(
|
||||
self, "hopper.xml", 4, mujoco_bindings="mujoco_py"
|
||||
)
|
||||
utils.EzPickle.__init__(self)
|
||||
|
||||
def step(self, a):
|
||||
|
@@ -14,130 +14,6 @@ DEFAULT_CAMERA_CONFIG = {
|
||||
|
||||
|
||||
class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
"""
|
||||
### Description
|
||||
|
||||
This environment is based on the work done by Erez, Tassa, and Todorov in
|
||||
["Infinite Horizon Model Predictive Control for Nonlinear Periodic Tasks"](http://www.roboticsproceedings.org/rss07/p10.pdf). The environment aims to
|
||||
increase the number of independent state and control variables as compared to
|
||||
the classic control environments. The hopper is a two-dimensional
|
||||
one-legged figure that consists of four main body parts - the torso at the
|
||||
top, the thigh in the middle, the leg in the bottom, and a single foot on
|
||||
which the entire body rests. The goal is to make hops that move in the
|
||||
forward (right) direction by applying torques on the three hinges
|
||||
connecting the four body parts.
|
||||
|
||||
### Action Space
|
||||
The action space is a `Box(-1, 1, (3,), float32)`. An action represents the torques applied between *links*
|
||||
|
||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||
|-----|------------------------------------|-------------|-------------|----------------------------------|-------|--------------|
|
||||
| 0 | Torque applied on the thigh rotor | -1 | 1 | thigh_joint | hinge | torque (N m) |
|
||||
| 1 | Torque applied on the leg rotor | -1 | 1 | leg_joint | hinge | torque (N m) |
|
||||
| 2 | Torque applied on the foot rotor | -1 | 1 | foot_joint | hinge | torque (N m) |
|
||||
|
||||
### Observation Space
|
||||
|
||||
Observations consist of positional values of different body parts of the
|
||||
hopper, followed by the velocities of those individual parts
|
||||
(their derivatives) with all the positions ordered before all the velocities.
|
||||
|
||||
By default, observations do not include the x-coordinate of the hopper. It may
|
||||
be included by passing `exclude_current_positions_from_observation=False` during construction.
|
||||
In that case, the observation space will have 12 dimensions where the first dimension
|
||||
represents the x-coordinate of the hopper.
|
||||
Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x-coordinate
|
||||
will be returned in `info` with key `"x_position"`.
|
||||
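A short check of how this flag changes the observation layout (the shapes follow directly from the description above):

```
import gym

env = gym.make('Hopper-v3', exclude_current_positions_from_observation=False)
print(env.observation_space.shape)  # (12,) - the x-coordinate is element 0

env = gym.make('Hopper-v3')
print(env.observation_space.shape)  # (11,) - the x-coordinate is omitted
```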
|
||||
However, by default, the observation is a `ndarray` with shape `(11,)` where the elements
|
||||
correspond to the following:
|
||||
|
||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
||||
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
||||
| 0 | z-coordinate of the top (height of hopper) | -Inf | Inf | rootz | slide | position (m) |
|
||||
| 1 | angle of the top | -Inf | Inf | rooty | hinge | angle (rad) |
|
||||
| 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) |
|
||||
| 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) |
|
||||
| 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) |
|
||||
| 5 | velocity of the x-coordinate of the top | -Inf | Inf | rootx | slide | velocity (m/s) |
|
||||
| 6 | velocity of the z-coordinate (height) of the top | -Inf | Inf | rootz | slide | velocity (m/s) |
|
||||
| 7 | angular velocity of the angle of the top | -Inf | Inf | rooty | hinge | angular velocity (rad/s) |
|
||||
| 8 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) |
|
||||
| 9 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) |
|
||||
| 10 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) |
|
||||
|
||||
### Rewards
|
||||
The reward consists of three parts:
|
||||
- *healthy_reward*: Every timestep that the hopper is healthy (see definition in section "Episode Termination"), it gets a reward of fixed value `healthy_reward`.
|
||||
- *forward_reward*: A reward of hopping forward which is measured
|
||||
as *`forward_reward_weight` * (x-coordinate after action - x-coordinate before action)/dt*. *dt* is
|
||||
the time between actions and is dependent on the frame_skip parameter
|
||||
(fixed to 4), where the frametime is 0.002 - making the
|
||||
default *dt = 4 * 0.002 = 0.008*. This reward would be positive if the hopper
|
||||
hops forward (positive x direction).
|
||||
- *ctrl_cost*: A cost for penalising the hopper if it takes
|
||||
actions that are too large. It is measured as *`ctrl_cost_weight` *
|
||||
sum(action<sup>2</sup>)* where *`ctrl_cost_weight`* is a parameter set for the
|
||||
control and has a default value of 0.001
|
||||
|
||||
The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost* and `info` will also contain the individual reward terms
|
||||
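To make the arithmetic concrete, a hand-worked step with the default weights (`forward_reward_weight=1.0`, `ctrl_cost_weight=0.001`, `healthy_reward=1.0`) could look as follows; the displacement and action values are invented purely for illustration:

```
import numpy as np

dt = 4 * 0.002                        # frame_skip * frametime = 0.008
x_before, x_after = 0.000, 0.004      # hypothetical x-coordinates of the torso
action = np.array([0.2, -0.1, 0.3])   # hypothetical torques

forward_reward = 1.0 * (x_after - x_before) / dt      # 0.5
ctrl_cost = 0.001 * np.sum(np.square(action))          # 0.00014
healthy_reward = 1.0

reward = healthy_reward + forward_reward - ctrl_cost   # ~1.49986
```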
|
||||
### Starting State
|
||||
All observations start in state
|
||||
(0.0, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) with a uniform noise
|
||||
in the range of [-`reset_noise_scale`, `reset_noise_scale`] added to the values for stochasticity.
|
||||
|
||||
### Episode Termination
|
||||
The hopper is said to be unhealthy if any of the following happens:
|
||||
|
||||
1. An element of `observation[1:]` (if `exclude_current_positions_from_observation=True`, else `observation[2:]`) is no longer contained in the closed interval specified by the argument `healthy_state_range`
|
||||
2. The height of the hopper (`observation[0]` if `exclude_current_positions_from_observation=True`, else `observation[1]`) is no longer contained in the closed interval specified by the argument `healthy_z_range` (usually meaning that it has fallen)
|
||||
3. The angle (`observation[1]` if `exclude_current_positions_from_observation=True`, else `observation[2]`) is no longer contained in the closed interval specified by the argument `healthy_angle_range`
|
||||
|
||||
If `terminate_when_unhealthy=True` is passed during construction (which is the default),
|
||||
the episode terminates when any of the following happens:
|
||||
|
||||
1. The episode duration reaches 1000 timesteps
|
||||
2. The hopper is unhealthy
|
||||
|
||||
If `terminate_when_unhealthy=False` is passed, the episode is terminated only when 1000 timesteps are exceeded.
|
||||
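The health conditions listed above are simple range checks; a standalone sketch of the same logic (function and variable names here are illustrative, with the default ranges from the parameter table below) is:

```
import numpy as np

def hopper_is_healthy(z, angle, rest_of_state,
                      healthy_state_range=(-100.0, 100.0),
                      healthy_z_range=(0.7, float("inf")),
                      healthy_angle_range=(-0.2, 0.2)):
    # All three conditions must hold for the hopper to count as healthy.
    min_state, max_state = healthy_state_range
    min_z, max_z = healthy_z_range
    min_angle, max_angle = healthy_angle_range

    healthy_state = np.all(
        np.logical_and(min_state < rest_of_state, rest_of_state < max_state)
    )
    healthy_z = min_z < z < max_z
    healthy_angle = min_angle < angle < max_angle
    return bool(healthy_state and healthy_z and healthy_angle)
```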
|
||||
### Arguments
|
||||
|
||||
No additional arguments are currently supported in v2 and lower.
|
||||
|
||||
```
|
||||
env = gym.make('Hopper-v2')
|
||||
```
|
||||
|
||||
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||
|
||||
```
|
||||
env = gym.make('Hopper-v3', ctrl_cost_weight=0.1, ....)
|
||||
```
|
||||
|
||||
| Parameter | Type | Default |Description |
|
||||
|-------------------------|------------|----------------|-------------------------------|
|
||||
| `xml_file` | **str** | `"hopper.xml"` | Path to a MuJoCo model |
|
||||
| `forward_reward_weight` | **float** | `1.0` | Weight for *forward_reward* term (see section on reward) |
|
||||
| `ctrl_cost_weight` | **float** | `0.001` | Weight for *ctrl_cost* reward (see section on reward) |
|
||||
| `healthy_reward` | **float** | `1` | Constant reward given if the hopper is "healthy" after timestep |
|
||||
| `terminate_when_unhealthy` | **bool**| `True` | If true, issue a done signal if the hopper is no longer healthy |
|
||||
| `healthy_state_range` | **tuple** | `(-100, 100)` | The elements of `observation[1:]` (if `exclude_current_positions_from_observation=True`, else `observation[2:]`) must be in this range for the hopper to be considered healthy |
|
||||
| `healthy_z_range` | **tuple** | `(0.7, float("inf"))` | The z-coordinate must be in this range for the hopper to be considered healthy |
|
||||
| `healthy_angle_range` | **tuple** | `(-0.2, 0.2)` | The angle given by `observation[1]` (if `exclude_current_positions_from_observation=True`, else `observation[2]`) must be in this range for the hopper to be considered healthy |
|
||||
| `reset_noise_scale` | **float** | `5e-3` | Scale of random perturbations of initial position and velocity (see section on Starting State) |
|
||||
| `exclude_current_positions_from_observation`| **bool** | `True`| Whether or not to omit the x-coordinate from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies |
|
||||
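For example, several of the parameters in the table above can be overridden in a single `gym.make` call (the particular values are arbitrary):

```
import gym

env = gym.make(
    'Hopper-v3',
    healthy_z_range=(0.5, float('inf')),
    reset_noise_scale=1e-2,
    terminate_when_unhealthy=False,
)
```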
|
||||
|
||||
### Version History
|
||||
|
||||
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
||||
* v0: Initial versions release (1.0.0)
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
xml_file="hopper.xml",
|
||||
@@ -170,7 +46,7 @@ class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
exclude_current_positions_from_observation
|
||||
)
|
||||
|
||||
mujoco_env.MujocoEnv.__init__(self, xml_file, 4)
|
||||
mujoco_env.MujocoEnv.__init__(self, xml_file, 4, mujoco_bindings="mujoco_py")
|
||||
|
||||
@property
|
||||
def healthy_reward(self):
|
||||
|
263
gym/envs/mujoco/hopper_v4.py
Normal file
@@ -0,0 +1,263 @@
|
||||
import numpy as np
|
||||
|
||||
from gym import utils
|
||||
from gym.envs.mujoco import mujoco_env
|
||||
|
||||
DEFAULT_CAMERA_CONFIG = {
|
||||
"trackbodyid": 2,
|
||||
"distance": 3.0,
|
||||
"lookat": np.array((0.0, 0.0, 1.15)),
|
||||
"elevation": -20.0,
|
||||
}
|
||||
|
||||
|
||||
class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
"""
|
||||
### Description
|
||||
|
||||
This environment is based on the work done by Erez, Tassa, and Todorov in
|
||||
["Infinite Horizon Model Predictive Control for Nonlinear Periodic Tasks"](http://www.roboticsproceedings.org/rss07/p10.pdf). The environment aims to
|
||||
increase the number of independent state and control variables as compared to
|
||||
the classic control environments. The hopper is a two-dimensional
|
||||
one-legged figure that consists of four main body parts - the torso at the
|
||||
top, the thigh in the middle, the leg in the bottom, and a single foot on
|
||||
which the entire body rests. The goal is to make hops that move in the
|
||||
forward (right) direction by applying torques on the three hinges
|
||||
connecting the four body parts.
|
||||
|
||||
### Action Space
|
||||
The agent takes a 3-element vector for actions.
|
||||
The action space is a continuous `(action, action, action)` all in `[-1, 1]`,
|
||||
where `action` represents the numerical torques applied between *links*.
|
||||
|
||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||
|-----|------------------------------------|-------------|-------------|----------------------------------|-------|--------------|
|
||||
| 0 | Torque applied on the thigh rotor | -1 | 1 | thigh_joint | hinge | torque (N m) |
|
||||
| 1 | Torque applied on the leg rotor | -1 | 1 | leg_joint | hinge | torque (N m) |
|
||||
| 2 | Torque applied on the foot rotor | -1 | 1 | foot_joint | hinge | torque (N m) |
|
||||
|
||||
### Observation Space
|
||||
|
||||
The state space consists of positional values of different body parts of the
|
||||
hopper, followed by the velocities of those individual parts
|
||||
(their derivatives) with all the positions ordered before all the velocities.
|
||||
|
||||
The observation is a `ndarray` with shape `(11,)` where the elements
|
||||
correspond to the following:
|
||||
|
||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
||||
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
||||
| 0 | x-coordinate of the top | -Inf | Inf | rootx | slide | position (m) |
|
||||
| 1 | z-coordinate of the top (height of hopper) | -Inf | Inf | rootz | slide | position (m) |
|
||||
| 2 | angle of the top | -Inf | Inf | rooty | hinge | angle (rad) |
|
||||
| 3 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) |
|
||||
| 4 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) |
|
||||
| 5 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) |
|
||||
| 6 | velocity of the x-coordinate of the top | -Inf | Inf | rootx | slide | velocity (m/s) |
|
||||
| 7 | velocity of the z-coordinate (height) of the top | -Inf | Inf | rootz | slide | velocity (m/s) |
|
||||
| 8 | angular velocity of the angle of the top | -Inf | Inf | rooty | hinge | angular velocity (rad/s) |
|
||||
| 9 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) |
|
||||
| 10 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) |
|
||||
| 11 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) |
|
||||
|
||||
|
||||
|
||||
**Note:**
|
||||
In practice (and Gym implementation), the first positional element is
|
||||
omitted from the state space since the reward function is calculated based
|
||||
on that value. This value is hidden from the algorithm, which in turn has
|
||||
to develop an abstract understanding of it from the observed rewards.
|
||||
Therefore, observation space has shape `(11,)` instead of `(12,)` and looks like:
|
||||
|
||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
||||
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
||||
| 0 | z-coordinate of the top (height of hopper) | -Inf | Inf | rootz | slide | position (m) |
|
||||
| 1 | angle of the top | -Inf | Inf | rooty | hinge | angle (rad) |
|
||||
| 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) |
|
||||
| 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) |
|
||||
| 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) |
|
||||
| 5 | velocity of the x-coordinate of the top | -Inf | Inf | rootx | slide | velocity (m/s) |
|
||||
| 6 | velocity of the z-coordinate (height) of the top | -Inf | Inf | rootz | slide | velocity (m/s) |
|
||||
| 7 | angular velocity of the angle of the top | -Inf | Inf | rooty | hinge | angular velocity (rad/s) |
|
||||
| 8 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) |
|
||||
| 9 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) |
|
||||
| 10 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) |
|
||||
|
||||
### Rewards
|
||||
The reward consists of three parts:
|
||||
- *alive bonus*: Every timestep that the hopper is alive, it gets a reward of 1,
|
||||
- *reward_forward*: A reward of hopping forward which is measured
|
||||
as *(x-coordinate after action - x-coordinate before action)/dt*. *dt* is
|
||||
the time between actions and is dependent on the frame_skip parameter
|
||||
(default is 4), where the *dt* for one frame is 0.002 - making the
|
||||
default *dt = 4*0.002 = 0.008*. This reward would be positive if the hopper
|
||||
hops forward (in the positive x direction), as desired.
|
||||
- *reward_control*: A negative reward for penalising the hopper if it takes
|
||||
actions that are too large. It is measured as *-coefficient **x**
|
||||
sum(action<sup>2</sup>)* where *coefficient* is a parameter set for the
|
||||
control and has a default value of 0.001
|
||||
|
||||
The total reward returned is ***reward*** *=* *alive bonus + reward_forward + reward_control*
|
||||
|
||||
### Starting State
|
||||
All observations start in state
|
||||
(0.0, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) with a uniform noise
|
||||
in the range of [-0.005, 0.005] added to the values for stochasticity.
|
||||
|
||||
### Episode Termination
|
||||
The episode terminates when any of the following happens:
|
||||
|
||||
1. The episode duration reaches 1000 timesteps
|
||||
2. Any of the state space values is no longer finite
|
||||
3. The absolute value of any of the state variables (from the angle, index 2, onwards) is greater than 100
|
||||
4. The height of the hopper drops below 0.7 metres (the hopper has fallen).
|
||||
5. The absolute value of the angle (index 2) is greater than 0.2 radians (the hopper has fallen over).
|
||||
|
||||
### Arguments
|
||||
|
||||
No additional arguments are currently supported (in v2 and lower), but
|
||||
modifications can be made to the XML file in the assets folder
|
||||
(or by changing the path to a modified XML file in another folder).
|
||||
|
||||
```
|
||||
env = gym.make('Hopper-v2')
|
||||
```
|
||||
|
||||
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||
|
||||
```
|
||||
env = gym.make('Hopper-v3', ctrl_cost_weight=0.1, ....)
|
||||
```
|
||||
|
||||
### Version History
|
||||
|
||||
* v4: all mujoco environments now use the mujoco bindings in mujoco>=2.1.3
|
||||
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
||||
* v0: Initial versions release (1.0.0)
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
xml_file="hopper.xml",
|
||||
forward_reward_weight=1.0,
|
||||
ctrl_cost_weight=1e-3,
|
||||
healthy_reward=1.0,
|
||||
terminate_when_unhealthy=True,
|
||||
healthy_state_range=(-100.0, 100.0),
|
||||
healthy_z_range=(0.7, float("inf")),
|
||||
healthy_angle_range=(-0.2, 0.2),
|
||||
reset_noise_scale=5e-3,
|
||||
exclude_current_positions_from_observation=True,
|
||||
):
|
||||
utils.EzPickle.__init__(**locals())
|
||||
|
||||
self._forward_reward_weight = forward_reward_weight
|
||||
|
||||
self._ctrl_cost_weight = ctrl_cost_weight
|
||||
|
||||
self._healthy_reward = healthy_reward
|
||||
self._terminate_when_unhealthy = terminate_when_unhealthy
|
||||
|
||||
self._healthy_state_range = healthy_state_range
|
||||
self._healthy_z_range = healthy_z_range
|
||||
self._healthy_angle_range = healthy_angle_range
|
||||
|
||||
self._reset_noise_scale = reset_noise_scale
|
||||
|
||||
self._exclude_current_positions_from_observation = (
|
||||
exclude_current_positions_from_observation
|
||||
)
|
||||
|
||||
mujoco_env.MujocoEnv.__init__(self, xml_file, 4)
|
||||
|
||||
@property
|
||||
def healthy_reward(self):
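# When terminate_when_unhealthy is True this is always healthy_reward (the
# episode ends on the first unhealthy step anyway); otherwise the bonus is
# only paid while the hopper is healthy.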
|
||||
return (
|
||||
float(self.is_healthy or self._terminate_when_unhealthy)
|
||||
* self._healthy_reward
|
||||
)
|
||||
|
||||
def control_cost(self, action):
|
||||
control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
|
||||
return control_cost
|
||||
|
||||
@property
|
||||
def is_healthy(self):
|
||||
z, angle = self.data.qpos[1:3]
|
||||
state = self.state_vector()[2:]
|
||||
|
||||
min_state, max_state = self._healthy_state_range
|
||||
min_z, max_z = self._healthy_z_range
|
||||
min_angle, max_angle = self._healthy_angle_range
|
||||
|
||||
healthy_state = np.all(np.logical_and(min_state < state, state < max_state))
|
||||
healthy_z = min_z < z < max_z
|
||||
healthy_angle = min_angle < angle < max_angle
|
||||
|
||||
is_healthy = all((healthy_state, healthy_z, healthy_angle))
|
||||
|
||||
return is_healthy
|
||||
|
||||
@property
|
||||
def done(self):
|
||||
done = not self.is_healthy if self._terminate_when_unhealthy else False
|
||||
return done
|
||||
|
||||
def _get_obs(self):
|
||||
position = self.data.qpos.flat.copy()
|
||||
velocity = np.clip(self.data.qvel.flat.copy(), -10, 10)
|
||||
|
||||
if self._exclude_current_positions_from_observation:
|
||||
position = position[1:]
|
||||
|
||||
observation = np.concatenate((position, velocity)).ravel()
|
||||
return observation
|
||||
|
||||
def step(self, action):
|
||||
x_position_before = self.data.qpos[0]
|
||||
self.do_simulation(action, self.frame_skip)
|
||||
x_position_after = self.data.qpos[0]
|
||||
x_velocity = (x_position_after - x_position_before) / self.dt
|
||||
|
||||
ctrl_cost = self.control_cost(action)
|
||||
|
||||
forward_reward = self._forward_reward_weight * x_velocity
|
||||
healthy_reward = self.healthy_reward
|
||||
|
||||
rewards = forward_reward + healthy_reward
|
||||
costs = ctrl_cost
|
||||
|
||||
observation = self._get_obs()
|
||||
reward = rewards - costs
|
||||
done = self.done
|
||||
info = {
|
||||
"x_position": x_position_after,
|
||||
"x_velocity": x_velocity,
|
||||
}
|
||||
|
||||
return observation, reward, done, info
|
||||
|
||||
def reset_model(self):
|
||||
noise_low = -self._reset_noise_scale
|
||||
noise_high = self._reset_noise_scale
|
||||
|
||||
qpos = self.init_qpos + self.np_random.uniform(
|
||||
low=noise_low, high=noise_high, size=self.model.nq
|
||||
)
|
||||
qvel = self.init_qvel + self.np_random.uniform(
|
||||
low=noise_low, high=noise_high, size=self.model.nv
|
||||
)
|
||||
|
||||
self.set_state(qpos, qvel)
|
||||
|
||||
observation = self._get_obs()
|
||||
return observation
|
||||
|
||||
def viewer_setup(self):
|
||||
for key, value in DEFAULT_CAMERA_CONFIG.items():
|
||||
if isinstance(value, np.ndarray):
|
||||
getattr(self.viewer.cam, key)[:] = value
|
||||
else:
|
||||
setattr(self.viewer.cam, key, value)
|
@@ -12,7 +12,9 @@ def mass_center(model, sim):
|
||||
|
||||
class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
def __init__(self):
|
||||
mujoco_env.MujocoEnv.__init__(self, "humanoid.xml", 5)
|
||||
mujoco_env.MujocoEnv.__init__(
|
||||
self, "humanoid.xml", 5, mujoco_bindings="mujoco_py"
|
||||
)
|
||||
utils.EzPickle.__init__(self)
|
||||
|
||||
def _get_obs(self):
|
||||
|
@@ -18,198 +18,6 @@ def mass_center(model, sim):
|
||||
|
||||
|
||||
class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
"""
|
||||
### Description
|
||||
|
||||
This environment is based on the environment introduced by Tassa, Erez and Todorov
|
||||
in ["Synthesis and stabilization of complex behaviors through online trajectory optimization"](https://ieeexplore.ieee.org/document/6386025).
|
||||
The 3D bipedal robot is designed to simulate a human. It has a torso (abdomen) with a pair of
|
||||
legs and arms. The legs each consist of two links, and so do the arms (representing the knees and
|
||||
elbows respectively). The goal of the environment is to walk forward as fast as possible without falling over.
|
||||
|
||||
### Action Space
|
||||
The action space is a `Box(-1, 1, (17,), float32)`. An action represents the torques applied at the hinge joints.
|
||||
|
||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||
|-----|----------------------|---------------|----------------|---------------------------------------|-------|------|
|
||||
| 0 | Torque applied on the hinge in the y-coordinate of the abdomen | -0.4 | 0.4 | hip_1 (front_left_leg) | hinge | torque (N m) |
|
||||
| 1 | Torque applied on the hinge in the z-coordinate of the abdomen | -0.4 | 0.4 | angle_1 (front_left_leg) | hinge | torque (N m) |
|
||||
| 2 | Torque applied on the hinge in the x-coordinate of the abdomen | -0.4 | 0.4 | hip_2 (front_right_leg) | hinge | torque (N m) |
|
||||
| 3 | Torque applied on the rotor between torso/abdomen and the right hip (x-coordinate) | -0.4 | 0.4 | right_hip_x (right_thigh) | hinge | torque (N m) |
|
||||
| 4 | Torque applied on the rotor between torso/abdomen and the right hip (z-coordinate) | -0.4 | 0.4 | right_hip_z (right_thigh) | hinge | torque (N m) |
|
||||
| 5 | Torque applied on the rotor between torso/abdomen and the right hip (y-coordinate) | -0.4 | 0.4 | right_hip_y (right_thigh) | hinge | torque (N m) |
|
||||
| 6 | Torque applied on the rotor between the right hip/thigh and the right shin | -0.4 | 0.4 | right_knee | hinge | torque (N m) |
|
||||
| 7 | Torque applied on the rotor between torso/abdomen and the left hip (x-coordinate) | -0.4 | 0.4 | left_hip_x (left_thigh) | hinge | torque (N m) |
|
||||
| 8 | Torque applied on the rotor between torso/abdomen and the left hip (z-coordinate) | -0.4 | 0.4 | left_hip_z (left_thigh) | hinge | torque (N m) |
|
||||
| 9 | Torque applied on the rotor between torso/abdomen and the left hip (y-coordinate) | -0.4 | 0.4 | left_hip_y (left_thigh) | hinge | torque (N m) |
|
||||
| 10 | Torque applied on the rotor between the left hip/thigh and the left shin | -0.4 | 0.4 | left_knee | hinge | torque (N m) |
|
||||
| 11 | Torque applied on the rotor between the torso and right upper arm (coordinate -1) | -0.4 | 0.4 | right_shoulder1 | hinge | torque (N m) |
|
||||
| 12 | Torque applied on the rotor between the torso and right upper arm (coordinate -2) | -0.4 | 0.4 | right_shoulder2 | hinge | torque (N m) |
|
||||
| 13 | Torque applied on the rotor between the right upper arm and right lower arm | -0.4 | 0.4 | right_elbow | hinge | torque (N m) |
|
||||
| 14 | Torque applied on the rotor between the torso and left upper arm (coordinate -1) | -0.4 | 0.4 | left_shoulder1 | hinge | torque (N m) |
|
||||
| 15 | Torque applied on the rotor between the torso and left upper arm (coordinate -2) | -0.4 | 0.4 | left_shoulder2 | hinge | torque (N m) |
|
||||
| 16 | Torque applied on the rotor between the left upper arm and left lower arm | -0.4 | 0.4 | left_elbow | hinge | torque (N m) |
|
||||
|
||||
### Observation Space
|
||||
|
||||
Observations consist of positional values of different body parts of the Humanoid,
|
||||
followed by the velocities of those individual parts (their derivatives) with all the
|
||||
positions ordered before all the velocities.
|
||||
|
||||
By default, observations do not include the x- and y-coordinates of the torso. These may
|
||||
be included by passing `exclude_current_positions_from_observation=False` during construction.
|
||||
In that case, the observation space will have 378 dimensions where the first two dimensions
|
||||
represent the x- and y-coordinates of the torso.
|
||||
Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates
|
||||
will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively.
|
||||
|
||||
However, by default, the observation is a `ndarray` with shape `(376,)` where the elements correspond to the following:
|
||||
|
||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|
||||
|-----|---------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
|
||||
| 0 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) |
|
||||
| 1 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
||||
| 2 | y-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
||||
| 3 | z-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
||||
| 4 | w-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
||||
| 5 | z-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | angle (rad) |
|
||||
| 6 | y-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | angle (rad) |
|
||||
| 7 | x-angle of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angle (rad) |
|
||||
| 8 | x-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | angle (rad) |
|
||||
| 9 | z-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | angle (rad) |
|
||||
| 10 | y-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | angle (rad) |
|
||||
| 11 | angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | angle (rad) |
|
||||
| 12 | x-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | angle (rad) |
|
||||
| 13 | z-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | angle (rad) |
|
||||
| 14 | y-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | angle (rad) |
|
||||
| 15 | angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | angle (rad) |
|
||||
| 16 | coordinate-1 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | angle (rad) |
|
||||
| 17 | coordinate-2 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | angle (rad) |
|
||||
| 18 | angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | angle (rad) |
|
||||
| 19 | coordinate-1 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | angle (rad) |
|
||||
| 20 | coordinate-2 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | angle (rad) |
|
||||
| 21 | angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angle (rad) |
|
||||
| 22 | x-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
||||
| 23 | y-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
||||
| 24 | z-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
||||
| 25 | x-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
|
||||
| 26 | y-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
|
||||
| 27 | z-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
|
||||
| 28 | z-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | anglular velocity (rad/s) |
|
||||
| 29 | y-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | anglular velocity (rad/s) |
|
||||
| 30 | x-coordinate of angular velocity of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angular velocity (rad/s) |
|
||||
| 31 | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | anglular velocity (rad/s) |
|
||||
| 32 | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | anglular velocity (rad/s) |
|
||||
| 33 | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | anglular velocity (rad/s) |
|
||||
| 34 | angular velocity of the angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | anglular velocity (rad/s) |
|
||||
| 35 | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | anglular velocity (rad/s) |
|
||||
| 36 | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | anglular velocity (rad/s) |
|
||||
| 37 | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | anglular velocity (rad/s) |
|
||||
| 38 | angular velocity of the angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | anglular velocity (rad/s) |
|
||||
| 39 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | anglular velocity (rad/s) |
|
||||
| 40 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | anglular velocity (rad/s) |
|
||||
| 41 | angular velocity of the angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | anglular velocity (rad/s) |
|
||||
| 42 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | anglular velocity (rad/s) |
|
||||
| 43 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | anglular velocity (rad/s) |
|
||||
| 44 | angular velocity of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angular velocity (rad/s) |
|
||||
|
||||
Additionally, after all the positional and velocity based values in the table,
|
||||
the observation contains (in order):
|
||||
- *cinert:* Mass and inertia of a single rigid body relative to the center of mass
|
||||
(this is an intermediate result of transition). It has shape 14*10 (*nbody * 10*)
|
||||
and hence adds to another 140 elements in the state space.
|
||||
- *cvel:* Center of mass based velocity. It has shape 14 * 6 (*nbody * 6*) and hence
|
||||
adds another 84 elements in the state space
|
||||
- *qfrc_actuator:* Constraint force generated as the actuator force. This has shape
|
||||
`(23,)` *(nv * 1)* and hence adds another 23 elements to the state space.
|
||||
- *cfrc_ext:* This is the center of mass based external force on the body. It has shape
|
||||
14 * 6 (*nbody * 6*) and hence adds to another 84 elements in the state space.
|
||||
where *nbody* stands for the number of bodies in the robot and *nv* stands for the
|
||||
number of degrees of freedom (*= dim(qvel)*)
|
||||
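As a sanity check on the sizes quoted above, the default `(376,)` observation can be reconstructed by simple bookkeeping, taking *nbody* = 14 and *nv* = 23 as stated, 24 positional entries in total, and the two excluded x/y coordinates:

```
# qpos (24 - 2 excluded) + qvel (23) + cinert (140) + cvel (84)
# + qfrc_actuator (23) + cfrc_ext (84)
obs_dim = (24 - 2) + 23 + 14 * 10 + 14 * 6 + 23 + 14 * 6
assert obs_dim == 376
```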
|
||||
The (x,y,z) coordinates are translational DOFs while the orientations are rotational
|
||||
DOFs expressed as quaternions. One can read more about free joints on the
|
||||
[Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html).
|
||||
|
||||
**Note:** There have been reported issues that using a Mujoco-Py version > 2.0
|
||||
results in the contact forces always being 0. As such we recommend to use a Mujoco-Py
|
||||
version < 2.0 when using the Humanoid environment if you would like to report results
|
||||
with contact forces (if contact forces are not used in your experiments, you can use
|
||||
version > 2.0).
|
||||
|
||||
### Rewards
|
||||
The reward consists of three parts:
|
||||
- *healthy_reward*: Every timestep that the humanoid is alive (see section Episode Termination for definition), it gets a reward of fixed value `healthy_reward`
|
||||
- *forward_reward*: A reward of walking forward which is measured as *`forward_reward_weight` *
|
||||
(average center of mass after action - average center of mass before action)/dt*.
|
||||
*dt* is the time between actions and is dependent on the frame_skip parameter
|
||||
(default is 5), where the frametime is 0.003 - making the default *dt = 5 * 0.003 = 0.015*.
|
||||
This reward would be positive if the humanoid walks forward (in positive x-direction). The calculation
|
||||
for the center of mass is defined in the `.py` file for the Humanoid.
|
||||
- *ctrl_cost*: A negative reward for penalising the humanoid if it has too
|
||||
large of a control force. If there are *nu* actuators/controls, then the control has
|
||||
shape `nu x 1`. It is measured as *`ctrl_cost_weight` * sum(control<sup>2</sup>)*.
|
||||
- *contact_cost*: A negative reward for penalising the humanoid if the external
|
||||
contact force is too large. It is calculated by clipping
|
||||
*`contact_cost_weight` * sum(external contact force<sup>2</sup>)* to the interval specified by `contact_cost_range`.
|
||||
|
||||
The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost - contact_cost* and `info` will also contain the individual reward terms
|
||||
|
||||
### Starting State
|
||||
All observations start in state
|
||||
(0.0, 0.0, 1.4, 1.0, 0.0 ... 0.0) with a uniform noise in the range
|
||||
of [-`reset_noise_scale`, `reset_noise_scale`] added to the positional and velocity values (values in the table)
|
||||
for stochasticity. Note that the initial z coordinate is intentionally
|
||||
selected to be high, thereby indicating a standing up humanoid. The initial
|
||||
orientation is designed to make it face forward as well.
|
||||
|
||||
### Episode Termination
|
||||
The humanoid is said to be unhealthy if the z-position of the torso is no longer contained in the
|
||||
closed interval specified by the argument `healthy_z_range`.
|
||||
|
||||
If `terminate_when_unhealthy=True` is passed during construction (which is the default),
|
||||
the episode terminates when any of the following happens:
|
||||
|
||||
1. The episode duration reaches 1000 timesteps
|
||||
2. The humanoid is unhealthy
|
||||
|
||||
If `terminate_when_unhealthy=False` is passed, the episode is terminated only when 1000 timesteps are exceeded.
|
||||
|
||||
|
||||
### Arguments
|
||||
|
||||
No additional arguments are currently supported in v2 and lower.
|
||||
|
||||
```
|
||||
env = gym.make('Humanoid-v2')
|
||||
```
|
||||
|
||||
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||
|
||||
```
|
||||
env = gym.make('Humanoid-v3', ctrl_cost_weight=0.1, ....)
|
||||
```
|
||||
|
||||
| Parameter | Type | Default |Description |
|
||||
|-------------------------|------------|-------------------|-------------------------------|
|
||||
| `xml_file` | **str** | `"humanoid.xml"` | Path to a MuJoCo model |
|
||||
| `forward_reward_weight` | **float** | `1.25` | Weight for *forward_reward* term (see section on reward) |
|
||||
| `ctrl_cost_weight` | **float** | `0.1` | Weight for *ctrl_cost* term (see section on reward) |
|
||||
| `contact_cost_weight` | **float** | `5e-7` | Weight for *contact_cost* term (see section on reward) |
|
||||
| `healthy_reward` | **float** | `5.0` | Constant reward given if the humanoid is "healthy" after timestep |
|
||||
| `terminate_when_unhealthy` | **bool**| `True` | If true, issue a done signal if the z-coordinate of the torso is no longer in the `healthy_z_range` |
|
||||
| `healthy_z_range` | **tuple** | `(1.0, 2.0)` | The humanoid is considered healthy if the z-coordinate of the torso is in this range |
|
||||
| `reset_noise_scale` | **float** | `1e-2` | Scale of random perturbations of initial position and velocity (see section on Starting State) |
|
||||
| `exclude_current_positions_from_observation`| **bool** | `True`| Whether or not to omit the x- and y-coordinates from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies |
|
||||
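As with the other environments, the parameters in the table above can be passed directly to `gym.make` for v3 and beyond (the values shown are arbitrary illustrations):

```
import gym

env = gym.make(
    'Humanoid-v3',
    forward_reward_weight=1.0,
    healthy_z_range=(0.8, 2.1),
    exclude_current_positions_from_observation=False,
)
```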
|
||||
### Version History
|
||||
|
||||
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
||||
* v0: Initial versions release (1.0.0)
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
xml_file="humanoid.xml",
|
||||
@@ -239,7 +47,7 @@ class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
exclude_current_positions_from_observation
|
||||
)
|
||||
|
||||
mujoco_env.MujocoEnv.__init__(self, xml_file, 5)
|
||||
mujoco_env.MujocoEnv.__init__(self, xml_file, 5, mujoco_bindings="mujoco_py")
|
||||
|
||||
@property
|
||||
def healthy_reward(self):
|
||||
|
331
gym/envs/mujoco/humanoid_v4.py
Normal file
@@ -0,0 +1,331 @@
|
||||
import numpy as np
|
||||
|
||||
from gym import utils
|
||||
from gym.envs.mujoco import mujoco_env
|
||||
|
||||
DEFAULT_CAMERA_CONFIG = {
|
||||
"trackbodyid": 1,
|
||||
"distance": 4.0,
|
||||
"lookat": np.array((0.0, 0.0, 2.0)),
|
||||
"elevation": -20.0,
|
||||
}
|
||||
|
||||
|
||||
def mass_center(model, data):
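# Mass-weighted average of the per-body center-of-mass positions (data.xipos);
# only the (x, y) components are returned.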
|
||||
mass = np.expand_dims(model.body_mass, axis=1)
|
||||
xpos = data.xipos
|
||||
return (np.sum(mass * xpos, axis=0) / np.sum(mass))[0:2].copy()
|
||||
|
||||
|
||||
class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
"""
|
||||
### Description
|
||||
|
||||
This environment is based on the environment introduced by Tassa, Erez and Todorov
|
||||
in ["Synthesis and stabilization of complex behaviors through online trajectory optimization"](https://ieeexplore.ieee.org/document/6386025).
|
||||
The 3D bipedal robot is designed to simulate a human. It has a torso (abdomen) with a pair of
|
||||
legs and arms. The legs each consist of two links, and so do the arms (representing the knees and
|
||||
elbows respectively). The goal of the environment is to walk forward as fast as possible without falling over.
|
||||
|
||||
### Action Space
|
||||
The agent takes a 17-element vector for actions.
|
||||
The action space is a continuous `(action, ...)` all in `[-1, 1]`, where `action`
|
||||
represents the numerical torques applied at the hinge joints.
|
||||
|
||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||
|-----|----------------------|---------------|----------------|---------------------------------------|-------|------|
|
||||
| 0 | Torque applied on the hinge in the y-coordinate of the abdomen | -0.4 | 0.4 | hip_1 (front_left_leg) | hinge | torque (N m) |
|
||||
| 1 | Torque applied on the hinge in the z-coordinate of the abdomen | -0.4 | 0.4 | angle_1 (front_left_leg) | hinge | torque (N m) |
|
||||
| 2 | Torque applied on the hinge in the x-coordinate of the abdomen | -0.4 | 0.4 | hip_2 (front_right_leg) | hinge | torque (N m) |
|
||||
| 3 | Torque applied on the rotor between torso/abdomen and the right hip (x-coordinate) | -0.4 | 0.4 | right_hip_x (right_thigh) | hinge | torque (N m) |
|
||||
| 4 | Torque applied on the rotor between torso/abdomen and the right hip (z-coordinate) | -0.4 | 0.4 | right_hip_z (right_thigh) | hinge | torque (N m) |
|
||||
| 5 | Torque applied on the rotor between torso/abdomen and the right hip (y-coordinate) | -0.4 | 0.4 | right_hip_y (right_thigh) | hinge | torque (N m) |
|
||||
| 6 | Torque applied on the rotor between the right hip/thigh and the right shin | -0.4 | 0.4 | right_knee | hinge | torque (N m) |
|
||||
| 7 | Torque applied on the rotor between torso/abdomen and the left hip (x-coordinate) | -0.4 | 0.4 | left_hip_x (left_thigh) | hinge | torque (N m) |
|
||||
| 8 | Torque applied on the rotor between torso/abdomen and the left hip (z-coordinate) | -0.4 | 0.4 | left_hip_z (left_thigh) | hinge | torque (N m) |
|
||||
| 9 | Torque applied on the rotor between torso/abdomen and the left hip (y-coordinate) | -0.4 | 0.4 | left_hip_y (left_thigh) | hinge | torque (N m) |
|
||||
| 10 | Torque applied on the rotor between the left hip/thigh and the left shin | -0.4 | 0.4 | left_knee | hinge | torque (N m) |
|
||||
| 11 | Torque applied on the rotor between the torso and right upper arm (coordinate -1) | -0.4 | 0.4 | right_shoulder1 | hinge | torque (N m) |
|
||||
| 12 | Torque applied on the rotor between the torso and right upper arm (coordinate -2) | -0.4 | 0.4 | right_shoulder2 | hinge | torque (N m) |
|
||||
| 13 | Torque applied on the rotor between the right upper arm and right lower arm | -0.4 | 0.4 | right_elbow | hinge | torque (N m) |
|
||||
| 14 | Torque applied on the rotor between the torso and left upper arm (coordinate -1) | -0.4 | 0.4 | left_shoulder1 | hinge | torque (N m) |
|
||||
| 15 | Torque applied on the rotor between the torso and left upper arm (coordinate -2) | -0.4 | 0.4 | left_shoulder2 | hinge | torque (N m) |
|
||||
| 16 | Torque applied on the rotor between the left upper arm and left lower arm | -0.4 | 0.4 | left_elbow | hinge | torque (N m) |
|
||||
|
||||
### Observation Space
|
||||
|
||||
The state space consists of positional values of different body parts of the Humanoid,
|
||||
followed by the velocities of those individual parts (their derivatives) with all the
|
||||
positions ordered before all the velocities.
|
||||
|
||||
The observation is a `ndarray` with shape `(376,)` where the elements correspond to the following:
|
||||
|
||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|
||||
|-----|---------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
|
||||
| 0 | x-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) |
|
||||
| 1 | y-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) |
|
||||
| 2 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) |
|
||||
| 3 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
||||
| 4 | y-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
||||
| 5 | z-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
||||
| 6 | w-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
||||
| 7 | z-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | angle (rad) |
|
||||
| 8 | y-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | angle (rad) |
|
||||
| 9 | x-angle of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angle (rad) |
|
||||
| 10 | x-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | angle (rad) |
|
||||
| 11 | z-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | angle (rad) |
|
||||
| 12 | y-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | angle (rad) |
|
||||
| 13 | angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | angle (rad) |
|
||||
| 14 | x-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | angle (rad) |
|
||||
| 15 | z-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | angle (rad) |
|
||||
| 16 | y-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | angle (rad) |
|
||||
| 17 | angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | angle (rad) |
|
||||
| 18 | coordinate-1 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | angle (rad) |
|
||||
| 19 | coordinate-2 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | angle (rad) |
|
||||
| 20 | angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | angle (rad) |
|
||||
| 21 | coordinate-1 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | angle (rad) |
|
||||
| 22 | coordinate-2 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | angle (rad) |
|
||||
| 23 | angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angle (rad) |
|
||||
| 24 | x-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
||||
| 25 | y-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
||||
| 26 | z-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
||||
| 27 | x-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
|
||||
| 28 | y-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
|
||||
| 29 | z-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
|
||||
| 30 | z-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | anglular velocity (rad/s) |
|
||||
| 31 | y-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | anglular velocity (rad/s) |
|
||||
| 32 | x-coordinate of angular velocity of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angular velocity (rad/s) |
|
||||
| 33 | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | anglular velocity (rad/s) |
|
||||
| 34 | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | anglular velocity (rad/s) |
|
||||
| 35 | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | anglular velocity (rad/s) |
|
||||
| 36 | angular velocity of the angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | anglular velocity (rad/s) |
|
||||
| 37 | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | anglular velocity (rad/s) |
|
||||
| 38 | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | anglular velocity (rad/s) |
|
||||
| 39 | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | anglular velocity (rad/s) |
|
||||
| 40 | angular velocity of the angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | anglular velocity (rad/s) |
|
||||
| 41 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | anglular velocity (rad/s) |
|
||||
| 42 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | anglular velocity (rad/s) |
|
||||
| 43 | angular velocity of the angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | anglular velocity (rad/s) |
|
||||
| 44 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | anglular velocity (rad/s) |
|
||||
| 45 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | anglular velocity (rad/s) |
|
||||
| 46 | angular velocity of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angular velocity (rad/s) |
|
||||
|
||||
Additionally, after all the positional and velocity based values in the table,
|
||||
the state space consists of (in order):
|
||||
- *cinert:* Mass and inertia of a single rigid body relative to the center of mass
|
||||
(this is an intermediate result of transition). It has shape 14*10 (*nbody * 10*)
|
||||
and hence adds to another 140 elements in the state space.
|
||||
- *cvel:* Center of mass based velocity. It has shape 14 * 6 (*nbody * 6*) and hence
|
||||
adds another 84 elements in the state space
|
||||
- *qfrc_actuator:* Constraint force generated as the actuator force. This has shape
|
||||
`(23,)` *(nv * 1)* and hence adds another 23 elements to the state space.
|
||||
- *cfrc_ext:* This is the center of mass based external force on the body. It has shape
|
||||
14 * 6 (*nbody * 6*) and hence adds to another 84 elements in the state space.
|
||||
where *nbody* stands for the number of bodies in the robot and *nv* stands for the
|
||||
number of degrees of freedom (*= dim(qvel)*)
|
||||
|
||||
The (x,y,z) coordinates are translational DOFs while the orientations are rotational
|
||||
DOFs expressed as quaternions. One can read more about free joints on the
|
||||
[Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html).
|
||||
|
||||
**Note:** There are 47 elements in the table above - giving rise to `(378,)`
|
||||
elements in the state space. In practice (and Gym implementation), the first two
|
||||
positional elements are omitted from the state space since the reward function is
|
||||
calculated based on the x-coordinate value. This value is hidden from the algorithm,
|
||||
which in turn has to develop an abstract understanding of it from the observed rewards.
|
||||
Therefore, observation space has shape `(376,)` instead of `(378,)` and the table should
|
||||
not have the first two rows.
|
||||
|
||||
**Note:** The Humanoid-v4 environment no longer has the following contact forces issue.
|
||||
When using Humanoid versions earlier than v4, there have been reported issues that using
|
||||
a Mujoco-Py version > 2.0 results in the contact forces always being 0. As such we recommend
|
||||
to use a Mujoco-Py version < 2.0 when using the Humanoid environment if you would like to report
|
||||
results with contact forces (if contact forces are not used in your experiments, you can use
|
||||
version > 2.0).
|
||||
|
||||
### Rewards
|
||||
The reward consists of three parts:
|
||||
- *alive_bonus*: Every timestep that the humanoid is alive, it gets a reward of 5.
|
||||
- *lin_vel_cost*: A reward of walking forward which is measured as *1.25 *
|
||||
(average center of mass after action - average center of mass before action)/dt*.
|
||||
*dt* is the time between actions and is dependent on the frame_skip parameter
|
||||
(default is 5), where the *dt* for one frame is 0.003 - making the default *dt = 5 * 0.003 = 0.015*.
|
||||
This reward would be positive if the humanoid walks forward (in the positive x direction), as desired. The calculation
|
||||
for the center of mass is defined in the `.py` file for the Humanoid.
|
||||
- *quad_ctrl_cost*: A negative reward for penalising the humanoid if it has too
|
||||
large of a control force. If there are *nu* actuators/controls, then the control has
|
||||
shape `nu x 1`. It is measured as *0.1 **x** sum(control<sup>2</sup>)*.
|
||||
- *quad_impact_cost*: A negative reward for penalising the humanoid if the external
|
||||
contact force is too large. It is calculated as
|
||||
*min(0.5 * 0.000001 * sum(external contact force<sup>2</sup>), 10)*.
|
||||
|
||||
The total reward returned is ***reward*** *=* *alive_bonus + lin_vel_cost - quad_ctrl_cost - quad_impact_cost*
|
||||
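Written out, the impact penalty described above is a clipped quadratic in the external contact forces; a small illustrative computation with a placeholder force array is:

```
import numpy as np

cfrc_ext = np.zeros((14, 6))  # placeholder external contact forces (nbody x 6)
quad_impact_cost = min(0.5 * 0.000001 * np.sum(np.square(cfrc_ext)), 10)
```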
|
||||
### Starting State
|
||||
All observations start in state
|
||||
(0.0, 0.0, 1.4, 1.0, 0.0 ... 0.0) with a uniform noise in the range
|
||||
of [-0.01, 0.01] added to the positional and velocity values (values in the table)
|
||||
for stochasticity. Note that the initial z coordinate is intentionally
|
||||
selected to be high, thereby indicating a standing up humanoid. The initial
|
||||
orientation is designed to make it face forward as well.
|
||||
|
||||
### Episode Termination
|
||||
The episode terminates when any of the following happens:
|
||||
|
||||
1. The episode duration reaches 1000 timesteps
|
||||
2. Any of the state space values is no longer finite
|
||||
3. The z-coordinate of the torso (index 0 in state space OR index 2 in the table) is **not** in the range `[1.0, 2.0]` (the humanoid has fallen or is about to fall beyond recovery).
|
||||
|
||||
### Arguments
|
||||
|
||||
No additional arguments are currently supported (in v2 and lower), but
|
||||
modifications can be made to the XML file in the assets folder (or by changing
|
||||
the path to a modified XML file in another folder).
|
||||
|
||||
```
|
||||
env = gym.make('Humanoid-v2')
|
||||
```
|
||||
|
||||
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||
|
||||
```
|
||||
env = gym.make('Humanoid-v3', ctrl_cost_weight=0.1, ...)
|
||||
```
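The v4 constructor defined below accepts the same style of keyword arguments (xml_file, forward_reward_weight,
ctrl_cost_weight, healthy_reward, healthy_z_range, reset_noise_scale, ...), so they can presumably be forwarded
through `gym.make` in the same way (a sketch, not an exhaustive list):

```
env = gym.make('Humanoid-v4', ctrl_cost_weight=0.1, healthy_z_range=(1.0, 2.0))
```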
|
||||
|
||||
### Version History
|
||||
|
||||
* v4: all mujoco environments now use the mujoco bindings in mujoco >= 2.1.3
|
||||
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
||||
* v0: Initial versions release (1.0.0)
|
||||
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
xml_file="humanoid.xml",
|
||||
forward_reward_weight=1.25,
|
||||
ctrl_cost_weight=0.1,
|
||||
healthy_reward=5.0,
|
||||
terminate_when_unhealthy=True,
|
||||
healthy_z_range=(1.0, 2.0),
|
||||
reset_noise_scale=1e-2,
|
||||
exclude_current_positions_from_observation=True,
|
||||
):
|
||||
utils.EzPickle.__init__(**locals())
|
||||
|
||||
self._forward_reward_weight = forward_reward_weight
|
||||
self._ctrl_cost_weight = ctrl_cost_weight
|
||||
self._healthy_reward = healthy_reward
|
||||
self._terminate_when_unhealthy = terminate_when_unhealthy
|
||||
self._healthy_z_range = healthy_z_range
|
||||
|
||||
self._reset_noise_scale = reset_noise_scale
|
||||
|
||||
self._exclude_current_positions_from_observation = (
|
||||
exclude_current_positions_from_observation
|
||||
)
|
||||
|
||||
mujoco_env.MujocoEnv.__init__(self, xml_file, 5)
|
||||
|
||||
@property
|
||||
def healthy_reward(self):
|
||||
return (
|
||||
float(self.is_healthy or self._terminate_when_unhealthy)
|
||||
* self._healthy_reward
|
||||
)
|
||||
|
||||
def control_cost(self, action):
|
||||
control_cost = self._ctrl_cost_weight * np.sum(np.square(self.data.ctrl))
|
||||
return control_cost
|
||||
|
||||
@property
|
||||
def is_healthy(self):
|
||||
min_z, max_z = self._healthy_z_range
|
||||
is_healthy = min_z < self.data.qpos[2] < max_z
|
||||
|
||||
return is_healthy
|
||||
|
||||
@property
|
||||
def done(self):
|
||||
done = (not self.is_healthy) if self._terminate_when_unhealthy else False
|
||||
return done
|
||||
|
||||
def _get_obs(self):
|
||||
position = self.data.qpos.flat.copy()
|
||||
velocity = self.data.qvel.flat.copy()
|
||||
|
||||
com_inertia = self.data.cinert.flat.copy()
|
||||
com_velocity = self.data.cvel.flat.copy()
|
||||
|
||||
actuator_forces = self.data.qfrc_actuator.flat.copy()
|
||||
external_contact_forces = self.data.cfrc_ext.flat.copy()
|
||||
|
||||
if self._exclude_current_positions_from_observation:
|
||||
position = position[2:]
|
||||
|
||||
return np.concatenate(
|
||||
(
|
||||
position,
|
||||
velocity,
|
||||
com_inertia,
|
||||
com_velocity,
|
||||
actuator_forces,
|
||||
external_contact_forces,
|
||||
)
|
||||
)
|
||||
|
||||
def step(self, action):
|
||||
xy_position_before = mass_center(self.model, self.data)
|
||||
self.do_simulation(action, self.frame_skip)
|
||||
xy_position_after = mass_center(self.model, self.data)
|
||||
|
||||
xy_velocity = (xy_position_after - xy_position_before) / self.dt
|
||||
x_velocity, y_velocity = xy_velocity
|
||||
|
||||
ctrl_cost = self.control_cost(action)
|
||||
|
||||
forward_reward = self._forward_reward_weight * x_velocity
|
||||
healthy_reward = self.healthy_reward
|
||||
|
||||
rewards = forward_reward + healthy_reward
|
||||
|
||||
observation = self._get_obs()
|
||||
reward = rewards - ctrl_cost
|
||||
done = self.done
|
||||
info = {
|
||||
"reward_linvel": forward_reward,
|
||||
"reward_quadctrl": -ctrl_cost,
|
||||
"reward_alive": healthy_reward,
|
||||
"x_position": xy_position_after[0],
|
||||
"y_position": xy_position_after[1],
|
||||
"distance_from_origin": np.linalg.norm(xy_position_after, ord=2),
|
||||
"x_velocity": x_velocity,
|
||||
"y_velocity": y_velocity,
|
||||
"forward_reward": forward_reward,
|
||||
}
|
||||
|
||||
return observation, reward, done, info
|
||||
|
||||
def reset_model(self):
|
||||
noise_low = -self._reset_noise_scale
|
||||
noise_high = self._reset_noise_scale
|
||||
|
||||
qpos = self.init_qpos + self.np_random.uniform(
|
||||
low=noise_low, high=noise_high, size=self.model.nq
|
||||
)
|
||||
qvel = self.init_qvel + self.np_random.uniform(
|
||||
low=noise_low, high=noise_high, size=self.model.nv
|
||||
)
|
||||
self.set_state(qpos, qvel)
|
||||
|
||||
observation = self._get_obs()
|
||||
return observation
|
||||
|
||||
def viewer_setup(self):
|
||||
for key, value in DEFAULT_CAMERA_CONFIG.items():
|
||||
if isinstance(value, np.ndarray):
|
||||
getattr(self.viewer.cam, key)[:] = value
|
||||
else:
|
||||
setattr(self.viewer.cam, key, value)
|
@@ -5,178 +5,10 @@ from gym.envs.mujoco import mujoco_env
|
||||
|
||||
|
||||
class HumanoidStandupEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
"""
|
||||
### Description
|
||||
|
||||
This environment is based on the environment introduced by Tassa, Erez and Todorov
|
||||
in ["Synthesis and stabilization of complex behaviors through online trajectory optimization"](https://ieeexplore.ieee.org/document/6386025).
|
||||
The 3D bipedal robot is designed to simulate a human. It has a torso (abdomen) with a
|
||||
pair of legs and arms. The legs each consist of two links, and so do the arms (representing the
|
||||
knees and elbows respectively). The environment starts with the humanoid lying on the ground,
|
||||
and then the goal of the environment is to make the humanoid stand up and then keep it standing
|
||||
by applying torques on the various hinges.
|
||||
|
||||
### Action Space
|
||||
The agent takes a 17-element vector for actions.
|
||||
|
||||
The action space is a continuous `(action, ...)` all in `[-1, 1]`, where `action`
|
||||
represents the numerical torques applied at the hinge joints.
|
||||
|
||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||
|-----|----------------------|---------------|----------------|---------------------------------------|-------|------|
|
||||
| 0 | Torque applied on the hinge in the y-coordinate of the abdomen | -0.4 | 0.4 | hip_1 (front_left_leg) | hinge | torque (N m) |
|
||||
| 1 | Torque applied on the hinge in the z-coordinate of the abdomen | -0.4 | 0.4 | angle_1 (front_left_leg) | hinge | torque (N m) |
|
||||
| 2 | Torque applied on the hinge in the x-coordinate of the abdomen | -0.4 | 0.4 | hip_2 (front_right_leg) | hinge | torque (N m) |
|
||||
| 3 | Torque applied on the rotor between torso/abdomen and the right hip (x-coordinate) | -0.4 | 0.4 | right_hip_x (right_thigh) | hinge | torque (N m) |
|
||||
| 4 | Torque applied on the rotor between torso/abdomen and the right hip (z-coordinate) | -0.4 | 0.4 | right_hip_z (right_thigh) | hinge | torque (N m) |
|
||||
| 5 | Torque applied on the rotor between torso/abdomen and the right hip (y-coordinate) | -0.4 | 0.4 | right_hip_y (right_thigh) | hinge | torque (N m) |
|
||||
| 6 | Torque applied on the rotor between the right hip/thigh and the right shin | -0.4 | 0.4 | right_knee | hinge | torque (N m) |
|
||||
| 7 | Torque applied on the rotor between torso/abdomen and the left hip (x-coordinate) | -0.4 | 0.4 | left_hip_x (left_thigh) | hinge | torque (N m) |
|
||||
| 8 | Torque applied on the rotor between torso/abdomen and the left hip (z-coordinate) | -0.4 | 0.4 | left_hip_z (left_thigh) | hinge | torque (N m) |
|
||||
| 9 | Torque applied on the rotor between torso/abdomen and the left hip (y-coordinate) | -0.4 | 0.4 | left_hip_y (left_thigh) | hinge | torque (N m) |
|
||||
| 10 | Torque applied on the rotor between the left hip/thigh and the left shin | -0.4 | 0.4 | left_knee | hinge | torque (N m) |
|
||||
| 11 | Torque applied on the rotor between the torso and right upper arm (coordinate -1) | -0.4 | 0.4 | right_shoulder1 | hinge | torque (N m) |
|
||||
| 12 | Torque applied on the rotor between the torso and right upper arm (coordinate -2) | -0.4 | 0.4 | right_shoulder2 | hinge | torque (N m) |
|
||||
| 13 | Torque applied on the rotor between the right upper arm and right lower arm | -0.4 | 0.4 | right_elbow | hinge | torque (N m) |
|
||||
| 14 | Torque applied on the rotor between the torso and left upper arm (coordinate -1) | -0.4 | 0.4 | left_shoulder1 | hinge | torque (N m) |
|
||||
| 15 | Torque applied on the rotor between the torso and left upper arm (coordinate -2) | -0.4 | 0.4 | left_shoulder2 | hinge | torque (N m) |
|
||||
| 16 | Torque applied on the rotor between the left upper arm and left lower arm | -0.4 | 0.4 | left_elbow | hinge | torque (N m) |
|
||||
|
||||
### Observation Space
|
||||
|
||||
The state space consists of positional values of different body parts of the Humanoid,
|
||||
followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities.
|
||||
|
||||
**Note:** The x- and y-coordinates of the torso are being omitted to produce position-agnostic behavior in policies
|
||||
|
||||
The observation is a `ndarray` with shape `(376,)` where the elements correspond to the following:
|
||||
|
||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|
||||
|-----|---------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
|
||||
| 0 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) |
|
||||
| 1 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
||||
| 2 | y-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
||||
| 3 | z-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
||||
| 4 | w-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
||||
| 5 | z-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | angle (rad) |
|
||||
| 6 | y-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | angle (rad) |
|
||||
| 7 | x-angle of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angle (rad) |
|
||||
| 8 | x-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | angle (rad) |
|
||||
| 9 | z-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | angle (rad) |
|
||||
| 10 | y-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | angle (rad) |
|
||||
| 11 | angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | angle (rad) |
|
||||
| 12 | x-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | angle (rad) |
|
||||
| 13 | z-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | angle (rad) |
|
||||
| 14 | y-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | angle (rad) |
|
||||
| 15 | angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | angle (rad) |
|
||||
| 16 | coordinate-1 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | angle (rad) |
|
||||
| 17 | coordinate-2 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | angle (rad) |
|
||||
| 18 | angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | angle (rad) |
|
||||
| 19 | coordinate-1 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | angle (rad) |
|
||||
| 20 | coordinate-2 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | angle (rad) |
|
||||
| 21 | angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angle (rad) |
|
||||
| 22 | x-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
||||
| 23 | y-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
||||
| 24 | z-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
||||
| 25 | x-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | angular velocity (rad/s) |
|
||||
| 26 | y-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | angular velocity (rad/s) |
|
||||
| 27 | z-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | angular velocity (rad/s) |
|
||||
| 28 | z-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | angular velocity (rad/s) |
|
||||
| 29 | y-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | angular velocity (rad/s) |
|
||||
| 30 | x-coordinate of angular velocity of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angular velocity (rad/s) |
|
||||
| 31 | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | angular velocity (rad/s) |
|
||||
| 32 | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | angular velocity (rad/s) |
|
||||
| 33 | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | angular velocity (rad/s) |
|
||||
| 34 | angular velocity of the angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | angular velocity (rad/s) |
|
||||
| 35 | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | angular velocity (rad/s) |
|
||||
| 36 | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | angular velocity (rad/s) |
|
||||
| 37 | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | angular velocity (rad/s) |
|
||||
| 38 | angular velocity of the angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | angular velocity (rad/s) |
|
||||
| 39 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | angular velocity (rad/s) |
|
||||
| 40 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | angular velocity (rad/s) |
|
||||
| 41 | angular velocity of the angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | angular velocity (rad/s) |
|
||||
| 42 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | angular velocity (rad/s) |
|
||||
| 43 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | angular velocity (rad/s) |
|
||||
| 44 | angular velocity of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angular velocity (rad/s) |
|
||||
|
||||
|
||||
Additionally, after all the positional and velocity based values in the table,
|
||||
the state_space consists of (in order):
|
||||
- *cinert:* Mass and inertia of a single rigid body relative to the center of mass
|
||||
(this is an intermediate result of transition). It has shape 14*10 (*nbody * 10*)
|
||||
and hence adds another 140 elements in the state space.
|
||||
- *cvel:* Center of mass based velocity. It has shape 14 * 6 (*nbody * 6*) and hence
|
||||
adds another 84 elements in the state space
|
||||
- *qfrc_actuator:* Constraint force generated as the actuator force. This has shape
|
||||
`(23,)` *(nv * 1)* and hence adds another 23 elements to the state space.
|
||||
- *cfrc_ext:* This is the center of mass based external force on the body. It has shape
|
||||
14 * 6 (*nbody * 6*) and hence adds another 84 elements in the state space.
|
||||
where *nbody* stands for the number of bodies in the robot and *nv* stands for the number
|
||||
of degrees of freedom (*= dim(qvel)*)
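Adding these pieces to the 45 positional and velocity values in the table gives the `(376,)` observation
quoted above (a quick arithmetic sketch):

```
qpos = 24 - 2        # positional values, with the x/y torso coordinates already omitted
qvel = 23            # velocity values
cinert = 14 * 10     # 140
cvel = 14 * 6        # 84
qfrc_actuator = 23
cfrc_ext = 14 * 6    # 84

assert qpos + qvel + cinert + cvel + qfrc_actuator + cfrc_ext == 376
```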
|
||||
|
||||
The (x,y,z) coordinates are translational DOFs while the orientations are rotational
|
||||
DOFs expressed as quaternions. One can read more about free joints on the
|
||||
[Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html).
|
||||
|
||||
**Note:** There have been reported issues that using a Mujoco-Py version > 2.0 results
|
||||
in the contact forces always being 0. As such we recommend to use a Mujoco-Py version < 2.0
|
||||
when using the Humanoid environment if you would like to report results with contact forces
|
||||
(if contact forces are not used in your experiments, you can use version > 2.0).
|
||||
|
||||
### Rewards
|
||||
The reward consists of three parts:
|
||||
- *uph_cost*: A reward for moving upward (in an attempt to stand up). This is not a relative
|
||||
reward which measures how much upward it has moved from the last timestep, but it is an
|
||||
absolute reward which measures how much upward the Humanoid has moved overall. It is
|
||||
measured as *(z coordinate after action - 0)/(atomic timestep)*, where *z coordinate after
|
||||
action* is index 0 in the state/index 2 in the table, and *atomic timestep* is the time for
|
||||
one frame of movement even though the simulation uses a frame_skip of 5 (done in order to inflate
|
||||
rewards a little for faster learning).
|
||||
- *quad_ctrl_cost*: A negative reward for penalising the humanoid if it has too large of
|
||||
a control force. If there are *nu* actuators/controls, then the control has shape `nu x 1`.
|
||||
It is measured as *0.1 **x** sum(control<sup>2</sup>)*.
|
||||
- *quad_impact_cost*: A negative reward for penalising the humanoid if the external
|
||||
contact force is too large. It is calculated as *min(0.5 * 0.000001 * sum(external
|
||||
contact force<sup>2</sup>), 10)*.
|
||||
|
||||
The total reward returned is ***reward*** *=* *uph_cost + 1 - quad_ctrl_cost - quad_impact_cost*
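A minimal numeric sketch of how these terms combine (all inputs are hypothetical placeholders; in the
environment they come from the simulator state after each step, and the per-frame timestep is assumed here):

```
import numpy as np

# hypothetical post-step quantities, for illustration only
z_after = 0.2                    # torso height after the action (m)
atomic_timestep = 0.003          # assumed per-frame timestep of the model
control = np.full(17, 0.1)       # 17 actuator torques
contact_forces = np.zeros(84)    # flattened external contact forces

uph_cost = (z_after - 0) / atomic_timestep
quad_ctrl_cost = 0.1 * np.square(control).sum()
quad_impact_cost = min(0.5e-6 * np.square(contact_forces).sum(), 10)

reward = uph_cost + 1 - quad_ctrl_cost - quad_impact_cost
```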
|
||||
|
||||
### Starting State
|
||||
All observations start in state
|
||||
(0.0, 0.0, 0.105, 1.0, 0.0 ... 0.0) with a uniform noise in the range of
|
||||
[-0.01, 0.01] added to the positional and velocity values (values in the table)
|
||||
for stochasticity. Note that the initial z coordinate is intentionally selected
|
||||
to be low, thereby indicating a humanoid lying down. The initial orientation is
|
||||
designed to make it face forward as well.
|
||||
|
||||
### Episode Termination
|
||||
The episode terminates when any of the following happens:
|
||||
|
||||
1. The episode duration reaches 1000 timesteps
|
||||
2. Any of the state space values is no longer finite
|
||||
|
||||
### Arguments
|
||||
|
||||
No additional arguments are currently supported.
|
||||
|
||||
```
|
||||
env = gym.make('HumanoidStandup-v2')
|
||||
```
|
||||
|
||||
There is no v3 for HumanoidStandup, unlike the robot environments where a v3 and
|
||||
beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||
|
||||
|
||||
### Version History
|
||||
|
||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
||||
* v0: Initial versions release (1.0.0)
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
mujoco_env.MujocoEnv.__init__(self, "humanoidstandup.xml", 5)
|
||||
mujoco_env.MujocoEnv.__init__(
|
||||
self, "humanoidstandup.xml", 5, mujoco_bindings="mujoco_py"
|
||||
)
|
||||
utils.EzPickle.__init__(self)
|
||||
|
||||
def _get_obs(self):
|
||||
|
247
gym/envs/mujoco/humanoidstandup_v4.py
Normal file
247
gym/envs/mujoco/humanoidstandup_v4.py
Normal file
@@ -0,0 +1,247 @@
|
||||
import numpy as np
|
||||
|
||||
from gym import utils
|
||||
from gym.envs.mujoco import mujoco_env
|
||||
|
||||
|
||||
class HumanoidStandupEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
"""
|
||||
### Description
|
||||
|
||||
This environment is based on the environment introduced by Tassa, Erez and Todorov
|
||||
in ["Synthesis and stabilization of complex behaviors through online trajectory optimization"](https://ieeexplore.ieee.org/document/6386025).
|
||||
The 3D bipedal robot is designed to simulate a human. It has a torso (abdomen) with a
|
||||
pair of legs and arms. The legs each consist of two links, and so do the arms (representing the
|
||||
knees and elbows respectively). The environment starts with the humanoid lying on the ground,
|
||||
and then the goal of the environment is to make the humanoid stand up and then keep it standing
|
||||
by applying torques on the various hinges.
|
||||
|
||||
### Action Space
|
||||
The agent takes a 17-element vector for actions.
|
||||
|
||||
The action space is a continuous `(action, ...)` all in `[-1, 1]`, where `action`
|
||||
represents the numerical torques applied at the hinge joints.
|
||||
|
||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||
|-----|----------------------|---------------|----------------|---------------------------------------|-------|------|
|
||||
| 0 | Torque applied on the hinge in the y-coordinate of the abdomen | -0.4 | 0.4 | hip_1 (front_left_leg) | hinge | torque (N m) |
|
||||
| 1 | Torque applied on the hinge in the z-coordinate of the abdomen | -0.4 | 0.4 | angle_1 (front_left_leg) | hinge | torque (N m) |
|
||||
| 2 | Torque applied on the hinge in the x-coordinate of the abdomen | -0.4 | 0.4 | hip_2 (front_right_leg) | hinge | torque (N m) |
|
||||
| 3 | Torque applied on the rotor between torso/abdomen and the right hip (x-coordinate) | -0.4 | 0.4 | right_hip_x (right_thigh) | hinge | torque (N m) |
|
||||
| 4 | Torque applied on the rotor between torso/abdomen and the right hip (z-coordinate) | -0.4 | 0.4 | right_hip_z (right_thigh) | hinge | torque (N m) |
|
||||
| 5 | Torque applied on the rotor between torso/abdomen and the right hip (y-coordinate) | -0.4 | 0.4 | right_hip_y (right_thigh) | hinge | torque (N m) |
|
||||
| 6 | Torque applied on the rotor between the right hip/thigh and the right shin | -0.4 | 0.4 | right_knee | hinge | torque (N m) |
|
||||
| 7 | Torque applied on the rotor between torso/abdomen and the left hip (x-coordinate) | -0.4 | 0.4 | left_hip_x (left_thigh) | hinge | torque (N m) |
|
||||
| 8 | Torque applied on the rotor between torso/abdomen and the left hip (z-coordinate) | -0.4 | 0.4 | left_hip_z (left_thigh) | hinge | torque (N m) |
|
||||
| 9 | Torque applied on the rotor between torso/abdomen and the left hip (y-coordinate) | -0.4 | 0.4 | left_hip_y (left_thigh) | hinge | torque (N m) |
|
||||
| 10 | Torque applied on the rotor between the left hip/thigh and the left shin | -0.4 | 0.4 | left_knee | hinge | torque (N m) |
|
||||
| 11 | Torque applied on the rotor between the torso and right upper arm (coordinate -1) | -0.4 | 0.4 | right_shoulder1 | hinge | torque (N m) |
|
||||
| 12 | Torque applied on the rotor between the torso and right upper arm (coordinate -2) | -0.4 | 0.4 | right_shoulder2 | hinge | torque (N m) |
|
||||
| 13 | Torque applied on the rotor between the right upper arm and right lower arm | -0.4 | 0.4 | right_elbow | hinge | torque (N m) |
|
||||
| 14 | Torque applied on the rotor between the torso and left upper arm (coordinate -1) | -0.4 | 0.4 | left_shoulder1 | hinge | torque (N m) |
|
||||
| 15 | Torque applied on the rotor between the torso and left upper arm (coordinate -2) | -0.4 | 0.4 | left_shoulder2 | hinge | torque (N m) |
|
||||
| 16 | Torque applied on the rotor between the left upper arm and left lower arm | -0.4 | 0.4 | left_elbow | hinge | torque (N m) |
|
||||
|
||||
### Observation Space
|
||||
|
||||
The state space consists of positional values of different body parts of the Humanoid,
|
||||
followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities.
|
||||
|
||||
The observation is a `ndarray` with shape `(376,)` where the elements correspond to the following:
|
||||
|
||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|
||||
|-----|---------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
|
||||
| 0 | x-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) |
|
||||
| 1 | y-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) |
|
||||
| 2 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) |
|
||||
| 3 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
||||
| 4 | y-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
||||
| 5 | z-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
||||
| 6 | w-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
||||
| 7 | z-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | angle (rad) |
|
||||
| 8 | y-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | angle (rad) |
|
||||
| 9 | x-angle of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angle (rad) |
|
||||
| 10 | x-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | angle (rad) |
|
||||
| 11 | z-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | angle (rad) |
|
||||
| 12 | y-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | angle (rad) |
|
||||
| 13 | angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | angle (rad) |
|
||||
| 14 | x-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | angle (rad) |
|
||||
| 15 | z-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | angle (rad) |
|
||||
| 16 | y-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | angle (rad) |
|
||||
| 17 | angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | angle (rad) |
|
||||
| 18 | coordinate-1 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | angle (rad) |
|
||||
| 19 | coordinate-2 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | angle (rad) |
|
||||
| 20 | angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | angle (rad) |
|
||||
| 21 | coordinate-1 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | angle (rad) |
|
||||
| 22 | coordinate-2 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | angle (rad) |
|
||||
| 23 | angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angle (rad) |
|
||||
| 24 | x-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
||||
| 25 | y-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
||||
| 26 | z-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
||||
| 27 | x-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | angular velocity (rad/s) |
|
||||
| 28 | y-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | angular velocity (rad/s) |
|
||||
| 29 | z-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | angular velocity (rad/s) |
|
||||
| 30 | z-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | angular velocity (rad/s) |
|
||||
| 31 | y-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | angular velocity (rad/s) |
|
||||
| 32 | x-coordinate of angular velocity of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angular velocity (rad/s) |
|
||||
| 33 | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | angular velocity (rad/s) |
|
||||
| 34 | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | angular velocity (rad/s) |
|
||||
| 35 | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | angular velocity (rad/s) |
|
||||
| 36 | angular velocity of the angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | angular velocity (rad/s) |
|
||||
| 37 | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | angular velocity (rad/s) |
|
||||
| 38 | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | angular velocity (rad/s) |
|
||||
| 39 | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | angular velocity (rad/s) |
|
||||
| 40 | angular velocity of the angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | angular velocity (rad/s) |
|
||||
| 41 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | angular velocity (rad/s) |
|
||||
| 42 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | angular velocity (rad/s) |
|
||||
| 43 | angular velocity of the angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | angular velocity (rad/s) |
|
||||
| 44 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | angular velocity (rad/s) |
|
||||
| 45 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | angular velocity (rad/s) |
|
||||
| 46 | angular velocity of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angular velocity (rad/s) |
|
||||
|
||||
|
||||
Additionally, after all the positional and velocity based values in the table,
|
||||
the state_space consists of (in order):
|
||||
- *cinert:* Mass and inertia of a single rigid body relative to the center of mass
|
||||
(this is an intermediate result of transition). It has shape 14*10 (*nbody * 10*)
|
||||
and hence adds another 140 elements in the state space.
|
||||
- *cvel:* Center of mass based velocity. It has shape 14 * 6 (*nbody * 6*) and hence
|
||||
adds another 84 elements in the state space
|
||||
- *qfrc_actuator:* Constraint force generated as the actuator force. This has shape
|
||||
`(23,)` *(nv * 1)* and hence adds another 23 elements to the state space.
|
||||
- *cfrc_ext:* This is the center of mass based external force on the body. It has shape
|
||||
14 * 6 (*nbody * 6*) and hence adds another 84 elements in the state space.
|
||||
where *nbody* stands for the number of bodies in the robot and *nv* stands for the number
|
||||
of degrees of freedom (*= dim(qvel)*)
|
||||
|
||||
The (x,y,z) coordinates are translational DOFs while the orientations are rotational
|
||||
DOFs expressed as quaternions. One can read more about free joints on the
|
||||
[Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html).
|
||||
|
||||
**Note:** There are 47 elements in the table above - giving rise to `(378,)`
|
||||
elements in the state space. In practice (and Gym implementation), the first two
|
||||
positional elements are omitted from the state space since the reward function is
|
||||
calculated based on the x-coordinate value. This value is hidden from the algorithm,
|
||||
which in turn has to develop an abstract understanding of it from the observed rewards.
|
||||
Therefore, observation space has shape `(376,)` instead of `(378,)` and the table should not have the first two rows.
|
||||
|
||||
**Note:** The HumanoidStandup-v4 environment no longer has the following contact forces issue.
|
||||
If using HumanoidStandup versions prior to v4, there have been reported issues that using a Mujoco-Py version > 2.0 results
|
||||
in the contact forces always being 0. As such we recommend to use a Mujoco-Py version < 2.0
|
||||
when using the HumanoidStandup environment if you would like to report results with contact forces
|
||||
(if contact forces are not used in your experiments, you can use version > 2.0).
|
||||
|
||||
### Rewards
|
||||
The reward consists of three parts:
|
||||
- *uph_cost*: A reward for moving upward (in an attempt to stand up). This is not a relative
|
||||
reward which measures how much upward it has moved from the last timestep, but it is an
|
||||
absolute reward which measures how much upward the Humanoid has moved overall. It is
|
||||
measured as *(z coordinate after action - 0)/(atomic timestep)*, where *z coordinate after
|
||||
action* is index 0 in the state/index 2 in the table, and *atomic timestep* is the time for
|
||||
one frame of movement even though the simulation uses a frame_skip of 5 (done in order to inflate
|
||||
rewards a little for faster learning).
|
||||
- *quad_ctrl_cost*: A negative reward for penalising the humanoid if it has too large of
|
||||
a control force. If there are *nu* actuators/controls, then the control has shape `nu x 1`.
|
||||
It is measured as *0.1 **x** sum(control<sup>2</sup>)*.
|
||||
- *quad_impact_cost*: A negative reward for penalising the humanoid if the external
|
||||
contact force is too large. It is calculated as *min(0.5 * 0.000001 * sum(external
|
||||
contact force<sup>2</sup>), 10)*.
|
||||
|
||||
The total reward returned is ***reward*** *=* *uph_cost + 1 - quad_ctrl_cost - quad_impact_cost*
|
||||
|
||||
### Starting State
|
||||
All observations start in state
|
||||
(0.0, 0.0, 0.105, 1.0, 0.0 ... 0.0) with a uniform noise in the range of
|
||||
[-0.01, 0.01] added to the positional and velocity values (values in the table)
|
||||
for stochasticity. Note that the initial z coordinate is intentionally selected
|
||||
to be low, thereby indicating a humanoid lying down. The initial orientation is
|
||||
designed to make it face forward as well.
|
||||
|
||||
### Episode Termination
|
||||
The episode terminates when any of the following happens:
|
||||
|
||||
1. The episode duration reaches 1000 timesteps
|
||||
2. Any of the state space values is no longer finite
|
||||
|
||||
### Arguments
|
||||
|
||||
No additional arguments are currently supported (in v2 and lower), but
|
||||
modifications can be made to the XML file in the assets folder
|
||||
(or by changing the path to a modified XML file in another folder).
|
||||
|
||||
```
|
||||
env = gym.make('HumanoidStandup-v2')
|
||||
```
|
||||
|
||||
There is no v3 for HumanoidStandup, unlike the robot environments where a v3 and
|
||||
beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||
|
||||
|
||||
### Version History
|
||||
|
||||
* v4: all mujoco environments now use the mujoco bindings in mujoco >= 2.1.3
|
||||
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
||||
* v0: Initial versions release (1.0.0)
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
mujoco_env.MujocoEnv.__init__(self, "humanoidstandup.xml", 5)
|
||||
utils.EzPickle.__init__(self)
|
||||
|
||||
def _get_obs(self):
|
||||
data = self.data
|
||||
return np.concatenate(
|
||||
[
|
||||
data.qpos.flat[2:],
|
||||
data.qvel.flat,
|
||||
data.cinert.flat,
|
||||
data.cvel.flat,
|
||||
data.qfrc_actuator.flat,
|
||||
data.cfrc_ext.flat,
|
||||
]
|
||||
)
|
||||
|
||||
def step(self, a):
|
||||
self.do_simulation(a, self.frame_skip)
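# torso height (z-coordinate) after stepping the simulation drives the standing-up reward below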
|
||||
pos_after = self.data.qpos[2]
|
||||
data = self.data
|
||||
uph_cost = (pos_after - 0) / self.model.opt.timestep
|
||||
|
||||
quad_ctrl_cost = 0.1 * np.square(data.ctrl).sum()
|
||||
quad_impact_cost = 0.5e-6 * np.square(data.cfrc_ext).sum()
|
||||
quad_impact_cost = min(quad_impact_cost, 10)
|
||||
reward = uph_cost - quad_ctrl_cost - quad_impact_cost + 1
|
||||
|
||||
done = bool(False)
|
||||
return (
|
||||
self._get_obs(),
|
||||
reward,
|
||||
done,
|
||||
dict(
|
||||
reward_linup=uph_cost,
|
||||
reward_quadctrl=-quad_ctrl_cost,
|
||||
reward_impact=-quad_impact_cost,
|
||||
),
|
||||
)
|
||||
|
||||
def reset_model(self):
|
||||
c = 0.01
|
||||
self.set_state(
|
||||
self.init_qpos + self.np_random.uniform(low=-c, high=c, size=self.model.nq),
|
||||
self.init_qvel
|
||||
+ self.np_random.uniform(
|
||||
low=-c,
|
||||
high=c,
|
||||
size=self.model.nv,
|
||||
),
|
||||
)
|
||||
return self._get_obs()
|
||||
|
||||
def viewer_setup(self):
|
||||
self.viewer.cam.trackbodyid = 1
|
||||
self.viewer.cam.distance = self.model.stat.extent * 1.0
|
||||
self.viewer.cam.lookat[2] = 0.8925
|
||||
self.viewer.cam.elevation = -20
|
@@ -5,113 +5,10 @@ from gym.envs.mujoco import mujoco_env
|
||||
|
||||
|
||||
class InvertedDoublePendulumEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
"""
|
||||
### Description
|
||||
|
||||
This environment originates from control theory and builds on the cartpole
|
||||
environment based on the work done by Barto, Sutton, and Anderson in
|
||||
["Neuronlike adaptive elements that can solve difficult learning control problems"](https://ieeexplore.ieee.org/document/6313077),
|
||||
powered by the Mujoco physics simulator - allowing for more complex experiments
|
||||
(such as varying the effects of gravity or constraints). This environment involves a cart that can
|
||||
be moved linearly, with a pole fixed on it and a second pole fixed on the other end of the first one
|
||||
(leaving the second pole as the only one with one free end). The cart can be pushed left or right,
|
||||
and the goal is to balance the second pole on top of the first pole, which is in turn on top of the
|
||||
cart, by applying continuous forces on the cart.
|
||||
|
||||
### Action Space
|
||||
The agent takes a 1-element vector for actions.
|
||||
The action space is a continuous `(action)` in `[-1, 1]`, where `action` represents the
|
||||
numerical force applied to the cart (with magnitude representing the amount of force and
|
||||
sign representing the direction)
|
||||
|
||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||
|-----|---------------------------|-------------|-------------|----------------------------------|-------|-----------|
|
||||
| 0 | Force applied on the cart | -1 | 1 | slider | slide | Force (N) |
|
||||
|
||||
### Observation Space
|
||||
|
||||
The state space consists of positional values of different body parts of the pendulum system,
|
||||
followed by the velocities of those individual parts (their derivatives) with all the
|
||||
positions ordered before all the velocities.
|
||||
|
||||
The observation is a `ndarray` with shape `(11,)` where the elements correspond to the following:
|
||||
|
||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
||||
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
||||
| 0 | position of the cart along the linear surface | -Inf | Inf | slider | slide | position (m) |
|
||||
| 1 | sine of the angle between the cart and the first pole | -Inf | Inf | sin(hinge) | hinge | unitless |
|
||||
| 2 | sine of the angle between the two poles | -Inf | Inf | sin(hinge2) | hinge | unitless |
|
||||
| 3 | cosine of the angle between the cart and the first pole | -Inf | Inf | cos(hinge) | hinge | unitless |
|
||||
| 4 | cosine of the angle between the two poles | -Inf | Inf | cos(hinge2) | hinge | unitless |
|
||||
| 5 | velocity of the cart | -Inf | Inf | slider | slide | velocity (m/s) |
|
||||
| 6 | angular velocity of the angle between the cart and the first pole | -Inf | Inf | hinge | hinge | angular velocity (rad/s) |
|
||||
| 7 | angular velocity of the angle between the two poles | -Inf | Inf | hinge2 | hinge | angular velocity (rad/s) |
|
||||
| 8 | constraint force - 1 | -Inf | Inf | | | Force (N) |
|
||||
| 9 | constraint force - 2 | -Inf | Inf | | | Force (N) |
|
||||
| 10 | constraint force - 3 | -Inf | Inf | | | Force (N) |
|
||||
|
||||
|
||||
There is physical contact between the robots and their environment - and Mujoco
|
||||
attempts to get realistic physics simulations for the possible physical contact
|
||||
dynamics by aiming for physical accuracy and computational efficiency.
|
||||
|
||||
There is one constraint force for contacts for each degree of freedom (3).
|
||||
The approach and handling of constraints by Mujoco is unique to the simulator
|
||||
and is based on their research. One can find more information in their
|
||||
[*documentation*](https://mujoco.readthedocs.io/en/latest/computation.html)
|
||||
or in their paper
|
||||
["Analytically-invertible dynamics with contacts and constraints: Theory and implementation in MuJoCo"](https://homes.cs.washington.edu/~todorov/papers/TodorovICRA14.pdf).
|
||||
|
||||
|
||||
### Rewards
|
||||
|
||||
The reward consists of three parts:
|
||||
- *alive_bonus*: The goal is to make the second inverted pendulum stand upright
|
||||
(within a certain angle limit) as long as possible - as such a reward of +10 is awarded
|
||||
for each timestep that the second pole is upright.
|
||||
- *distance_penalty*: This reward is a measure of how far the *tip* of the second pendulum
|
||||
(the only free end) moves, and it is calculated as
|
||||
*0.01 * x<sup>2</sup> + (y - 2)<sup>2</sup>*, where *x* is the x-coordinate of the tip
|
||||
and *y* is the y-coordinate of the tip of the second pole.
|
||||
- *velocity_penalty*: A negative reward for penalising the agent if it moves too
|
||||
fast *0.001 * v<sub>1</sub><sup>2</sup> + 0.005 * v<sub>2</sub> <sup>2</sup>*
|
||||
|
||||
The total reward returned is ***reward*** *=* *alive_bonus - distance_penalty - velocity_penalty*
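Written out as a small sketch (the tip coordinates and joint velocities below are hypothetical placeholders):

```
# hypothetical per-step quantities, for illustration only
x, y = 0.05, 1.15        # x- and y-coordinates of the tip of the second pole
v1, v2 = 0.3, -0.2       # angular velocities of the two hinge joints

alive_bonus = 10
distance_penalty = 0.01 * x**2 + (y - 2) ** 2
velocity_penalty = 1e-3 * v1**2 + 5e-3 * v2**2

reward = alive_bonus - distance_penalty - velocity_penalty
```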
|
||||
|
||||
### Starting State
|
||||
All observations start in state
|
||||
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) with a uniform noise in the range
|
||||
of [-0.1, 0.1] added to the positional values (cart position and pole angles) and standard
|
||||
normal force with a standard deviation of 0.1 added to the velocity values for stochasticity.
|
||||
|
||||
### Episode Termination
|
||||
The episode terminates when any of the following happens:
|
||||
|
||||
1. The episode duration reaches 1000 timesteps.
|
||||
2. Any of the state space values is no longer finite.
|
||||
3. The y-coordinate of the tip of the second pole *is less than or equal* to 1 (the maximum standing height of the system is 1.196 m when all the parts are perpendicularly vertical on top of each other).
|
||||
|
||||
### Arguments
|
||||
|
||||
No additional arguments are currently supported.
|
||||
|
||||
```
|
||||
env = gym.make('InvertedDoublePendulum-v2')
|
||||
```
|
||||
There is no v3 for InvertedDoublePendulum, unlike the robot environments where a v3 and
|
||||
beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||
|
||||
|
||||
### Version History
|
||||
|
||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||
* v1: max_time_steps raised to 1000 for robot based tasks (including inverted pendulum)
|
||||
* v0: Initial versions release (1.0.0)
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
mujoco_env.MujocoEnv.__init__(self, "inverted_double_pendulum.xml", 5)
|
||||
mujoco_env.MujocoEnv.__init__(
|
||||
self, "inverted_double_pendulum.xml", 5, mujoco_bindings="mujoco_py"
|
||||
)
|
||||
utils.EzPickle.__init__(self)
|
||||
|
||||
def step(self, action):
|
||||
|
156
gym/envs/mujoco/inverted_double_pendulum_v4.py
Normal file
156
gym/envs/mujoco/inverted_double_pendulum_v4.py
Normal file
@@ -0,0 +1,156 @@
|
||||
import numpy as np
|
||||
|
||||
from gym import utils
|
||||
from gym.envs.mujoco import mujoco_env
|
||||
|
||||
|
||||
class InvertedDoublePendulumEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
"""
|
||||
### Description
|
||||
|
||||
This environment originates from control theory and builds on the cartpole
|
||||
environment based on the work done by Barto, Sutton, and Anderson in
|
||||
["Neuronlike adaptive elements that can solve difficult learning control problems"](https://ieeexplore.ieee.org/document/6313077),
|
||||
powered by the Mujoco physics simulator - allowing for more complex experiments
|
||||
(such as varying the effects of gravity or constraints). This environment involves a cart that can
|
||||
be moved linearly, with a pole fixed on it and a second pole fixed on the other end of the first one
|
||||
(leaving the second pole as the only one with one free end). The cart can be pushed left or right,
|
||||
and the goal is to balance the second pole on top of the first pole, which is in turn on top of the
|
||||
cart, by applying continuous forces on the cart.
|
||||
|
||||
### Action Space
|
||||
The agent takes a 1-element vector for actions.
|
||||
The action space is a continuous `(action)` in `[-1, 1]`, where `action` represents the
|
||||
numerical force applied to the cart (with magnitude representing the amount of force and
|
||||
sign representing the direction)
|
||||
|
||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||
|-----|---------------------------|-------------|-------------|----------------------------------|-------|-----------|
|
||||
| 0 | Force applied on the cart | -1 | 1 | slider | slide | Force (N) |
|
||||
|
||||
### Observation Space
|
||||
|
||||
The state space consists of positional values of different body parts of the pendulum system,
|
||||
followed by the velocities of those individual parts (their derivatives) with all the
|
||||
positions ordered before all the velocities.
|
||||
|
||||
The observation is a `ndarray` with shape `(11,)` where the elements correspond to the following:
|
||||
|
||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
||||
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
||||
| 0 | position of the cart along the linear surface | -Inf | Inf | slider | slide | position (m) |
|
||||
| 1 | sine of the angle between the cart and the first pole | -Inf | Inf | sin(hinge) | hinge | unitless |
|
||||
| 2 | sine of the angle between the two poles | -Inf | Inf | sin(hinge2) | hinge | unitless |
|
||||
| 3 | cosine of the angle between the cart and the first pole | -Inf | Inf | cos(hinge) | hinge | unitless |
|
||||
| 4 | cosine of the angle between the two poles | -Inf | Inf | cos(hinge2) | hinge | unitless |
|
||||
| 5 | velocity of the cart | -Inf | Inf | slider | slide | velocity (m/s) |
|
||||
| 6 | angular velocity of the angle between the cart and the first pole | -Inf | Inf | hinge | hinge | angular velocity (rad/s) |
|
||||
| 7 | angular velocity of the angle between the two poles | -Inf | Inf | hinge2 | hinge | angular velocity (rad/s) |
|
||||
| 8 | constraint force - 1 | -Inf | Inf | | | Force (N) |
|
||||
| 9 | constraint force - 2 | -Inf | Inf | | | Force (N) |
|
||||
| 10 | constraint force - 3 | -Inf | Inf | | | Force (N) |
|
||||
|
||||
|
||||
There is physical contact between the robots and their environment - and Mujoco
|
||||
attempts to get realistic physics simulations for the possible physical contact
|
||||
dynamics by aiming for physical accuracy and computational efficiency.
|
||||
|
||||
There is one constraint force for contacts for each degree of freedom (3).
|
||||
The approach and handling of constraints by Mujoco is unique to the simulator
|
||||
and is based on their research. One can find more information in their
|
||||
[*documentation*](https://mujoco.readthedocs.io/en/latest/computation.html)
|
||||
or in their paper
|
||||
["Analytically-invertible dynamics with contacts and constraints: Theory and implementation in MuJoCo"](https://homes.cs.washington.edu/~todorov/papers/TodorovICRA14.pdf).
|
||||
|
||||
|
||||
### Rewards
|
||||
|
||||
The reward consists of three parts:
|
||||
- *alive_bonus*: The goal is to make the second inverted pendulum stand upright
|
||||
(within a certain angle limit) as long as possible - as such a reward of +10 is awarded
|
||||
for each timestep that the second pole is upright.
|
||||
- *distance_penalty*: This reward is a measure of how far the *tip* of the second pendulum
|
||||
(the only free end) moves, and it is calculated as
|
||||
*0.01 * x<sup>2</sup> + (y - 2)<sup>2</sup>*, where *x* is the x-coordinate of the tip
|
||||
and *y* is the y-coordinate of the tip of the second pole.
|
||||
- *velocity_penalty*: A negative reward for penalising the agent if it moves too
|
||||
fast *0.001 * v<sub>1</sub><sup>2</sup> + 0.005 * v<sub>2</sub> <sup>2</sup>*
|
||||
|
||||
The total reward returned is ***reward*** *=* *alive_bonus - distance_penalty - velocity_penalty*
|
||||
|
||||
### Starting State
|
||||
All observations start in state
|
||||
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) with a uniform noise in the range
|
||||
of [-0.1, 0.1] added to the positional values (cart position and pole angles) and standard
|
||||
normal force with a standard deviation of 0.1 added to the velocity values for stochasticity.
|
||||
|
||||
### Episode Termination
|
||||
The episode terminates when any of the following happens:
|
||||
|
||||
1. The episode duration reaches 1000 timesteps.
|
||||
2. Any of the state space values is no longer finite.
|
||||
3. The y-coordinate of the tip of the second pole *is less than or equal* to 1 (the maximum standing height of the system is 1.196 m when all the parts are perpendicularly vertical on top of each other).
|
||||
|
||||
### Arguments
|
||||
|
||||
No additional arguments are currently supported (in v2 and lower), but modifications can
|
||||
be made to the XML file in the assets folder (or by changing the path to a modified XML
|
||||
file in another folder).
|
||||
|
||||
```
|
||||
env = gym.make('InvertedDoublePendulum-v2')
|
||||
```
|
||||
There is no v3 for InvertedDoublePendulum, unlike the robot environments where a v3 and
|
||||
beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||
|
||||
|
||||
### Version History
|
||||
|
||||
* v4: all mujoco environments now use the mujoco bindings in mujoco >= 2.1.3
|
||||
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||
* v1: max_time_steps raised to 1000 for robot based tasks (including inverted pendulum)
|
||||
* v0: Initial versions release (1.0.0)
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
mujoco_env.MujocoEnv.__init__(self, "inverted_double_pendulum.xml", 5)
|
||||
utils.EzPickle.__init__(self)
|
||||
|
||||
def step(self, action):
|
||||
self.do_simulation(action, self.frame_skip)
|
||||
ob = self._get_obs()
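# site_xpos[0] is the world position of the tip of the second pole; its x and height (z) feed the distance penalty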
|
||||
x, _, y = self.data.site_xpos[0]
|
||||
dist_penalty = 0.01 * x**2 + (y - 2) ** 2
|
||||
v1, v2 = self.data.qvel[1:3]
|
||||
vel_penalty = 1e-3 * v1**2 + 5e-3 * v2**2
|
||||
alive_bonus = 10
|
||||
r = alive_bonus - dist_penalty - vel_penalty
|
||||
done = bool(y <= 1)
|
||||
return ob, r, done, {}
|
||||
|
||||
def _get_obs(self):
|
||||
return np.concatenate(
|
||||
[
|
||||
self.data.qpos[:1], # cart x pos
|
||||
np.sin(self.data.qpos[1:]), # link angles
|
||||
np.cos(self.data.qpos[1:]),
|
||||
np.clip(self.data.qvel, -10, 10),
|
||||
np.clip(self.data.qfrc_constraint, -10, 10),
|
||||
]
|
||||
).ravel()
|
||||
|
||||
def reset_model(self):
|
||||
self.set_state(
|
||||
self.init_qpos
|
||||
+ self.np_random.uniform(low=-0.1, high=0.1, size=self.model.nq),
|
||||
self.init_qvel + self.np_random.standard_normal(self.model.nv) * 0.1,
|
||||
)
|
||||
return self._get_obs()
|
||||
|
||||
def viewer_setup(self):
|
||||
v = self.viewer
|
||||
v.cam.trackbodyid = 0
|
||||
v.cam.distance = self.model.stat.extent * 0.5
|
||||
v.cam.lookat[2] = 0.12250000000000005 # v.model.stat.center[2]
|
@@ -5,85 +5,11 @@ from gym.envs.mujoco import mujoco_env
|
||||
|
||||
|
||||
class InvertedPendulumEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
"""
|
||||
### Description
|
||||
|
||||
This environment is the cartpole environment based on the work done by
|
||||
Barto, Sutton, and Anderson in ["Neuronlike adaptive elements that can
|
||||
solve difficult learning control problems"](https://ieeexplore.ieee.org/document/6313077),
|
||||
just like in the classic environments but now powered by the Mujoco physics simulator -
|
||||
allowing for more complex experiments (such as varying the effects of gravity).
|
||||
This environment involves a cart that can be moved linearly, with a pole fixed on it
|
||||
at one end and having another end free. The cart can be pushed left or right, and the
|
||||
goal is to balance the pole on the top of the cart by applying forces on the cart.
|
||||
|
||||
### Action Space
|
||||
The agent takes a 1-element vector for actions.
|
||||
|
||||
The action space is a continuous `(action)` in `[-3, 3]`, where `action` represents
|
||||
the numerical force applied to the cart (with magnitude representing the amount of
|
||||
force and sign representing the direction)
|
||||
|
||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||
|-----|---------------------------|-------------|-------------|----------------------------------|-------|-----------|
|
||||
| 0 | Force applied on the cart | -3 | 3 | slider | slide | Force (N) |
|
||||
|
||||
### Observation Space
|
||||
|
||||
The state space consists of positional values of different body parts of
|
||||
the pendulum system, followed by the velocities of those individual parts (their derivatives)
|
||||
with all the positions ordered before all the velocities.
|
||||
|
||||
The observation is a `ndarray` with shape `(4,)` where the elements correspond to the following:
|
||||
|
||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
||||
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
||||
| 0 | position of the cart along the linear surface | -Inf | Inf | slider | slide | position (m) |
|
||||
| 1 | vertical angle of the pole on the cart | -Inf | Inf | hinge | hinge | angle (rad) |
|
||||
| 2 | linear velocity of the cart | -Inf | Inf | slider | slide | velocity (m/s) |
|
||||
| 3 | angular velocity of the pole on the cart | -Inf | Inf | hinge | hinge | angular velocity (rad/s) |
|
||||
|
||||
|
||||
### Rewards
|
||||
|
||||
The goal is to make the inverted pendulum stand upright (within a certain angle limit)
|
||||
as long as possible - as such a reward of +1 is awarded for each timestep that
|
||||
the pole is upright.
|
||||
|
||||
### Starting State
|
||||
All observations start in state
|
||||
(0.0, 0.0, 0.0, 0.0) with a uniform noise in the range
|
||||
of [-0.01, 0.01] added to the values for stochasticity.
|
||||
|
||||
### Episode Termination
|
||||
The episode terminates when any of the following happens:
|
||||
|
||||
1. The episode duration reaches 1000 timesteps.
|
||||
2. Any of the state space values is no longer finite.
|
||||
3. The absolute value of the vertical angle between the pole and the cart is greater than 0.2 radians.
|
||||
|
||||
### Arguments
|
||||
|
||||
No additional arguments are currently supported.
|
||||
|
||||
```
|
||||
env = gym.make('InvertedPendulum-v2')
|
||||
```
|
||||
There is no v3 for InvertedPendulum, unlike the robot environments where a
|
||||
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||
|
||||
|
||||
### Version History
|
||||
|
||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||
* v1: max_time_steps raised to 1000 for robot based tasks (including inverted pendulum)
|
||||
* v0: Initial versions release (1.0.0)
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
utils.EzPickle.__init__(self)
|
||||
mujoco_env.MujocoEnv.__init__(self, "inverted_pendulum.xml", 2)
|
||||
mujoco_env.MujocoEnv.__init__(
|
||||
self, "inverted_pendulum.xml", 2, mujoco_bindings="mujoco_py"
|
||||
)
|
||||
|
||||
def step(self, a):
|
||||
reward = 1.0
|
||||
|
116
gym/envs/mujoco/inverted_pendulum_v4.py
Normal file
@@ -0,0 +1,116 @@
|
||||
import numpy as np
|
||||
|
||||
from gym import utils
|
||||
from gym.envs.mujoco import mujoco_env
|
||||
|
||||
|
||||
class InvertedPendulumEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
"""
|
||||
### Description
|
||||
|
||||
This environment is the cartpole environment based on the work done by
|
||||
Barto, Sutton, and Anderson in ["Neuronlike adaptive elements that can
|
||||
solve difficult learning control problems"](https://ieeexplore.ieee.org/document/6313077),
|
||||
just like in the classic environments but now powered by the Mujoco physics simulator -
|
||||
allowing for more complex experiments (such as varying the effects of gravity).
|
||||
This environment involves a cart that can be moved linearly, with a pole fixed on it
|
||||
at one end and having another end free. The cart can be pushed left or right, and the
|
||||
goal is to balance the pole on the top of the cart by applying forces on the cart.
|
||||
|
||||
### Action Space
|
||||
The agent takes a 1-element vector for actions.
|
||||
|
||||
The action space is a continuous `(action)` in `[-3, 3]`, where `action` represents
|
||||
the numerical force applied to the cart (with magnitude representing the amount of
|
||||
force and sign representing the direction).
|
||||
|
||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||
|-----|---------------------------|-------------|-------------|----------------------------------|-------|-----------|
|
||||
| 0 | Force applied on the cart | -3 | 3 | slider | slide | Force (N) |
|
||||
|
||||
### Observation Space
|
||||
|
||||
The state space consists of positional values of different body parts of
|
||||
the pendulum system, followed by the velocities of those individual parts (their derivatives)
|
||||
with all the positions ordered before all the velocities.
|
||||
|
||||
The observation is a `ndarray` with shape `(4,)` where the elements correspond to the following:
|
||||
|
||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
||||
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
||||
| 0 | position of the cart along the linear surface | -Inf | Inf | slider | slide | position (m) |
|
||||
| 1 | vertical angle of the pole on the cart | -Inf | Inf | hinge | hinge | angle (rad) |
|
||||
| 2 | linear velocity of the cart | -Inf | Inf | slider | slide | velocity (m/s) |
|
||||
| 3 | angular velocity of the pole on the cart | -Inf | Inf | hinge | hinge | angular velocity (rad/s) |
|
||||
|
||||
|
||||
### Rewards
|
||||
|
||||
The goal is to make the inverted pendulum stand upright (within a certain angle limit)
|
||||
as long as possible - as such a reward of +1 is awarded for each timestep that
|
||||
the pole is upright.
|
||||
|
||||
### Starting State
|
||||
All observations start in state
|
||||
(0.0, 0.0, 0.0, 0.0) with a uniform noise in the range
|
||||
of [-0.01, 0.01] added to the values for stochasticity.
|
||||
|
||||
### Episode Termination
|
||||
The episode terminates when any of the following happens:
|
||||
|
||||
1. The episode duration reaches 1000 timesteps.
|
||||
2. Any of the state space values is no longer finite.
|
||||
3. The absolute value of the vertical angle between the pole and the cart is greater than 0.2 radians.
|
||||
|
||||
### Arguments
|
||||
|
||||
No additional arguments are currently supported (in v2 and lower),
|
||||
but modifications can be made to the XML file in the assets folder
|
||||
(or by changing the path to a modified XML file in another folder).
|
||||
|
||||
```
|
||||
env = gym.make('InvertedPendulum-v4')
|
||||
```
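A minimal rollout sketch (assuming the v4 registration and the default `reset()` that returns only the observation) showing how the +1 per-timestep reward accumulates until the episode terminates:

```
import gym

env = gym.make('InvertedPendulum-v4')
ob = env.reset()
total_reward, done = 0.0, False
while not done:
    # apply a random force to the cart; reward is +1 for every step the pole stays upright
    ob, reward, done, info = env.step(env.action_space.sample())
    total_reward += reward
env.close()
```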
|
||||
There is no v3 for InvertedPendulum, unlike the robot environments where a
|
||||
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||
|
||||
|
||||
### Version History
|
||||
|
||||
* v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3
|
||||
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||
* v1: max_time_steps raised to 1000 for robot based tasks (including inverted pendulum)
|
||||
* v0: Initial versions release (1.0.0)
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
utils.EzPickle.__init__(self)
|
||||
mujoco_env.MujocoEnv.__init__(self, "inverted_pendulum.xml", 2)
|
||||
|
||||
def step(self, a):
|
||||
reward = 1.0
|
||||
self.do_simulation(a, self.frame_skip)
|
||||
ob = self._get_obs()
|
||||
notdone = np.isfinite(ob).all() and (np.abs(ob[1]) <= 0.2)
|
||||
done = not notdone
|
||||
return ob, reward, done, {}
|
||||
|
||||
def reset_model(self):
|
||||
qpos = self.init_qpos + self.np_random.uniform(
|
||||
size=self.model.nq, low=-0.01, high=0.01
|
||||
)
|
||||
qvel = self.init_qvel + self.np_random.uniform(
|
||||
size=self.model.nv, low=-0.01, high=0.01
|
||||
)
|
||||
self.set_state(qpos, qvel)
|
||||
return self._get_obs()
|
||||
|
||||
def _get_obs(self):
|
||||
return np.concatenate([self.data.qpos, self.data.qvel]).ravel()
|
||||
|
||||
def viewer_setup(self):
|
||||
v = self.viewer
|
||||
v.cam.trackbodyid = 0
|
||||
v.cam.distance = self.model.stat.extent
|
@@ -1,4 +1,3 @@
|
||||
import os
|
||||
from collections import OrderedDict
|
||||
from os import path
|
||||
from typing import Optional
|
||||
@@ -6,18 +5,9 @@ from typing import Optional
|
||||
import numpy as np
|
||||
|
||||
import gym
|
||||
from gym import error, spaces
|
||||
from gym import error, logger, spaces
|
||||
|
||||
try:
|
||||
import mujoco_py
|
||||
except ImportError as e:
|
||||
raise error.DependencyNotInstalled(
|
||||
"{}. (HINT: you need to install mujoco_py, and also perform the setup instructions here: https://github.com/openai/mujoco-py/.)".format(
|
||||
e
|
||||
)
|
||||
)
|
||||
|
||||
DEFAULT_SIZE = 500
|
||||
DEFAULT_SIZE = 480
|
||||
|
||||
|
||||
def convert_observation_to_space(observation):
|
||||
@@ -43,28 +33,64 @@ def convert_observation_to_space(observation):
|
||||
class MujocoEnv(gym.Env):
|
||||
"""Superclass for all MuJoCo environments."""
|
||||
|
||||
def __init__(self, model_path, frame_skip):
|
||||
def __init__(self, model_path, frame_skip, mujoco_bindings="mujoco"):
|
||||
|
||||
if model_path.startswith("/"):
|
||||
fullpath = model_path
|
||||
else:
|
||||
fullpath = os.path.join(os.path.dirname(__file__), "assets", model_path)
|
||||
fullpath = path.join(path.dirname(__file__), "assets", model_path)
|
||||
if not path.exists(fullpath):
|
||||
raise OSError(f"File {fullpath} does not exist")
|
||||
self.frame_skip = frame_skip
|
||||
self.model = mujoco_py.load_model_from_path(fullpath)
|
||||
self.sim = mujoco_py.MjSim(self.model)
|
||||
|
||||
if mujoco_bindings == "mujoco_py":
|
||||
logger.warn(
|
||||
"This version of the mujoco environments depends "
|
||||
"on the mujoco-py bindings, which are no longer maintained "
|
||||
"and may stop working. Please upgrade to the v4 versions of "
|
||||
"the environments (which depend on the mujoco python bindings instead), unless "
|
||||
"you are trying to precisely replicate previous works)."
|
||||
)
|
||||
try:
|
||||
import mujoco_py
|
||||
|
||||
self._mujoco_bindings = mujoco_py
|
||||
except ImportError as e:
|
||||
raise error.DependencyNotInstalled(
|
||||
"{}. (HINT: you need to install mujoco_py, and also perform the setup instructions here: https://github.com/openai/mujoco-py/.)".format(
|
||||
e
|
||||
)
|
||||
)
|
||||
|
||||
self.model = self._mujoco_bindings.load_model_from_path(fullpath)
|
||||
self.sim = self._mujoco_bindings.MjSim(self.model)
|
||||
self.data = self.sim.data
|
||||
self.viewer = None
|
||||
|
||||
elif mujoco_bindings == "mujoco":
|
||||
try:
|
||||
import mujoco
|
||||
|
||||
self._mujoco_bindings = mujoco
|
||||
|
||||
except ImportError as e:
|
||||
raise error.DependencyNotInstalled(
|
||||
f"{e}. (HINT: you need to install mujoco)"
|
||||
)
|
||||
self.model = self._mujoco_bindings.MjModel.from_xml_path(fullpath)
|
||||
self.data = self._mujoco_bindings.MjData(self.model)
|
||||
|
||||
self.init_qpos = self.data.qpos.ravel().copy()
|
||||
self.init_qvel = self.data.qvel.ravel().copy()
|
||||
self._viewers = {}
|
||||
|
||||
self.frame_skip = frame_skip
|
||||
|
||||
self.viewer = None
|
||||
|
||||
self.metadata = {
|
||||
"render_modes": ["human", "rgb_array", "depth_array"],
|
||||
"render_fps": int(np.round(1.0 / self.dt)),
|
||||
}
|
||||
|
||||
self.init_qpos = self.sim.data.qpos.ravel().copy()
|
||||
self.init_qvel = self.sim.data.qvel.ravel().copy()
|
||||
|
||||
self._set_action_space()
|
||||
|
||||
action = self.action_space.sample()
|
||||
@@ -111,7 +137,12 @@ class MujocoEnv(gym.Env):
|
||||
options: Optional[dict] = None,
|
||||
):
|
||||
super().reset(seed=seed)
|
||||
|
||||
if self._mujoco_bindings.__name__ == "mujoco_py":
|
||||
self.sim.reset()
|
||||
else:
|
||||
self._mujoco_bindings.mj_resetData(self.model, self.data)
|
||||
|
||||
ob = self.reset_model()
|
||||
if not return_info:
|
||||
return ob
|
||||
@@ -120,12 +151,19 @@ class MujocoEnv(gym.Env):
|
||||
|
||||
def set_state(self, qpos, qvel):
|
||||
assert qpos.shape == (self.model.nq,) and qvel.shape == (self.model.nv,)
|
||||
old_state = self.sim.get_state()
|
||||
new_state = mujoco_py.MjSimState(
|
||||
old_state.time, qpos, qvel, old_state.act, old_state.udd_state
|
||||
if self._mujoco_bindings.__name__ == "mujoco_py":
|
||||
state = self.sim.get_state()
|
||||
state = self._mujoco_bindings.MjSimState(
|
||||
state.time, qpos, qvel, state.act, state.udd_state
|
||||
)
|
||||
self.sim.set_state(new_state)
|
||||
self.sim.set_state(state)
|
||||
self.sim.forward()
|
||||
else:
|
||||
self.data.qpos[:] = np.copy(qpos)
|
||||
self.data.qvel[:] = np.copy(qvel)
|
||||
if self.model.na == 0:
|
||||
self.data.act[:] = None
|
||||
self._mujoco_bindings.mj_forward(self.model, self.data)
|
||||
|
||||
@property
|
||||
def dt(self):
|
||||
@@ -135,9 +173,21 @@ class MujocoEnv(gym.Env):
|
||||
if np.array(ctrl).shape != self.action_space.shape:
|
||||
raise ValueError("Action dimension mismatch")
|
||||
|
||||
if self._mujoco_bindings.__name__ == "mujoco_py":
|
||||
self.sim.data.ctrl[:] = ctrl
|
||||
else:
|
||||
self.data.ctrl[:] = ctrl
|
||||
for _ in range(n_frames):
|
||||
if self._mujoco_bindings.__name__ == "mujoco_py":
|
||||
self.sim.step()
|
||||
else:
|
||||
self._mujoco_bindings.mj_step(self.model, self.data)
|
||||
|
||||
# As of MuJoCo 2.0, force-related quantities like cacc are not computed
|
||||
# unless there's a force sensor in the model.
|
||||
# See https://github.com/openai/gym/issues/1541
|
||||
if self._mujoco_bindings.__name__ != "mujoco_py":
|
||||
self._mujoco_bindings.mj_rnePostConstraint(self.model, self.data)
|
||||
|
||||
def render(
|
||||
self,
|
||||
@@ -158,19 +208,25 @@ class MujocoEnv(gym.Env):
|
||||
if no_camera_specified:
|
||||
camera_name = "track"
|
||||
|
||||
if camera_id is None and camera_name in self.model._camera_name2id:
|
||||
if camera_id is None:
|
||||
if self._mujoco_bindings.__name__ == "mujoco_py":
|
||||
if camera_name in self.model._camera_name2id:
|
||||
camera_id = self.model.camera_name2id(camera_name)
|
||||
else:
|
||||
camera_id = self._mujoco_bindings.mj_name2id(
|
||||
self.model,
|
||||
self._mujoco_bindings.mjtObj.mjOBJ_CAMERA,
|
||||
camera_name,
|
||||
)
|
||||
|
||||
self._get_viewer(mode).render(width, height, camera_id=camera_id)
|
||||
|
||||
if mode == "rgb_array":
|
||||
# window size used for old mujoco-py:
|
||||
data = self._get_viewer(mode).read_pixels(width, height, depth=False)
|
||||
# original image is upside-down, so flip it
|
||||
return data[::-1, :, :]
|
||||
elif mode == "depth_array":
|
||||
self._get_viewer(mode).render(width, height)
|
||||
# window size used for old mujoco-py:
|
||||
# Extract depth part of the read_pixels() tuple
|
||||
data = self._get_viewer(mode).read_pixels(width, height, depth=True)[1]
|
||||
# original image is upside-down, so flip it
|
||||
@@ -180,24 +236,40 @@ class MujocoEnv(gym.Env):
|
||||
|
||||
def close(self):
|
||||
if self.viewer is not None:
|
||||
# self.viewer.finish()
|
||||
self.viewer = None
|
||||
self._viewers = {}
|
||||
|
||||
def _get_viewer(self, mode):
|
||||
def _get_viewer(self, mode, width=DEFAULT_SIZE, height=DEFAULT_SIZE):
|
||||
self.viewer = self._viewers.get(mode)
|
||||
if self.viewer is None:
|
||||
if mode == "human":
|
||||
self.viewer = mujoco_py.MjViewer(self.sim)
|
||||
if self._mujoco_bindings.__name__ == "mujoco_py":
|
||||
self.viewer = self._mujoco_bindings.MjViewer(self.sim)
|
||||
else:
|
||||
from gym.envs.mujoco.mujoco_rendering import Viewer
|
||||
|
||||
self.viewer = Viewer(self.model, self.data)
|
||||
elif mode == "rgb_array" or mode == "depth_array":
|
||||
self.viewer = mujoco_py.MjRenderContextOffscreen(self.sim, -1)
|
||||
if self._mujoco_bindings.__name__ == "mujoco_py":
|
||||
self.viewer = self._mujoco_bindings.MjRenderContextOffscreen(
|
||||
self.sim, -1
|
||||
)
|
||||
else:
|
||||
from gym.envs.mujoco.mujoco_rendering import RenderContextOffscreen
|
||||
|
||||
self.viewer = RenderContextOffscreen(
|
||||
width, height, self.model, self.data
|
||||
)
|
||||
|
||||
self.viewer_setup()
|
||||
self._viewers[mode] = self.viewer
|
||||
return self.viewer
|
||||
|
||||
def get_body_com(self, body_name):
|
||||
if self._mujoco_bindings.__name__ == "mujoco_py":
|
||||
return self.data.get_body_xpos(body_name)
|
||||
else:
|
||||
return self.data.body(body_name).xpos
|
||||
|
||||
def state_vector(self):
|
||||
return np.concatenate([self.sim.data.qpos.flat, self.sim.data.qvel.flat])
|
||||
return np.concatenate([self.data.qpos.flat, self.data.qvel.flat])
|
||||
|
554
gym/envs/mujoco/mujoco_rendering.py
Normal file
@@ -0,0 +1,554 @@
|
||||
import collections
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from threading import Lock
|
||||
|
||||
import glfw
|
||||
import imageio
|
||||
import mujoco
|
||||
import numpy as np
|
||||
|
||||
|
||||
def _import_egl(width, height):
|
||||
from mujoco.egl import GLContext
|
||||
|
||||
return GLContext(width, height)
|
||||
|
||||
|
||||
def _import_glfw(width, height):
|
||||
from mujoco.glfw import GLContext
|
||||
|
||||
return GLContext(width, height)
|
||||
|
||||
|
||||
def _import_osmesa(width, height):
|
||||
from mujoco.osmesa import GLContext
|
||||
|
||||
return GLContext(width, height)
|
||||
|
||||
|
||||
_ALL_RENDERERS = collections.OrderedDict(
|
||||
[
|
||||
("glfw", _import_glfw),
|
||||
("egl", _import_egl),
|
||||
("osmesa", _import_osmesa),
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
class RenderContext:
|
||||
"""Render context superclass for offscreen and window rendering."""
|
||||
|
||||
def __init__(self, model, data, offscreen=True):
|
||||
|
||||
self.model = model
|
||||
self.data = data
|
||||
self.offscreen = offscreen
|
||||
max_geom = 1000
|
||||
|
||||
mujoco.mj_forward(self.model, self.data)
|
||||
|
||||
self.scn = mujoco.MjvScene(self.model, max_geom)
|
||||
self.cam = mujoco.MjvCamera()
|
||||
self.vopt = mujoco.MjvOption()
|
||||
self.pert = mujoco.MjvPerturb()
|
||||
self.con = mujoco.MjrContext(self.model, mujoco.mjtFontScale.mjFONTSCALE_150)
|
||||
|
||||
self._markers = []
|
||||
self._overlays = {}
|
||||
|
||||
self._init_camera()
|
||||
self._set_mujoco_buffers()
|
||||
|
||||
def _set_mujoco_buffers(self):
|
||||
if self.offscreen:
|
||||
mujoco.mjr_setBuffer(mujoco.mjtFramebuffer.mjFB_OFFSCREEN, self.con)
|
||||
if self.con.currentBuffer != mujoco.mjtFramebuffer.mjFB_OFFSCREEN:
|
||||
raise RuntimeError("Offscreen rendering not supported")
|
||||
else:
|
||||
mujoco.mjr_setBuffer(mujoco.mjtFramebuffer.mjFB_WINDOW, self.con)
|
||||
if self.con.currentBuffer != mujoco.mjtFramebuffer.mjFB_WINDOW:
|
||||
raise RuntimeError("Window rendering not supported")
|
||||
|
||||
def update_offscreen_size(self, width, height):
|
||||
if width != self.con.offWidth or height != self.con.offHeight:
|
||||
self.model.vis.global_.offwidth = width
|
||||
self.model.vis.global_.offheight = height
|
||||
self.con.free()
|
||||
self._set_mujoco_buffers()
|
||||
|
||||
def render(self, width, height, camera_id=None, segmentation=False):
|
||||
rect = mujoco.MjrRect(left=0, bottom=0, width=width, height=height)
|
||||
|
||||
# Sometimes buffers are too small.
|
||||
if width > self.con.offWidth or height > self.con.offHeight:
|
||||
new_width = max(width, self.model.vis.global_.offwidth)
|
||||
new_height = max(height, self.model.vis.global_.offheight)
|
||||
self.update_offscreen_size(new_width, new_height)
|
||||
|
||||
if camera_id is not None:
|
||||
if camera_id == -1:
|
||||
self.cam.type = mujoco.mjtCamera.mjCAMERA_FREE
|
||||
else:
|
||||
self.cam.type = mujoco.mjtCamera.mjCAMERA_FIXED
|
||||
self.cam.fixedcamid = camera_id
|
||||
|
||||
mujoco.mjv_updateScene(
|
||||
self.model,
|
||||
self.data,
|
||||
self.vopt,
|
||||
self.pert,
|
||||
self.cam,
|
||||
mujoco.mjtCatBit.mjCAT_ALL,
|
||||
self.scn,
|
||||
)
|
||||
|
||||
if segmentation:
|
||||
self.scn.flags[mujoco.mjtRndFlag.mjRND_SEGMENT] = 1
|
||||
self.scn.flags[mujoco.mjtRndFlag.mjRND_IDCOLOR] = 1
|
||||
|
||||
for marker_params in self._markers:
|
||||
self._add_marker_to_scene(marker_params)
|
||||
|
||||
mujoco.mjr_render(rect, self.scn, self.con)
|
||||
|
||||
for gridpos, (text1, text2) in self._overlays.items():
|
||||
mujoco.mjr_overlay(
|
||||
mujoco.mjtFontScale.mjFONTSCALE_150,
|
||||
gridpos,
|
||||
rect,
|
||||
text1.encode(),
|
||||
text2.encode(),
|
||||
self.con,
|
||||
)
|
||||
|
||||
if segmentation:
|
||||
self.scn.flags[mujoco.mjtRndFlag.mjRND_SEGMENT] = 0
|
||||
self.scn.flags[mujoco.mjtRndFlag.mjRND_IDCOLOR] = 0
|
||||
|
||||
def read_pixels(self, width, height, depth=True, segmentation=False):
|
||||
rect = mujoco.MjrRect(left=0, bottom=0, width=width, height=height)
|
||||
|
||||
rgb_arr = np.zeros(3 * rect.width * rect.height, dtype=np.uint8)
|
||||
depth_arr = np.zeros(rect.width * rect.height, dtype=np.float32)
|
||||
|
||||
mujoco.mjr_readPixels(rgb_arr, depth_arr, rect, self.con)
|
||||
rgb_img = rgb_arr.reshape(rect.height, rect.width, 3)
|
||||
|
||||
ret_img = rgb_img
|
||||
if segmentation:
|
||||
seg_img = (
|
||||
rgb_img[:, :, 0]
|
||||
+ rgb_img[:, :, 1] * (2**8)
|
||||
+ rgb_img[:, :, 2] * (2**16)
|
||||
)
|
||||
seg_img[seg_img >= (self.scn.ngeom + 1)] = 0
|
||||
seg_ids = np.full((self.scn.ngeom + 1, 2), fill_value=-1, dtype=np.int32)
|
||||
|
||||
for i in range(self.scn.ngeom):
|
||||
geom = self.scn.geoms[i]
|
||||
if geom.segid != -1:
|
||||
seg_ids[geom.segid + 1, 0] = geom.objtype
|
||||
seg_ids[geom.segid + 1, 1] = geom.objid
|
||||
ret_img = seg_ids[seg_img]
|
||||
|
||||
if depth:
|
||||
depth_img = depth_arr.reshape(rect.height, rect.width)
|
||||
return (ret_img, depth_img)
|
||||
else:
|
||||
return ret_img
|
||||
|
||||
def _init_camera(self):
|
||||
self.cam.type = mujoco.mjtCamera.mjCAMERA_FREE
|
||||
self.cam.fixedcamid = -1
|
||||
for i in range(3):
|
||||
self.cam.lookat[i] = np.median(self.data.geom_xpos[:, i])
|
||||
self.cam.distance = self.model.stat.extent
|
||||
|
||||
def add_overlay(self, gridpos: int, text1: str, text2: str):
|
||||
"""Overlays text on the scene."""
|
||||
if gridpos not in self._overlays:
|
||||
self._overlays[gridpos] = ["", ""]
|
||||
self._overlays[gridpos][0] += text1 + "\n"
|
||||
self._overlays[gridpos][1] += text2 + "\n"
|
||||
|
||||
def add_marker(self, **marker_params):
|
||||
self._markers.append(marker_params)
|
||||
|
||||
def _add_marker_to_scene(self, marker):
|
||||
if self.scn.ngeom >= self.scn.maxgeom:
|
||||
raise RuntimeError("Ran out of geoms. maxgeom: %d" % self.scn.maxgeom)
|
||||
|
||||
g = self.scn.geoms[self.scn.ngeom]
|
||||
# default values.
|
||||
g.dataid = -1
|
||||
g.objtype = mujoco.mjtObj.mjOBJ_UNKNOWN
|
||||
g.objid = -1
|
||||
g.category = mujoco.mjtCatBit.mjCAT_DECOR
|
||||
g.texid = -1
|
||||
g.texuniform = 0
|
||||
g.texrepeat[0] = 1
|
||||
g.texrepeat[1] = 1
|
||||
g.emission = 0
|
||||
g.specular = 0.5
|
||||
g.shininess = 0.5
|
||||
g.reflectance = 0
|
||||
g.type = mujoco.mjtGeom.mjGEOM_BOX
|
||||
g.size[:] = np.ones(3) * 0.1
|
||||
g.mat[:] = np.eye(3)
|
||||
g.rgba[:] = np.ones(4)
|
||||
|
||||
for key, value in marker.items():
|
||||
if isinstance(value, (int, float, mujoco._enums.mjtGeom)):
|
||||
setattr(g, key, value)
|
||||
elif isinstance(value, (tuple, list, np.ndarray)):
|
||||
attr = getattr(g, key)
|
||||
attr[:] = np.asarray(value).reshape(attr.shape)
|
||||
elif isinstance(value, str):
|
||||
assert key == "label", "Only label is a string in mjtGeom."
|
||||
if value is None:
|
||||
g.label[0] = 0
|
||||
else:
|
||||
g.label = value
|
||||
elif hasattr(g, key):
|
||||
raise ValueError(
|
||||
"mjtGeom has attr {} but type {} is invalid".format(
|
||||
key, type(value)
|
||||
)
|
||||
)
|
||||
else:
|
||||
raise ValueError("mjtGeom doesn't have field %s" % key)
|
||||
|
||||
self.scn.ngeom += 1
|
||||
|
||||
|
||||
class RenderContextOffscreen(RenderContext):
|
||||
"""Offscreen rendering class with opengl context."""
|
||||
|
||||
def __init__(self, width, height, model, data):
|
||||
|
||||
self._get_opengl_backend(width, height)
|
||||
self.opengl_context.make_current()
|
||||
|
||||
super().__init__(model, data, offscreen=True)
|
||||
|
||||
def _get_opengl_backend(self, width, height):
|
||||
|
||||
backend = os.environ.get("MUJOCO_GL")
|
||||
if backend is not None:
|
||||
try:
|
||||
self.opengl_context = _ALL_RENDERERS[backend](width, height)
|
||||
except KeyError:
|
||||
raise RuntimeError(
|
||||
"Environment variable {} must be one of {!r}: got {!r}.".format(
|
||||
"MUJOCO_GL", _ALL_RENDERERS.keys(), backend
|
||||
)
|
||||
)
|
||||
|
||||
else:
|
||||
for name, _ in _ALL_RENDERERS.items():
|
||||
try:
|
||||
self.opengl_context = _ALL_RENDERERS[name](width, height)
|
||||
backend = name
|
||||
break
|
||||
except: # noqa:E722
|
||||
pass
|
||||
if backend is None:
|
||||
raise RuntimeError(
|
||||
"No OpenGL backend could be imported. Attempting to create a "
|
||||
"rendering context will result in a RuntimeError."
|
||||
)
|
||||
|
||||
|
||||
class Viewer(RenderContext):
|
||||
"""Class for window rendering in all MuJoCo environments."""
|
||||
|
||||
def __init__(self, model, data):
|
||||
self._gui_lock = Lock()
|
||||
self._button_left_pressed = False
|
||||
self._button_right_pressed = False
|
||||
self._last_mouse_x = 0
|
||||
self._last_mouse_y = 0
|
||||
self._paused = False
|
||||
self._transparent = False
|
||||
self._contacts = False
|
||||
self._render_every_frame = True
|
||||
self._image_idx = 0
|
||||
self._image_path = "/tmp/frame_%07d.png"
|
||||
self._time_per_render = 1 / 60.0
|
||||
self._run_speed = 1.0
|
||||
self._loop_count = 0
|
||||
self._advance_by_one_step = False
|
||||
self._hide_menu = False
|
||||
|
||||
# glfw init
|
||||
glfw.init()
|
||||
width, height = glfw.get_video_mode(glfw.get_primary_monitor()).size
|
||||
self.window = glfw.create_window(width // 2, height // 2, "mujoco", None, None)
|
||||
glfw.make_context_current(self.window)
|
||||
glfw.swap_interval(1)
|
||||
|
||||
framebuffer_width, framebuffer_height = glfw.get_framebuffer_size(self.window)
|
||||
window_width, _ = glfw.get_window_size(self.window)
|
||||
self._scale = framebuffer_width * 1.0 / window_width
|
||||
|
||||
# set callbacks
|
||||
glfw.set_cursor_pos_callback(self.window, self._cursor_pos_callback)
|
||||
glfw.set_mouse_button_callback(self.window, self._mouse_button_callback)
|
||||
glfw.set_scroll_callback(self.window, self._scroll_callback)
|
||||
glfw.set_key_callback(self.window, self._key_callback)
|
||||
|
||||
# get viewport
|
||||
self.viewport = mujoco.MjrRect(0, 0, framebuffer_width, framebuffer_height)
|
||||
|
||||
super().__init__(model, data, offscreen=False)
|
||||
|
||||
def _key_callback(self, window, key, scancode, action, mods):
|
||||
if action != glfw.RELEASE:
|
||||
return
|
||||
# Switch cameras
|
||||
elif key == glfw.KEY_TAB:
|
||||
self.cam.fixedcamid += 1
|
||||
self.cam.type = mujoco.mjtCamera.mjCAMERA_FIXED
|
||||
if self.cam.fixedcamid >= self.model.ncam:
|
||||
self.cam.fixedcamid = -1
|
||||
self.cam.type = mujoco.mjtCamera.mjCAMERA_FREE
|
||||
# Pause simulation
|
||||
elif key == glfw.KEY_SPACE and self._paused is not None:
|
||||
self._paused = not self._paused
|
||||
# Advances simulation by one step.
|
||||
elif key == glfw.KEY_RIGHT and self._paused is not None:
|
||||
self._advance_by_one_step = True
|
||||
self._paused = True
|
||||
# Slows down simulation
|
||||
elif key == glfw.KEY_S:
|
||||
self._run_speed /= 2.0
|
||||
# Speeds up simulation
|
||||
elif key == glfw.KEY_F:
|
||||
self._run_speed *= 2.0
|
||||
# Turn off / turn on rendering every frame.
|
||||
elif key == glfw.KEY_D:
|
||||
self._render_every_frame = not self._render_every_frame
|
||||
# Capture screenshot
|
||||
elif key == glfw.KEY_T:
|
||||
img = np.zeros(
|
||||
(
|
||||
glfw.get_framebuffer_size(self.window)[1],
|
||||
glfw.get_framebuffer_size(self.window)[0],
|
||||
3,
|
||||
),
|
||||
dtype=np.uint8,
|
||||
)
|
||||
mujoco.mjr_readPixels(img, None, self.viewport, self.con)
|
||||
imageio.imwrite(self._image_path % self._image_idx, np.flipud(img))
|
||||
self._image_idx += 1
|
||||
# Display contact forces
|
||||
elif key == glfw.KEY_C:
|
||||
self._contacts = not self._contacts
|
||||
self.vopt.flags[mujoco.mjtVisFlag.mjVIS_CONTACTPOINT] = self._contacts
|
||||
self.vopt.flags[mujoco.mjtVisFlag.mjVIS_CONTACTFORCE] = self._contacts
|
||||
# Display coordinate frames
|
||||
elif key == glfw.KEY_E:
|
||||
self.vopt.frame = 1 - self.vopt.frame
|
||||
# Hide overlay menu
|
||||
elif key == glfw.KEY_H:
|
||||
self._hide_menu = not self._hide_menu
|
||||
# Make transparent
|
||||
elif key == glfw.KEY_R:
|
||||
self._transparent = not self._transparent
|
||||
if self._transparent:
|
||||
self.model.geom_rgba[:, 3] /= 5.0
|
||||
else:
|
||||
self.model.geom_rgba[:, 3] *= 5.0
|
||||
# Geom group visibility
|
||||
elif key in (glfw.KEY_0, glfw.KEY_1, glfw.KEY_2, glfw.KEY_3, glfw.KEY_4):
|
||||
self.vopt.geomgroup[key - glfw.KEY_0] ^= 1
|
||||
# Quit
|
||||
if key == glfw.KEY_ESCAPE:
|
||||
print("Pressed ESC")
|
||||
print("Quitting.")
|
||||
glfw.terminate()
|
||||
sys.exit(0)
|
||||
|
||||
def _cursor_pos_callback(self, window, xpos, ypos):
|
||||
if not (self._button_left_pressed or self._button_right_pressed):
|
||||
return
|
||||
|
||||
mod_shift = (
|
||||
glfw.get_key(window, glfw.KEY_LEFT_SHIFT) == glfw.PRESS
|
||||
or glfw.get_key(window, glfw.KEY_RIGHT_SHIFT) == glfw.PRESS
|
||||
)
|
||||
if self._button_right_pressed:
|
||||
action = (
|
||||
mujoco.mjtMouse.mjMOUSE_MOVE_H
|
||||
if mod_shift
|
||||
else mujoco.mjtMouse.mjMOUSE_MOVE_V
|
||||
)
|
||||
elif self._button_left_pressed:
|
||||
action = (
|
||||
mujoco.mjtMouse.mjMOUSE_ROTATE_H
|
||||
if mod_shift
|
||||
else mujoco.mjtMouse.mjMOUSE_ROTATE_V
|
||||
)
|
||||
else:
|
||||
action = mujoco.mjtMouse.mjMOUSE_ZOOM
|
||||
|
||||
dx = int(self._scale * xpos) - self._last_mouse_x
|
||||
dy = int(self._scale * ypos) - self._last_mouse_y
|
||||
width, height = glfw.get_framebuffer_size(window)
|
||||
|
||||
with self._gui_lock:
|
||||
mujoco.mjv_moveCamera(
|
||||
self.model, action, dx / height, dy / height, self.scn, self.cam
|
||||
)
|
||||
|
||||
self._last_mouse_x = int(self._scale * xpos)
|
||||
self._last_mouse_y = int(self._scale * ypos)
|
||||
|
||||
def _mouse_button_callback(self, window, button, act, mods):
|
||||
self._button_left_pressed = (
|
||||
glfw.get_mouse_button(window, glfw.MOUSE_BUTTON_LEFT) == glfw.PRESS
|
||||
)
|
||||
self._button_right_pressed = (
|
||||
glfw.get_mouse_button(window, glfw.MOUSE_BUTTON_RIGHT) == glfw.PRESS
|
||||
)
|
||||
|
||||
x, y = glfw.get_cursor_pos(window)
|
||||
self._last_mouse_x = int(self._scale * x)
|
||||
self._last_mouse_y = int(self._scale * y)
|
||||
|
||||
def _scroll_callback(self, window, x_offset, y_offset):
|
||||
with self._gui_lock:
|
||||
mujoco.mjv_moveCamera(
|
||||
self.model,
|
||||
mujoco.mjtMouse.mjMOUSE_ZOOM,
|
||||
0,
|
||||
-0.05 * y_offset,
|
||||
self.scn,
|
||||
self.cam,
|
||||
)
|
||||
|
||||
def _create_overlay(self):
|
||||
topleft = mujoco.mjtGridPos.mjGRID_TOPLEFT
|
||||
bottomleft = mujoco.mjtGridPos.mjGRID_BOTTOMLEFT
|
||||
|
||||
if self._render_every_frame:
|
||||
self.add_overlay(topleft, "", "")
|
||||
else:
|
||||
self.add_overlay(
|
||||
topleft,
|
||||
"Run speed = %.3f x real time" % self._run_speed,
|
||||
"[S]lower, [F]aster",
|
||||
)
|
||||
self.add_overlay(
|
||||
topleft, "Ren[d]er every frame", "On" if self._render_every_frame else "Off"
|
||||
)
|
||||
self.add_overlay(
|
||||
topleft,
|
||||
"Switch camera (#cams = %d)" % (self.model.ncam + 1),
|
||||
"[Tab] (camera ID = %d)" % self.cam.fixedcamid,
|
||||
)
|
||||
self.add_overlay(topleft, "[C]ontact forces", "On" if self._contacts else "Off")
|
||||
self.add_overlay(topleft, "T[r]ansparent", "On" if self._transparent else "Off")
|
||||
if self._paused is not None:
|
||||
if not self._paused:
|
||||
self.add_overlay(topleft, "Stop", "[Space]")
|
||||
else:
|
||||
self.add_overlay(topleft, "Start", "[Space]")
|
||||
self.add_overlay(
|
||||
topleft, "Advance simulation by one step", "[right arrow]"
|
||||
)
|
||||
self.add_overlay(
|
||||
topleft, "Referenc[e] frames", "On" if self.vopt.frame == 1 else "Off"
|
||||
)
|
||||
self.add_overlay(topleft, "[H]ide Menu", "")
|
||||
if self._image_idx > 0:
|
||||
fname = self._image_path % (self._image_idx - 1)
|
||||
self.add_overlay(topleft, "Cap[t]ure frame", "Saved as %s" % fname)
|
||||
else:
|
||||
self.add_overlay(topleft, "Cap[t]ure frame", "")
|
||||
self.add_overlay(topleft, "Toggle geomgroup visibility", "0-4")
|
||||
|
||||
self.add_overlay(bottomleft, "FPS", "%d%s" % (1 / self._time_per_render, ""))
|
||||
self.add_overlay(
|
||||
bottomleft, "Solver iterations", str(self.data.solver_iter + 1)
|
||||
)
|
||||
self.add_overlay(
|
||||
bottomleft, "Step", str(round(self.data.time / self.model.opt.timestep))
|
||||
)
|
||||
self.add_overlay(bottomleft, "timestep", "%.5f" % self.model.opt.timestep)
|
||||
|
||||
def render(self):
|
||||
# mjv_updateScene, mjr_render, mjr_overlay
|
||||
def update():
|
||||
# fill overlay items
|
||||
self._create_overlay()
|
||||
|
||||
render_start = time.time()
|
||||
if self.window is None:
|
||||
return
|
||||
elif glfw.window_should_close(self.window):
|
||||
glfw.terminate()
|
||||
sys.exit(0)
|
||||
self.viewport.width, self.viewport.height = glfw.get_framebuffer_size(
|
||||
self.window
|
||||
)
|
||||
with self._gui_lock:
|
||||
# update scene
|
||||
mujoco.mjv_updateScene(
|
||||
self.model,
|
||||
self.data,
|
||||
self.vopt,
|
||||
mujoco.MjvPerturb(),
|
||||
self.cam,
|
||||
mujoco.mjtCatBit.mjCAT_ALL.value,
|
||||
self.scn,
|
||||
)
|
||||
# marker items
|
||||
for marker in self._markers:
|
||||
self._add_marker_to_scene(marker)
|
||||
# render
|
||||
mujoco.mjr_render(self.viewport, self.scn, self.con)
|
||||
# overlay items
|
||||
if not self._hide_menu:
|
||||
for gridpos, [t1, t2] in self._overlays.items():
|
||||
mujoco.mjr_overlay(
|
||||
mujoco.mjtFontScale.mjFONTSCALE_150,
|
||||
gridpos,
|
||||
self.viewport,
|
||||
t1,
|
||||
t2,
|
||||
self.con,
|
||||
)
|
||||
glfw.swap_buffers(self.window)
|
||||
glfw.poll_events()
|
||||
self._time_per_render = 0.9 * self._time_per_render + 0.1 * (
|
||||
time.time() - render_start
|
||||
)
|
||||
|
||||
# clear overlay
|
||||
self._overlays.clear()
|
||||
|
||||
if self._paused:
|
||||
while self._paused:
|
||||
update()
|
||||
if self._advance_by_one_step:
|
||||
self._advance_by_one_step = False
|
||||
break
|
||||
else:
|
||||
self._loop_count += self.model.opt.timestep / (
|
||||
self._time_per_render * self._run_speed
|
||||
)
|
||||
if self._render_every_frame:
|
||||
self._loop_count = 1
|
||||
while self._loop_count > 0:
|
||||
update()
|
||||
self._loop_count -= 1
|
||||
|
||||
# clear markers
|
||||
self._markers[:] = []
|
||||
|
||||
def close(self):
|
||||
glfw.terminate()
|
||||
sys.exit(0)
|
@@ -5,131 +5,11 @@ from gym.envs.mujoco import mujoco_env
|
||||
|
||||
|
||||
class PusherEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
"""
|
||||
### Description
|
||||
"Pusher" is a multi-jointed robot arm which is very similar to that of a human.
|
||||
The goal is to move a target cylinder (called *object*) to a goal position using the robot's end effector (called *fingertip*).
|
||||
The robot consists of shoulder, elbow, forearm, and wrist joints.
|
||||
|
||||
### Action Space
|
||||
The action space is a `Box(-2, 2, (7,), float32)`. An action represents the torques applied at each of the seven hinge joints.
|
||||
|
||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||
|-----|--------------------------------------------------------------------|-------------|-------------|----------------------------------|-------|--------------|
|
||||
| 0 | Rotation of the panning the shoulder | -2 | 2 | r_shoulder_pan_joint | hinge | torque (N m) |
|
||||
| 1 | Rotation of the shoulder lifting joint | -2 | 2 | r_shoulder_lift_joint | hinge | torque (N m) |
|
||||
| 2 | Rotation of the shoulder rolling joint | -2 | 2 | r_upper_arm_roll_joint | hinge | torque (N m) |
|
||||
| 3 | Rotation of hinge joint that flexed the elbow | -2 | 2 | r_elbow_flex_joint | hinge | torque (N m) |
|
||||
| 4 | Rotation of hinge that rolls the forearm | -2 | 2 | r_forearm_roll_joint | hinge | torque (N m) |
|
||||
| 5 | Rotation of flexing the wrist | -2 | 2 | r_wrist_flex_joint | hinge | torque (N m) |
|
||||
| 6 | Rotation of rolling the wrist | -2 | 2 | r_wrist_roll_joint | hinge | torque (N m) |
|
||||
|
||||
### Observation Space
|
||||
|
||||
Observations consist of
|
||||
|
||||
- Angle of rotational joints on the pusher
|
||||
- Angular velocities of rotational joints on the pusher
|
||||
- The coordinates of the fingertip of the pusher
|
||||
- The coordinates of the object to be moved
|
||||
- The coordinates of the goal position
|
||||
|
||||
The observation is a `ndarray` with shape `(23,)` where the elements correspond to the table below.
|
||||
An analogy can be drawn to a human arm in order to help understand the state space, with the words flex and roll meaning the
|
||||
same as human joints.
|
||||
|
||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
||||
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
||||
| 0 | Rotation of the panning the shoulder | -Inf | Inf | r_shoulder_pan_joint | hinge | angle (rad) |
|
||||
| 1 | Rotation of the shoulder lifting joint | -Inf | Inf | r_shoulder_lift_joint | hinge | angle (rad) |
|
||||
| 2 | Rotation of the shoulder rolling joint | -Inf | Inf | r_upper_arm_roll_joint | hinge | angle (rad) |
|
||||
| 3 | Rotation of hinge joint that flexed the elbow | -Inf | Inf | r_elbow_flex_joint | hinge | angle (rad) |
|
||||
| 4 | Rotation of hinge that rolls the forearm | -Inf | Inf | r_forearm_roll_joint | hinge | angle (rad) |
|
||||
| 5 | Rotation of flexing the wrist | -Inf | Inf | r_wrist_flex_joint | hinge | angle (rad) |
|
||||
| 6 | Rotation of rolling the wrist | -Inf | Inf | r_wrist_roll_joint | hinge | angle (rad) |
|
||||
| 7 | Rotational velocity of the panning the shoulder | -Inf | Inf | r_shoulder_pan_joint | hinge | angular velocity (rad/s) |
|
||||
| 8 | Rotational velocity of the shoulder lifting joint | -Inf | Inf | r_shoulder_lift_joint | hinge | angular velocity (rad/s) |
|
||||
| 9 | Rotational velocity of the shoulder rolling joint | -Inf | Inf | r_upper_arm_roll_joint | hinge | angular velocity (rad/s) |
|
||||
| 10 | Rotational velocity of hinge joint that flexed the elbow | -Inf | Inf | r_elbow_flex_joint | hinge | angular velocity (rad/s) |
|
||||
| 11 | Rotational velocity of hinge that rolls the forearm | -Inf | Inf | r_forearm_roll_joint | hinge | angular velocity (rad/s) |
|
||||
| 12 | Rotational velocity of flexing the wrist | -Inf | Inf | r_wrist_flex_joint | hinge | angular velocity (rad/s) |
|
||||
| 13 | Rotational velocity of rolling the wrist | -Inf | Inf | r_wrist_roll_joint | hinge | angular velocity (rad/s) |
|
||||
| 14 | x-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) |
|
||||
| 15 | y-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) |
|
||||
| 16 | z-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) |
|
||||
| 17 | x-coordinate of the object to be moved | -Inf | Inf | object (obj_slidex) | slide | position (m) |
|
||||
| 18 | y-coordinate of the object to be moved | -Inf | Inf | object (obj_slidey) | slide | position (m) |
|
||||
| 19 | z-coordinate of the object to be moved | -Inf | Inf | object | cylinder | position (m) |
|
||||
| 20 | x-coordinate of the goal position of the object | -Inf | Inf | goal (goal_slidex) | slide | position (m) |
|
||||
| 21 | y-coordinate of the goal position of the object | -Inf | Inf | goal (goal_slidey) | slide | position (m) |
|
||||
| 22 | z-coordinate of the goal position of the object | -Inf | Inf | goal | sphere | position (m) |
|
||||
|
||||
|
||||
### Rewards
|
||||
The reward consists of three parts:
|
||||
- *reward_near*: This reward is a measure of how far the *fingertip*
|
||||
of the pusher (the unattached end) is from the object, with a more negative
|
||||
value assigned when the pusher's *fingertip* is further away from the
|
||||
target. It is calculated as the negative vector norm of (position of
|
||||
the fingertip - position of target), or *-norm("fingertip" - "target")*.
|
||||
- *reward_dist*: This reward is a measure of how far the object is from
|
||||
the target goal position, with a more negative value assigned when the object is
|
||||
further away from the target. It is calculated as the negative vector norm of
|
||||
(position of the object - position of goal), or *-norm("object" - "target")*.
|
||||
- *reward_control*: A negative reward for penalising the pusher if
|
||||
it takes actions that are too large. It is measured as the negative squared
|
||||
Euclidean norm of the action, i.e. as *- sum(action<sup>2</sup>)*.
|
||||
|
||||
The total reward returned is ***reward*** *=* *reward_dist + 0.1 * reward_ctrl + 0.5 * reward_near*
|
||||
|
||||
Unlike other environments, Pusher does not allow you to specify weights for the individual reward terms.
|
||||
However, `info` does contain the keys *reward_dist* and *reward_ctrl*. Thus, if you'd like to weight the terms,
|
||||
you should create a wrapper that computes the weighted reward from `info`.
|
||||
|
||||
|
||||
### Starting State
|
||||
All pusher (not including object and goal) states start in
|
||||
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0). A uniform noise in the range
|
||||
[-0.005, 0.005] is added to the velocity attributes only. The velocities of
|
||||
the object and goal are permanently set to 0. The object's x-position is selected uniformly
|
||||
between [-0.3, 0] while the y-position is selected uniformly between [-0.2, 0.2], and this
|
||||
process is repeated until the vector norm between the object's (x,y) position and origin is not greater
|
||||
than 0.17. The goal always has the same position of (0.45, -0.05, -0.323).
|
||||
|
||||
The default frame skip is 5, with each frame lasting 0.01 seconds, giving rise to a *dt = 5 * 0.01 = 0.05*
|
||||
|
||||
### Episode Termination
|
||||
|
||||
The episode terminates when any of the following happens:
|
||||
|
||||
1. The episode duration reaches 100 timesteps.
|
||||
2. Any of the state space values is no longer finite.
|
||||
|
||||
### Arguments
|
||||
|
||||
No additional arguments are currently supported (in v2 and lower),
|
||||
but modifications can be made to the XML file in the assets folder
|
||||
(or by changing the path to a modified XML file in another folder).
|
||||
|
||||
```
|
||||
env = gym.make('Pusher-v2')
|
||||
```
|
||||
|
||||
There is no v3 for Pusher, unlike the robot environments where a v3 and
|
||||
beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||
|
||||
|
||||
### Version History
|
||||
|
||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||
* v1: max_time_steps raised to 1000 for robot based tasks (not including reacher, which has a max_time_steps of 50). Added reward_threshold to environments.
|
||||
* v0: Initial versions release (1.0.0)
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
utils.EzPickle.__init__(self)
|
||||
mujoco_env.MujocoEnv.__init__(self, "pusher.xml", 5)
|
||||
mujoco_env.MujocoEnv.__init__(
|
||||
self, "pusher.xml", 5, mujoco_bindings="mujoco_py"
|
||||
)
|
||||
|
||||
def step(self, a):
|
||||
vec_1 = self.get_body_com("object") - self.get_body_com("tips_arm")
|
||||
|
185
gym/envs/mujoco/pusher_v4.py
Normal file
@@ -0,0 +1,185 @@
|
||||
import numpy as np
|
||||
|
||||
from gym import utils
|
||||
from gym.envs.mujoco import mujoco_env
|
||||
|
||||
|
||||
class PusherEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
"""
|
||||
### Description
|
||||
"Pusher" is a multi-jointed robot arm which is very similar to that of a human.
|
||||
The goal is to move a target cylinder (called *object*) to a goal position using the robot's end effector (called *fingertip*).
|
||||
The robot consists of shoulder, elbow, forearm, and wrist joints.
|
||||
|
||||
### Action Space
|
||||
The action space is a `Box(-2, 2, (7,), float32)`. An action represents the torques applied at each of the seven hinge joints.
|
||||
|
||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||
|-----|--------------------------------------------------------------------|-------------|-------------|----------------------------------|-------|--------------|
|
||||
| 0 | Rotation of the panning the shoulder | -2 | 2 | r_shoulder_pan_joint | hinge | torque (N m) |
|
||||
| 1 | Rotation of the shoulder lifting joint | -2 | 2 | r_shoulder_lift_joint | hinge | torque (N m) |
|
||||
| 2 | Rotation of the shoulder rolling joint | -2 | 2 | r_upper_arm_roll_joint | hinge | torque (N m) |
|
||||
| 3 | Rotation of hinge joint that flexed the elbow | -2 | 2 | r_elbow_flex_joint | hinge | torque (N m) |
|
||||
| 4 | Rotation of hinge that rolls the forearm | -2 | 2 | r_forearm_roll_joint | hinge | torque (N m) |
|
||||
| 5 | Rotation of flexing the wrist | -2 | 2 | r_wrist_flex_joint | hinge | torque (N m) |
|
||||
| 6 | Rotation of rolling the wrist | -2 | 2 | r_wrist_roll_joint | hinge | torque (N m) |
|
||||
|
||||
### Observation Space
|
||||
|
||||
Observations consist of
|
||||
|
||||
- Angle of rotational joints on the pusher
|
||||
- Angular velocities of rotational joints on the pusher
|
||||
- The coordinates of the fingertip of the pusher
|
||||
- The coordinates of the object to be moved
|
||||
- The coordinates of the goal position
|
||||
|
||||
The observation is a `ndarray` with shape `(23,)` where the elements correspond to the table below.
|
||||
An analogy can be drawn to a human arm in order to help understand the state space, with the words flex and roll meaning the
|
||||
same as human joints.
|
||||
|
||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
||||
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
||||
| 0 | Rotation of the panning the shoulder | -Inf | Inf | r_shoulder_pan_joint | hinge | angle (rad) |
|
||||
| 1 | Rotation of the shoulder lifting joint | -Inf | Inf | r_shoulder_lift_joint | hinge | angle (rad) |
|
||||
| 2 | Rotation of the shoulder rolling joint | -Inf | Inf | r_upper_arm_roll_joint | hinge | angle (rad) |
|
||||
| 3 | Rotation of hinge joint that flexed the elbow | -Inf | Inf | r_elbow_flex_joint | hinge | angle (rad) |
|
||||
| 4 | Rotation of hinge that rolls the forearm | -Inf | Inf | r_forearm_roll_joint | hinge | angle (rad) |
|
||||
| 5 | Rotation of flexing the wrist | -Inf | Inf | r_wrist_flex_joint | hinge | angle (rad) |
|
||||
| 6 | Rotation of rolling the wrist | -Inf | Inf | r_wrist_roll_joint | hinge | angle (rad) |
|
||||
| 7 | Rotational velocity of the panning the shoulder | -Inf | Inf | r_shoulder_pan_joint | hinge | angular velocity (rad/s) |
|
||||
| 8 | Rotational velocity of the shoulder lifting joint | -Inf | Inf | r_shoulder_lift_joint | hinge | angular velocity (rad/s) |
|
||||
| 9 | Rotational velocity of the shoulder rolling joint | -Inf | Inf | r_upper_arm_roll_joint | hinge | angular velocity (rad/s) |
|
||||
| 10 | Rotational velocity of hinge joint that flexed the elbow | -Inf | Inf | r_elbow_flex_joint | hinge | angular velocity (rad/s) |
|
||||
| 11 | Rotational velocity of hinge that rolls the forearm | -Inf | Inf | r_forearm_roll_joint | hinge | angular velocity (rad/s) |
|
||||
| 12 | Rotational velocity of flexing the wrist | -Inf | Inf | r_wrist_flex_joint | hinge | angular velocity (rad/s) |
|
||||
| 13 | Rotational velocity of rolling the wrist | -Inf | Inf | r_wrist_roll_joint | hinge | angular velocity (rad/s) |
|
||||
| 14 | x-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) |
|
||||
| 15 | y-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) |
|
||||
| 16 | z-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) |
|
||||
| 17 | x-coordinate of the object to be moved | -Inf | Inf | object (obj_slidex) | slide | position (m) |
|
||||
| 18 | y-coordinate of the object to be moved | -Inf | Inf | object (obj_slidey) | slide | position (m) |
|
||||
| 19 | z-coordinate of the object to be moved | -Inf | Inf | object | cylinder | position (m) |
|
||||
| 20 | x-coordinate of the goal position of the object | -Inf | Inf | goal (goal_slidex) | slide | position (m) |
|
||||
| 21 | y-coordinate of the goal position of the object | -Inf | Inf | goal (goal_slidey) | slide | position (m) |
|
||||
| 22 | z-coordinate of the goal position of the object | -Inf | Inf | goal | sphere | position (m) |
|
||||
|
||||
|
||||
### Rewards
|
||||
The reward consists of three parts:
|
||||
- *reward_near*: This reward is a measure of how far the *fingertip*
|
||||
of the pusher (the unattached end) is from the object, with a more negative
|
||||
value assigned when the pusher's *fingertip* is further away from the
|
||||
target. It is calculated as the negative vector norm of (position of
|
||||
the fingertip - position of target), or *-norm("fingertip" - "target")*.
|
||||
- *reward_dist*: This reward is a measure of how far the object is from
|
||||
the target goal position, with a more negative value assigned when the object is
|
||||
further away from the target. It is calculated as the negative vector norm of
|
||||
(position of the object - position of goal), or *-norm("object" - "target")*.
|
||||
- *reward_control*: A negative reward for penalising the pusher if
|
||||
it takes actions that are too large. It is measured as the negative squared
|
||||
Euclidean norm of the action, i.e. as *- sum(action<sup>2</sup>)*.
|
||||
|
||||
The total reward returned is ***reward*** *=* *reward_dist + 0.1 * reward_ctrl + 0.5 * reward_near*
|
||||
|
||||
Unlike other environments, Pusher does not allow you to specify weights for the individual reward terms.
|
||||
However, `info` does contain the keys *reward_dist* and *reward_ctrl*. Thus, if you'd like to weight the terms,
|
||||
you should create a wrapper that computes the weighted reward from `info`.
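A minimal sketch of such a wrapper (the class name and the weight values are illustrative, not part of the environment):

```
import gym


class WeightedPusherReward(gym.Wrapper):
    def __init__(self, env, dist_weight=1.0, ctrl_weight=0.1):
        super().__init__(env)
        self.dist_weight = dist_weight
        self.ctrl_weight = ctrl_weight

    def step(self, action):
        ob, reward, done, info = self.env.step(action)
        # recombine the reward terms exposed in `info` with custom weights
        reward = (
            self.dist_weight * info["reward_dist"]
            + self.ctrl_weight * info["reward_ctrl"]
        )
        return ob, reward, done, info
```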
|
||||
|
||||
|
||||
### Starting State
|
||||
All pusher (not including object and goal) states start in
|
||||
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0). A uniform noise in the range
|
||||
[-0.005, 0.005] is added to the velocity attributes only. The velocities of
|
||||
the object and goal are permanently set to 0. The object's x-position is selected uniformly
|
||||
between [-0.3, 0] while the y-position is selected uniformly between [-0.2, 0.2], and this
|
||||
process is repeated until the vector norm between the object's (x,y) position and origin is not greater
|
||||
than 0.17. The goal always has the same position of (0.45, -0.05, -0.323).
|
||||
|
||||
The default frame skip is 5, with each frame lasting 0.01 seconds, giving rise to a *dt = 5 * 0.01 = 0.05*
|
||||
|
||||
### Episode Termination
|
||||
|
||||
The episode terminates when any of the following happens:
|
||||
|
||||
1. The episode duration reaches 100 timesteps.
|
||||
2. Any of the state space values is no longer finite.
|
||||
|
||||
### Arguments
|
||||
|
||||
No additional arguments are currently supported (in v2 and lower),
|
||||
but modifications can be made to the XML file in the assets folder
|
||||
(or by changing the path to a modified XML file in another folder).
|
||||
|
||||
```
|
||||
env = gym.make('Pusher-v4')
|
||||
```
|
||||
|
||||
There is no v3 for Pusher, unlike the robot environments where a v3 and
|
||||
beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||
|
||||
|
||||
### Version History
|
||||
|
||||
* v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3
|
||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||
* v1: max_time_steps raised to 1000 for robot based tasks (not including reacher, which has a max_time_steps of 50). Added reward_threshold to environments.
|
||||
* v0: Initial versions release (1.0.0)
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
utils.EzPickle.__init__(self)
|
||||
mujoco_env.MujocoEnv.__init__(self, "pusher.xml", 5)
|
||||
|
||||
def step(self, a):
|
||||
vec_1 = self.get_body_com("object") - self.get_body_com("tips_arm")
|
||||
vec_2 = self.get_body_com("object") - self.get_body_com("goal")
|
||||
|
||||
reward_near = -np.linalg.norm(vec_1)
|
||||
reward_dist = -np.linalg.norm(vec_2)
|
||||
reward_ctrl = -np.square(a).sum()
|
||||
reward = reward_dist + 0.1 * reward_ctrl + 0.5 * reward_near
|
||||
|
||||
self.do_simulation(a, self.frame_skip)
|
||||
ob = self._get_obs()
|
||||
done = False
|
||||
return ob, reward, done, dict(reward_dist=reward_dist, reward_ctrl=reward_ctrl)
|
||||
|
||||
def viewer_setup(self):
|
||||
self.viewer.cam.trackbodyid = -1
|
||||
self.viewer.cam.distance = 4.0
|
||||
|
||||
def reset_model(self):
|
||||
qpos = self.init_qpos
|
||||
|
||||
self.goal_pos = np.asarray([0, 0])
|
||||
while True:
|
||||
self.cylinder_pos = np.concatenate(
|
||||
[
|
||||
self.np_random.uniform(low=-0.3, high=0, size=1),
|
||||
self.np_random.uniform(low=-0.2, high=0.2, size=1),
|
||||
]
|
||||
)
|
||||
if np.linalg.norm(self.cylinder_pos - self.goal_pos) > 0.17:
|
||||
break
|
||||
|
||||
qpos[-4:-2] = self.cylinder_pos
|
||||
qpos[-2:] = self.goal_pos
|
||||
qvel = self.init_qvel + self.np_random.uniform(
|
||||
low=-0.005, high=0.005, size=self.model.nv
|
||||
)
|
||||
qvel[-4:] = 0
|
||||
self.set_state(qpos, qvel)
|
||||
return self._get_obs()
|
||||
|
||||
def _get_obs(self):
|
||||
return np.concatenate(
|
||||
[
|
||||
self.data.qpos.flat[:7],
|
||||
self.data.qvel.flat[:7],
|
||||
self.get_body_com("tips_arm"),
|
||||
self.get_body_com("object"),
|
||||
self.get_body_com("goal"),
|
||||
]
|
||||
)
|
@@ -5,121 +5,11 @@ from gym.envs.mujoco import mujoco_env
|
||||
|
||||
|
||||
class ReacherEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
"""
|
||||
### Description
|
||||
"Reacher" is a two-jointed robot arm. The goal is to move the robot's end effector (called *fingertip*) close to a
|
||||
target that is spawned at a random position.
|
||||
|
||||
### Action Space
|
||||
The action space is a `Box(-1, 1, (2,), float32)`. An action `(a, b)` represents the torques applied at the hinge joints.
|
||||
|
||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||
|-----|---------------------------------------------------------------------------------|-------------|-------------|--------------------------|-------|------|
|
||||
| 0 | Torque applied at the first hinge (connecting the link to the point of fixture) | -1 | 1 | joint0 | hinge | torque (N m) |
|
||||
| 1 | Torque applied at the second hinge (connecting the two links) | -1 | 1 | joint1 | hinge | torque (N m) |
|
||||
|
||||
### Observation Space
|
||||
|
||||
Observations consist of
|
||||
|
||||
- The cosine of the angles of the two arms
|
||||
- The sine of the angles of the two arms
|
||||
- The coordinates of the target
|
||||
- The angular velocities of the arms
|
||||
- The vector between the target and the reacher's fingertip (3 dimensional with the last element being 0)
|
||||
|
||||
The observation is a `ndarray` with shape `(11,)` where the elements correspond to the following:
|
||||
|
||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
||||
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
||||
| 0 | cosine of the angle of the first arm | -Inf | Inf | cos(joint0) | hinge | unitless |
|
||||
| 1 | cosine of the angle of the second arm | -Inf | Inf | cos(joint1) | hinge | unitless |
|
||||
| 2 | sine of the angle of the first arm | -Inf | Inf | sin(joint0) | hinge | unitless |
|
||||
| 3 | sine of the angle of the second arm | -Inf | Inf | sin(joint1) | hinge | unitless |
|
||||
| 4 | x-coordinate of the target | -Inf | Inf | target_x | slide | position (m) |
|
||||
| 5 | y-coordinate of the target | -Inf | Inf | target_y | slide | position (m) |
|
||||
| 6 | angular velocity of the first arm | -Inf | Inf | joint0 | hinge | angular velocity (rad/s) |
|
||||
| 7 | angular velocity of the second arm | -Inf | Inf | joint1 | hinge | angular velocity (rad/s) |
|
||||
| 8 | x-value of position_fingertip - position_target | -Inf | Inf | NA | slide | position (m) |
|
||||
| 9 | y-value of position_fingertip - position_target | -Inf | Inf | NA | slide | position (m) |
|
||||
| 10 | z-value of position_fingertip - position_target (0 since reacher is 2d and z is same for both) | -Inf | Inf | NA | slide | position (m) |
|
||||
|
||||
|
||||
Most Gym environments just return the positions and velocities of the
joints in the `.xml` file as the state of the environment. However, in
Reacher the state is created by combining only certain elements of the
position and velocity and applying some transformations to them
(see the sketch after the table below). Reading the `.xml` file for Reacher reveals 4 joints:
|
||||
|
||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|
||||
|-----|-----------------------------|----------|----------|----------------------------------|-------|--------------------|
|
||||
| 0 | angle of the first arm | -Inf | Inf | joint0 | hinge | angle (rad) |
|
||||
| 1 | angle of the second arm | -Inf | Inf | joint1 | hinge | angle (rad) |
|
||||
| 2 | x-coordinate of the target | -Inf | Inf | target_x | slide | position (m) |
|
||||
| 3 | y-coordinate of the target | -Inf | Inf | target_y | slide | position (m) |
|
||||
|
||||
|
||||
### Rewards
|
||||
The reward consists of two parts:
|
||||
- *reward_distance*: This reward is a measure of how far the *fingertip*
|
||||
of the reacher (the unattached end) is from the target, with a more negative
|
||||
value assigned for when the reacher's *fingertip* is further away from the
|
||||
target. It is calculated as the negative vector norm of (position of
|
||||
the fingertip - position of target), or *-norm("fingertip" - "target")*.
|
||||
- *reward_control*: A negative reward for penalising the reacher if
|
||||
it takes actions that are too large. It is measured as the negative squared
|
||||
Euclidean norm of the action, i.e. as *- sum(action<sup>2</sup>)*.
|
||||
|
||||
The total reward returned is ***reward*** *=* *reward_distance + reward_control*
|
||||
|
||||
Unlike other environments, Reacher does not allow you to specify weights for the individual reward terms.
|
||||
However, `info` does contain the keys *reward_dist* and *reward_ctrl*. Thus, if you'd like to weight the terms,
|
||||
you should create a wrapper that computes the weighted reward from `info`.
|
||||
|
||||
|
||||
### Starting State
|
||||
All observations start in state
|
||||
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
|
||||
with noise added for stochasticity. Uniform noise in the range
|
||||
[-0.1, 0.1] is added to the positional attributes, while the target position
|
||||
is selected uniformly at random in a disk of radius 0.2 around the origin.
|
||||
Independent, uniform noise in the
|
||||
range of [-0.005, 0.005] is added to the velocities, and the last
|
||||
element ("fingertip" - "target") is calculated at the end once everything
|
||||
is set. The default setting uses a frame_skip of 2 with a frametime of 0.01, giving *dt = 2 * 0.01 = 0.02*
|
||||
|
||||
### Episode Termination
|
||||
|
||||
The episode terminates when any of the following happens:
|
||||
|
||||
1. The episode duration reaches 50 timesteps (with a new random target popping up if the reacher's fingertip reaches it before 50 timesteps)
|
||||
2. Any of the state space values is no longer finite.
|
||||
|
||||
### Arguments
|
||||
|
||||
No additional arguments are currently supported (in v2 and lower),
|
||||
but modifications can be made to the XML file in the assets folder
|
||||
(or by changing the path to a modified XML file in another folder).
|
||||
|
||||
```
|
||||
env = gym.make('Reacher-v2')
|
||||
```
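A minimal usage sketch under this version's step API; the 50-step time limit ends the episode and `info` exposes the unweighted reward terms:

```
import gym

env = gym.make('Reacher-v2')
obs = env.reset()
done = False
while not done:
    action = env.action_space.sample()          # random torques in [-1, 1]
    obs, reward, done, info = env.step(action)  # done=True at the 50-step limit
print(info["reward_dist"], info["reward_ctrl"])
env.close()
```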
|
||||
|
||||
There is no v3 for Reacher, unlike the robot environments where v3 and
|
||||
beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||
|
||||
|
||||
### Version History
|
||||
|
||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||
* v1: max_time_steps raised to 1000 for robot based tasks (not including reacher, which has a max_time_steps of 50). Added reward_threshold to environments.
|
||||
* v0: Initial versions release (1.0.0)
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
utils.EzPickle.__init__(self)
|
||||
mujoco_env.MujocoEnv.__init__(self, "reacher.xml", 2)
|
||||
mujoco_env.MujocoEnv.__init__(
|
||||
self, "reacher.xml", 2, mujoco_bindings="mujoco_py"
|
||||
)
|
||||
|
||||
def step(self, a):
|
||||
vec = self.get_body_com("fingertip") - self.get_body_com("target")
|
||||
|
165
gym/envs/mujoco/reacher_v4.py
Normal file
@@ -0,0 +1,165 @@
|
||||
import numpy as np
|
||||
|
||||
from gym import utils
|
||||
from gym.envs.mujoco import mujoco_env
|
||||
|
||||
|
||||
class ReacherEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
"""
|
||||
### Description
|
||||
"Reacher" is a two-jointed robot arm. The goal is to move the robot's end effector (called *fingertip*) close to a
|
||||
target that is spawned at a random position.
|
||||
|
||||
### Action Space
|
||||
The action space is a `Box(-1, 1, (2,), float32)`. An action `(a, b)` represents the torques applied at the hinge joints.
|
||||
|
||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||
|-----|---------------------------------------------------------------------------------|-------------|-------------|--------------------------|-------|------|
|
||||
| 0 | Torque applied at the first hinge (connecting the link to the point of fixture) | -1 | 1 | joint0 | hinge | torque (N m) |
|
||||
| 1 | Torque applied at the second hinge (connecting the two links) | -1 | 1 | joint1 | hinge | torque (N m) |
|
||||
|
||||
### Observation Space
|
||||
|
||||
Observations consist of
|
||||
|
||||
- The cosine of the angles of the two arms
|
||||
- The sine of the angles of the two arms
|
||||
- The coordinates of the target
|
||||
- The angular velocities of the arms
|
||||
- The vector between the target and the reacher's fingertip (3 dimensional with the last element being 0)
|
||||
|
||||
The observation is a `ndarray` with shape `(11,)` where the elements correspond to the following:
|
||||
|
||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
||||
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
||||
| 0 | cosine of the angle of the first arm | -Inf | Inf | cos(joint0) | hinge | unitless |
|
||||
| 1 | cosine of the angle of the second arm | -Inf | Inf | cos(joint1) | hinge | unitless |
|
||||
| 2 | sine of the angle of the first arm | -Inf | Inf | sin(joint0) | hinge | unitless |
|
||||
| 3 | sine of the angle of the second arm | -Inf | Inf | sin(joint1) | hinge | unitless |
|
||||
| 4 | x-coordinate of the target | -Inf | Inf | target_x | slide | position (m) |
|
||||
| 5 | y-coordinate of the target | -Inf | Inf | target_y | slide | position (m) |
|
||||
| 6 | angular velocity of the first arm | -Inf | Inf | joint0 | hinge | angular velocity (rad/s) |
|
||||
| 7 | angular velocity of the second arm | -Inf | Inf | joint1 | hinge | angular velocity (rad/s) |
|
||||
| 8 | x-value of position_fingertip - position_target | -Inf | Inf | NA | slide | position (m) |
|
||||
| 9 | y-value of position_fingertip - position_target | -Inf | Inf | NA | slide | position (m) |
|
||||
| 10 | z-value of position_fingertip - position_target (0 since reacher is 2d and z is same for both) | -Inf | Inf | NA | slide | position (m) |
|
||||
|
||||
|
||||
Most Gym environments just return the positions and velocity of the
|
||||
joints in the `.xml` file as the state of the environment. However, in
|
||||
reacher the state is created by combining only certain elements of the
|
||||
position and velocity, and performing some function transformations on them.
|
||||
If one is to read the `.xml` for reacher then they will find 4 joints:
|
||||
|
||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|
||||
|-----|-----------------------------|----------|----------|----------------------------------|-------|--------------------|
|
||||
| 0 | angle of the first arm | -Inf | Inf | joint0 | hinge | angle (rad) |
|
||||
| 1 | angle of the second arm | -Inf | Inf | joint1 | hinge | angle (rad) |
|
||||
| 2 | x-coordinate of the target | -Inf | Inf | target_x | slide | position (m) |
|
||||
| 3 | y-coordinate of the target | -Inf | Inf | target_y | slide | position (m) |
|
||||
|
||||
|
||||
### Rewards
|
||||
The reward consists of two parts:
|
||||
- *reward_distance*: This reward is a measure of how far the *fingertip*
|
||||
of the reacher (the unattached end) is from the target, with a more negative
|
||||
value assigned for when the reacher's *fingertip* is further away from the
|
||||
target. It is calculated as the negative vector norm of (position of
|
||||
the fingertip - position of target), or *-norm("fingertip" - "target")*.
|
||||
- *reward_control*: A negative reward for penalising the reacher if
|
||||
it takes actions that are too large. It is measured as the negative squared
|
||||
Euclidean norm of the action, i.e. as *- sum(action<sup>2</sup>)*.
|
||||
|
||||
The total reward returned is ***reward*** *=* *reward_distance + reward_control*
|
||||
|
||||
Unlike other environments, Reacher does not allow you to specify weights for the individual reward terms.
|
||||
However, `info` does contain the keys *reward_dist* and *reward_ctrl*. Thus, if you'd like to weight the terms,
|
||||
you should create a wrapper that computes the weighted reward from `info`.
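A minimal sketch of such a wrapper (the class name and weights are illustrative), assuming the step API shown in this file:

```
import gym

class WeightedReacherReward(gym.Wrapper):
    """Recombine the reward terms exposed in `info` with custom weights."""

    def __init__(self, env, dist_weight=1.0, ctrl_weight=0.5):
        super().__init__(env)
        self.dist_weight = dist_weight
        self.ctrl_weight = ctrl_weight

    def step(self, action):
        obs, _, done, info = self.env.step(action)
        reward = (self.dist_weight * info["reward_dist"]
                  + self.ctrl_weight * info["reward_ctrl"])
        return obs, reward, done, info

env = WeightedReacherReward(gym.make('Reacher-v4'))
```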
|
||||
|
||||
|
||||
### Starting State
|
||||
All observations start in state
|
||||
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
|
||||
with noise added for stochasticity. Uniform noise in the range
|
||||
[-0.1, 0.1] is added to the positional attributes, while the target position
|
||||
is selected uniformly at random in a disk of radius 0.2 around the origin.
|
||||
Independent, uniform noise in the
|
||||
range of [-0.005, 0.005] is added to the velocities, and the last
|
||||
element ("fingertip" - "target") is calculated at the end once everything
|
||||
is set. The default setting uses a frame_skip of 2 with a frametime of 0.01, giving *dt = 2 * 0.01 = 0.02*
|
||||
|
||||
### Episode Termination
|
||||
|
||||
The episode terminates when any of the following happens:
|
||||
|
||||
1. The episode duration reaches 50 timesteps (with a new random target popping up if the reacher's fingertip reaches it before 50 timesteps)
|
||||
2. Any of the state space values is no longer finite.
|
||||
|
||||
### Arguments
|
||||
|
||||
No additional arguments are currently supported (in v2 and lower),
|
||||
but modifications can be made to the XML file in the assets folder
|
||||
(or by changing the path to a modified XML file in another folder).
|
||||
|
||||
```
|
||||
env = gym.make('Reacher-v2')
|
||||
```
|
||||
|
||||
There is no v3 for Reacher, unlike the robot environments where v3 and
|
||||
beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||
|
||||
|
||||
### Version History
|
||||
|
||||
* v4: All MuJoCo environments now use the mujoco bindings in mujoco>=2.1.3
|
||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||
* v1: max_time_steps raised to 1000 for robot based tasks (not including reacher, which has a max_time_steps of 50). Added reward_threshold to environments.
|
||||
* v0: Initial versions release (1.0.0)
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
utils.EzPickle.__init__(self)
|
||||
mujoco_env.MujocoEnv.__init__(self, "reacher.xml", 2)
|
||||
|
||||
def step(self, a):
|
||||
vec = self.get_body_com("fingertip") - self.get_body_com("target")
|
||||
reward_dist = -np.linalg.norm(vec)
|
||||
reward_ctrl = -np.square(a).sum()
|
||||
reward = reward_dist + reward_ctrl
|
||||
self.do_simulation(a, self.frame_skip)
|
||||
ob = self._get_obs()
|
||||
done = False
|
||||
return ob, reward, done, dict(reward_dist=reward_dist, reward_ctrl=reward_ctrl)
|
||||
|
||||
def viewer_setup(self):
|
||||
self.viewer.cam.trackbodyid = 0
|
||||
|
||||
def reset_model(self):
|
||||
qpos = (
|
||||
self.np_random.uniform(low=-0.1, high=0.1, size=self.model.nq)
|
||||
+ self.init_qpos
|
||||
)
|
||||
while True:
|
||||
self.goal = self.np_random.uniform(low=-0.2, high=0.2, size=2)
|
||||
if np.linalg.norm(self.goal) < 0.2:
|
||||
break
|
||||
qpos[-2:] = self.goal
|
||||
qvel = self.init_qvel + self.np_random.uniform(
|
||||
low=-0.005, high=0.005, size=self.model.nv
|
||||
)
|
||||
qvel[-2:] = 0
|
||||
self.set_state(qpos, qvel)
|
||||
return self._get_obs()
|
||||
|
||||
def _get_obs(self):
|
||||
theta = self.data.qpos.flat[:2]
|
||||
return np.concatenate(
|
||||
[
|
||||
np.cos(theta),
|
||||
np.sin(theta),
|
||||
self.data.qpos.flat[2:],
|
||||
self.data.qvel.flat[:2],
|
||||
self.get_body_com("fingertip") - self.get_body_com("target"),
|
||||
]
|
||||
)
|
@@ -6,7 +6,9 @@ from gym.envs.mujoco import mujoco_env
|
||||
|
||||
class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
def __init__(self):
|
||||
mujoco_env.MujocoEnv.__init__(self, "swimmer.xml", 4)
|
||||
mujoco_env.MujocoEnv.__init__(
|
||||
self, "swimmer.xml", 4, mujoco_bindings="mujoco_py"
|
||||
)
|
||||
utils.EzPickle.__init__(self)
|
||||
|
||||
def step(self, a):
|
||||
|
@@ -9,119 +9,6 @@ DEFAULT_CAMERA_CONFIG = {}
|
||||
|
||||
|
||||
class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
"""
|
||||
### Description
|
||||
|
||||
This environment corresponds to the Swimmer environment described in Rémi Coulom's PhD thesis
|
||||
["Reinforcement Learning Using Neural Networks, with Applications to Motor Control"](https://tel.archives-ouvertes.fr/tel-00003985/document).
|
||||
The environment aims to increase the number of independent state and control
|
||||
variables as compared to the classic control environments. The swimmers
|
||||
consist of three or more segments ('***links***') and one fewer articulation
|
||||
joint ('***rotors***') - one rotor joint connecting exactly two links to
|
||||
form a linear chain. The swimmer is suspended in a two dimensional pool and
|
||||
always starts in the same position (subject to some deviation drawn from a
|
||||
uniform distribution), and the goal is to move as fast as possible towards
|
||||
the right by applying torque on the rotors and using the fluid's friction.
|
||||
|
||||
### Notes
|
||||
|
||||
The problem parameters are:
|
||||
* *n*: number of body parts
|
||||
* *m<sub>i</sub>*: mass of part *i* (*i* ∈ {1...n})
|
||||
* *l<sub>i</sub>*: length of part *i* (*i* ∈ {1...n})
|
||||
* *k*: viscous-friction coefficient
|
||||
|
||||
The default environment has *n* = 3, *l<sub>i</sub>* = 0.1,
|
||||
and *k* = 0.1. It is possible to pass a custom MuJoCo XML file during construction to increase the
|
||||
number of links, or to tweak any of the parameters.
|
||||
|
||||
### Action Space
|
||||
The action space is a `Box(-1, 1, (2,), float32)`. An action represents the torques applied between *links*
|
||||
|
||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||
|-----|------------------------------------|-------------|-------------|----------------------------------|-------|--------------|
|
||||
| 0 | Torque applied on the first rotor | -1 | 1 | rot2 | hinge | torque (N m) |
|
||||
| 1 | Torque applied on the second rotor | -1 | 1 | rot3 | hinge | torque (N m) |
|
||||
|
||||
### Observation Space
|
||||
|
||||
By default, observations consist of:
|
||||
* θ<sub>i</sub>: angle of part *i* with respect to the *x* axis
|
||||
* θ<sub>i</sub>': its derivative with respect to time (angular velocity)
|
||||
|
||||
In the default case, observations do not include the x- and y-coordinates of the front tip. These may
|
||||
be included by passing `exclude_current_positions_from_observation=False` during construction.
|
||||
Then, the observation space will have 10 dimensions where the first two dimensions
|
||||
represent the x- and y-coordinates of the front tip.
|
||||
Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates
|
||||
will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively.
|
||||
|
||||
By default, the observation is a `ndarray` with shape `(8,)` where the elements correspond to the following:
|
||||
|
||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
||||
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
||||
| 0 | angle of the front tip | -Inf | Inf | rot | hinge | angle (rad) |
|
||||
| 1 | angle of the second rotor | -Inf | Inf | rot2 | hinge | angle (rad) |
|
||||
| 2 | angle of the third rotor | -Inf | Inf | rot3 | hinge | angle (rad) |
|
||||
| 3 | velocity of the tip along the x-axis | -Inf | Inf | slider1 | slide | velocity (m/s) |
|
||||
| 4 | velocity of the tip along the y-axis | -Inf | Inf | slider2 | slide | velocity (m/s) |
|
||||
| 5 | angular velocity of front tip | -Inf | Inf | rot | hinge | angular velocity (rad/s) |
|
||||
| 6 | angular velocity of second rotor | -Inf | Inf | rot2 | hinge | angular velocity (rad/s) |
|
||||
| 7 | angular velocity of third rotor | -Inf | Inf | rot3 | hinge | angular velocity (rad/s) |
|
||||
|
||||
### Rewards
|
||||
The reward consists of two parts:
|
||||
- *forward_reward*: A reward of moving forward which is measured
|
||||
as *`forward_reward_weight` * (x-coordinate after action - x-coordinate before action)/dt*. *dt* is
|
||||
the time between actions and is dependent on the frame_skip parameter
|
||||
(default is 4), where the frametime is 0.01 - making the
|
||||
default *dt = 4 * 0.01 = 0.04*. This reward would be positive if the swimmer
|
||||
swims right as desired.
|
||||
- *ctrl_cost*: A cost for penalising the swimmer if it takes
|
||||
actions that are too large. It is measured as *`ctrl_cost_weight` *
|
||||
sum(action<sup>2</sup>)* where *`ctrl_cost_weight`* is a parameter set for the
|
||||
control and has a default value of 1e-4
|
||||
|
||||
The total reward returned is ***reward*** *=* *forward_reward - ctrl_cost* and `info` will also contain the individual reward terms
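As a small worked example of the forward term under the default settings (the x-coordinates below are made up):

```
dt = 4 * 0.01                     # frame_skip * frametime = 0.04
forward_reward_weight = 1.0
x_before, x_after = 0.00, 0.02    # front tip x-coordinate before/after one step
forward_reward = forward_reward_weight * (x_after - x_before) / dt  # = 0.5
```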
|
||||
|
||||
### Starting State
|
||||
All observations start in state (0,0,0,0,0,0,0,0) with uniform noise in the range of [-`reset_noise_scale`, `reset_noise_scale`] added to the initial state for stochasticity.
|
||||
|
||||
### Episode Termination
|
||||
The episode terminates when the episode length is greater than 1000.
|
||||
|
||||
### Arguments
|
||||
|
||||
No additional arguments are currently supported in v2 and lower.
|
||||
|
||||
```
|
||||
gym.make('Swimmer-v2')
|
||||
```
|
||||
|
||||
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||
|
||||
```
|
||||
env = gym.make('Swimmer-v3', ctrl_cost_weight=0.1, ....)
|
||||
```
|
||||
|
||||
| Parameter | Type | Default |Description |
|
||||
|-------------------------|------------|--------------|-------------------------------|
|
||||
| `xml_file` | **str** | `"swimmer.xml"`| Path to a MuJoCo model |
|
||||
| `forward_reward_weight` | **float** | `1.0` | Weight for *forward_reward* term (see section on reward) |
|
||||
| `ctrl_cost_weight` | **float** | `1e-4` | Weight for *ctrl_cost* term (see section on reward) |
|
||||
| `reset_noise_scale` | **float** | `0.1` | Scale of random perturbations of initial position and velocity (see section on Starting State) |
|
||||
| `exclude_current_positions_from_observation`| **bool** | `True`| Whether or not to omit the x- and y-coordinates from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies |
|
||||
|
||||
|
||||
### Version History
|
||||
|
||||
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
||||
* v0: Initial versions release (1.0.0)
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
xml_file="swimmer.xml",
|
||||
@@ -141,7 +28,7 @@ class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
exclude_current_positions_from_observation
|
||||
)
|
||||
|
||||
mujoco_env.MujocoEnv.__init__(self, xml_file, 4)
|
||||
mujoco_env.MujocoEnv.__init__(self, xml_file, 4, mujoco_bindings="mujoco_py")
|
||||
|
||||
def control_cost(self, action):
|
||||
control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
|
||||
|
217
gym/envs/mujoco/swimmer_v4.py
Normal file
@@ -0,0 +1,217 @@
|
||||
__credits__ = ["Rushiv Arora"]
|
||||
|
||||
import numpy as np
|
||||
|
||||
from gym import utils
|
||||
from gym.envs.mujoco import mujoco_env
|
||||
|
||||
DEFAULT_CAMERA_CONFIG = {}
|
||||
|
||||
|
||||
class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
"""
|
||||
### Description
|
||||
|
||||
This environment corresponds to the Swimmer environment described in Rémi Coulom's PhD thesis
|
||||
["Reinforcement Learning Using Neural Networks, with Applications to Motor Control"](https://tel.archives-ouvertes.fr/tel-00003985/document).
|
||||
The environment aims to increase the number of independent state and control
|
||||
variables as compared to the classic control environments. The swimmers
|
||||
consist of three or more segments ('***links***') and one fewer articulation
|
||||
joint ('***rotors***') - one rotor joint connecting exactly two links to
|
||||
form a linear chain. The swimmer is suspended in a two dimensional pool and
|
||||
always starts in the same position (subject to some deviation drawn from a
|
||||
uniform distribution), and the goal is to move as fast as possible towards
|
||||
the right by applying torque on the rotors and using the fluid's friction.
|
||||
|
||||
### Notes
|
||||
|
||||
The problem parameters are:
|
||||
* *n*: number of body parts
|
||||
* *m<sub>i</sub>*: mass of part *i* (*i* ∈ {1...n})
|
||||
* *l<sub>i</sub>*: length of part *i* (*i* ∈ {1...n})
|
||||
* *k*: viscous-friction coefficient
|
||||
|
||||
The default environment has *n* = 3, *l<sub>i</sub>* = 0.1,
|
||||
and *k* = 0.1. It is possible to tweak the MuJoCo XML files to increase the
|
||||
number of links, or to tweak any of the parameters.
|
||||
|
||||
### Action Space
|
||||
The agent takes a 2-element vector for actions.
|
||||
The action space is a continuous `(action, action)` in `[-1, 1]`, where
|
||||
`action` represents the numerical torques applied between *links*
|
||||
|
||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||
|-----|------------------------------------|-------------|-------------|----------------------------------|-------|--------------|
|
||||
| 0 | Torque applied on the first rotor | -1 | 1 | rot2 | hinge | torque (N m) |
|
||||
| 1 | Torque applied on the second rotor | -1 | 1 | rot3 | hinge | torque (N m) |
|
||||
|
||||
### Observation Space
|
||||
|
||||
The state space consists of:
|
||||
* A<sub>0</sub>: position of first point
|
||||
* θ<sub>i</sub>: angle of part *i* with respect to the *x* axis
|
||||
* A<sub>0</sub>, θ<sub>i</sub>: their derivatives with respect to time (velocity and angular velocity)
|
||||
|
||||
The full state is a `ndarray` with shape `(10,)` where the elements correspond to the following:
|
||||
|
||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
||||
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
||||
| 0 | x-coordinate of the front tip | -Inf | Inf | slider1 | slide | position (m) |
|
||||
| 1 | y-coordinate of the front tip | -Inf | Inf | slider2 | slide | position (m) |
|
||||
| 2 | angle of the front tip | -Inf | Inf | rot | hinge | angle (rad) |
|
||||
| 3 | angle of the second rotor | -Inf | Inf | rot2 | hinge | angle (rad) |
|
||||
| 4 | angle of the third rotor | -Inf | Inf | rot3 | hinge | angle (rad) |
|
||||
| 5 | velocity of the tip along the x-axis | -Inf | Inf | slider1 | slide | velocity (m/s) |
|
||||
| 6 | velocity of the tip along the y-axis | -Inf | Inf | slider2 | slide | velocity (m/s) |
|
||||
| 7 | angular velocity of front tip | -Inf | Inf | rot | hinge | angular velocity (rad/s) |
|
||||
| 8 | angular velocity of second rotor | -Inf | Inf | rot2 | hinge | angular velocity (rad/s) |
|
||||
| 9 | angular velocity of third rotor | -Inf | Inf | rot3 | hinge | angular velocity (rad/s) |
|
||||
|
||||
**Note:**
|
||||
In practice (and Gym implementation), the first two positional elements are
|
||||
omitted from the state space since the reward function is calculated based
|
||||
on those values. Therefore, observation space has shape `(8,)` and looks like:
|
||||
|
||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
||||
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
||||
| 0 | angle of the front tip | -Inf | Inf | rot | hinge | angle (rad) |
|
||||
| 1 | angle of the second rotor | -Inf | Inf | rot2 | hinge | angle (rad) |
|
||||
| 2 | angle of the third rotor | -Inf | Inf | rot3 | hinge | angle (rad) |
|
||||
| 3 | velocity of the tip along the x-axis | -Inf | Inf | slider1 | slide | velocity (m/s) |
|
||||
| 4 | velocity of the tip along the y-axis | -Inf | Inf | slider2 | slide | velocity (m/s) |
|
||||
| 5 | angular velocity of front tip | -Inf | Inf | rot | hinge | angular velocity (rad/s) |
|
||||
| 6 | angular velocity of second rotor | -Inf | Inf | rot2 | hinge | angular velocity (rad/s) |
|
||||
| 7 | angular velocity of third rotor | -Inf | Inf | rot3 | hinge | angular velocity (rad/s) |
|
||||
|
||||
### Rewards
|
||||
The reward consists of two parts:
|
||||
- *reward_front*: A reward of moving forward which is measured
|
||||
as *(x-coordinate after action - x-coordinate before action)/dt*. *dt* is
|
||||
the time between actions and is dependent on the frame_skip parameter
|
||||
(default is 4), where the *dt* for one frame is 0.01 - making the
|
||||
default *dt = 4 * 0.01 = 0.04*. This reward would be positive if the swimmer
|
||||
swims right as desired.
|
||||
- *reward_control*: A negative reward for penalising the swimmer if it takes
|
||||
actions that are too large. It is measured as *-coefficient x
|
||||
sum(action<sup>2</sup>)* where *coefficient* is a parameter set for the
|
||||
control and has a default value of 0.0001
|
||||
|
||||
The total reward returned is ***reward*** *=* *reward_front + reward_control*
|
||||
|
||||
### Starting State
|
||||
All observations start in state (0,0,0,0,0,0,0,0) with uniform noise in the range of [-0.1, 0.1] added to the initial state for stochasticity.
|
||||
|
||||
### Episode Termination
|
||||
The episode terminates when the episode length is greater than 1000.
|
||||
|
||||
### Arguments
|
||||
|
||||
No additional arguments are currently supported (in v2 and lower), but
|
||||
modifications can be made to the XML file in the assets folder
|
||||
(or by changing the path to a modified XML file in another folder).
|
||||
|
||||
```
|
||||
gym.make('Swimmer-v2')
|
||||
```
|
||||
|
||||
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||
|
||||
```
|
||||
env = gym.make('Swimmer-v3', ctrl_cost_weight=0.1, ....)
|
||||
```
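For instance, a modified model with more links can be used by pointing `xml_file` at it (the path below is hypothetical and must be provided by the user):

```
import gym

env = gym.make('Swimmer-v4', xml_file='/path/to/custom_swimmer.xml')
```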
|
||||
|
||||
### Version History
|
||||
|
||||
* v4: All MuJoCo environments now use the mujoco bindings in mujoco>=2.1.3
|
||||
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
||||
* v0: Initial versions release (1.0.0)
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
xml_file="swimmer.xml",
|
||||
forward_reward_weight=1.0,
|
||||
ctrl_cost_weight=1e-4,
|
||||
reset_noise_scale=0.1,
|
||||
exclude_current_positions_from_observation=True,
|
||||
):
|
||||
utils.EzPickle.__init__(**locals())
|
||||
|
||||
self._forward_reward_weight = forward_reward_weight
|
||||
self._ctrl_cost_weight = ctrl_cost_weight
|
||||
|
||||
self._reset_noise_scale = reset_noise_scale
|
||||
|
||||
self._exclude_current_positions_from_observation = (
|
||||
exclude_current_positions_from_observation
|
||||
)
|
||||
|
||||
mujoco_env.MujocoEnv.__init__(self, xml_file, 4)
|
||||
|
||||
def control_cost(self, action):
|
||||
control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
|
||||
return control_cost
|
||||
|
||||
def step(self, action):
|
||||
xy_position_before = self.data.qpos[0:2].copy()
|
||||
self.do_simulation(action, self.frame_skip)
|
||||
xy_position_after = self.data.qpos[0:2].copy()
|
||||
|
||||
xy_velocity = (xy_position_after - xy_position_before) / self.dt
|
||||
x_velocity, y_velocity = xy_velocity
|
||||
|
||||
forward_reward = self._forward_reward_weight * x_velocity
|
||||
|
||||
ctrl_cost = self.control_cost(action)
|
||||
|
||||
observation = self._get_obs()
|
||||
reward = forward_reward - ctrl_cost
|
||||
done = False
|
||||
info = {
|
||||
"reward_fwd": forward_reward,
|
||||
"reward_ctrl": -ctrl_cost,
|
||||
"x_position": xy_position_after[0],
|
||||
"y_position": xy_position_after[1],
|
||||
"distance_from_origin": np.linalg.norm(xy_position_after, ord=2),
|
||||
"x_velocity": x_velocity,
|
||||
"y_velocity": y_velocity,
|
||||
"forward_reward": forward_reward,
|
||||
}
|
||||
|
||||
return observation, reward, done, info
|
||||
|
||||
def _get_obs(self):
|
||||
position = self.data.qpos.flat.copy()
|
||||
velocity = self.data.qvel.flat.copy()
|
||||
|
||||
if self._exclude_current_positions_from_observation:
|
||||
position = position[2:]
|
||||
|
||||
observation = np.concatenate([position, velocity]).ravel()
|
||||
return observation
|
||||
|
||||
def reset_model(self):
|
||||
noise_low = -self._reset_noise_scale
|
||||
noise_high = self._reset_noise_scale
|
||||
|
||||
qpos = self.init_qpos + self.np_random.uniform(
|
||||
low=noise_low, high=noise_high, size=self.model.nq
|
||||
)
|
||||
qvel = self.init_qvel + self.np_random.uniform(
|
||||
low=noise_low, high=noise_high, size=self.model.nv
|
||||
)
|
||||
|
||||
self.set_state(qpos, qvel)
|
||||
|
||||
observation = self._get_obs()
|
||||
return observation
|
||||
|
||||
def viewer_setup(self):
|
||||
for key, value in DEFAULT_CAMERA_CONFIG.items():
|
||||
if isinstance(value, np.ndarray):
|
||||
getattr(self.viewer.cam, key)[:] = value
|
||||
else:
|
||||
setattr(self.viewer.cam, key, value)
|
@@ -6,7 +6,9 @@ from gym.envs.mujoco import mujoco_env
|
||||
|
||||
class Walker2dEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
def __init__(self):
|
||||
mujoco_env.MujocoEnv.__init__(self, "walker2d.xml", 4)
|
||||
mujoco_env.MujocoEnv.__init__(
|
||||
self, "walker2d.xml", 4, mujoco_bindings="mujoco_py"
|
||||
)
|
||||
utils.EzPickle.__init__(self)
|
||||
|
||||
def step(self, a):
|
||||
|
@@ -12,137 +12,6 @@ DEFAULT_CAMERA_CONFIG = {
|
||||
|
||||
|
||||
class Walker2dEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
"""
|
||||
### Description
|
||||
|
||||
This environment builds on the hopper environment based on the work done by Erez, Tassa, and Todorov
|
||||
in ["Infinite Horizon Model Predictive Control for Nonlinear Periodic Tasks"](http://www.roboticsproceedings.org/rss07/p10.pdf)
|
||||
by adding another set of legs making it possible for the robot to walk forward instead of
|
||||
hop. Like other Mujoco environments, this environment aims to increase the number of independent state
|
||||
and control variables as compared to the classic control environments. The walker is a
|
||||
two-dimensional two-legged figure that consists of four main body parts - a single torso at the top
|
||||
(with the two legs splitting after the torso), two thighs in the middle below the torso, two legs
|
||||
in the bottom below the thighs, and two feet attached to the legs on which the entire body rests.
|
||||
The goal is to coordinate both sets of feet, legs, and thighs to move in the forward (right)
|
||||
direction by applying torques on the six hinges connecting the six body parts.
|
||||
|
||||
### Action Space
|
||||
The action space is a `Box(-1, 1, (6,), float32)`. An action represents the torques applied at the hinge joints.
|
||||
|
||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||
|-----|----------------------------------------|-------------|-------------|----------------------------------|-------|--------------|
|
||||
| 0 | Torque applied on the thigh rotor | -1 | 1 | thigh_joint | hinge | torque (N m) |
|
||||
| 1 | Torque applied on the leg rotor | -1 | 1 | leg_joint | hinge | torque (N m) |
|
||||
| 2 | Torque applied on the foot rotor | -1 | 1 | foot_joint | hinge | torque (N m) |
|
||||
| 3 | Torque applied on the left thigh rotor | -1 | 1 | thigh_left_joint | hinge | torque (N m) |
|
||||
| 4 | Torque applied on the left leg rotor | -1 | 1 | leg_left_joint | hinge | torque (N m) |
|
||||
| 5 | Torque applied on the left foot rotor | -1 | 1 | foot_left_joint | hinge | torque (N m) |
|
||||
|
||||
### Observation Space
|
||||
|
||||
Observations consist of positional values of different body parts of the walker,
|
||||
followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities.
|
||||
|
||||
By default, observations do not include the x-coordinate of the top. It may
|
||||
be included by passing `exclude_current_positions_from_observation=False` during construction.
|
||||
In that case, the observation space will have 18 dimensions where the first dimension
|
||||
represents the x-coordinate of the top of the walker.
|
||||
Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x-coordinate
|
||||
of the top will be returned in `info` with key `"x_position"`.
|
||||
|
||||
By default, observation is a `ndarray` with shape `(17,)` where the elements correspond to the following:
|
||||
|
||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|
||||
|-----|--------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
|
||||
| 0 | z-coordinate of the top (height of the walker) | -Inf | Inf | rootz (torso) | slide | position (m) |
|
||||
| 1 | angle of the top | -Inf | Inf | rooty (torso) | hinge | angle (rad) |
|
||||
| 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) |
|
||||
| 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) |
|
||||
| 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) |
|
||||
| 5 | angle of the left thigh joint | -Inf | Inf | thigh_left_joint | hinge | angle (rad) |
|
||||
| 6 | angle of the left leg joint | -Inf | Inf | leg_left_joint | hinge | angle (rad) |
|
||||
| 7 | angle of the left foot joint | -Inf | Inf | foot_left_joint | hinge | angle (rad) |
|
||||
| 8 | velocity of the x-coordinate of the top | -Inf | Inf | rootx | slide | velocity (m/s) |
|
||||
| 9 | velocity of the z-coordinate (height) of the top | -Inf | Inf | rootz | slide | velocity (m/s) |
|
||||
| 10 | angular velocity of the angle of the top | -Inf | Inf | rooty | hinge | angular velocity (rad/s) |
|
||||
| 11 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) |
|
||||
| 12 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) |
|
||||
| 13 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) |
|
||||
| 14 | angular velocity of the left thigh hinge | -Inf | Inf | thigh_left_joint | hinge | angular velocity (rad/s) |
|
||||
| 15 | angular velocity of the left leg hinge | -Inf | Inf | leg_left_joint | hinge | angular velocity (rad/s) |
|
||||
| 16 | angular velocity of the left foot hinge | -Inf | Inf | foot_left_joint | hinge | angular velocity (rad/s) |
|
||||
|
||||
### Rewards
|
||||
The reward consists of three parts:
|
||||
- *healthy_reward*: Every timestep that the walker is alive, it receives a fixed reward of value `healthy_reward`,
|
||||
- *forward_reward*: A reward of walking forward which is measured as
|
||||
*`forward_reward_weight` * (x-coordinate after action - x-coordinate before action)/dt*.
|
||||
*dt* is the time between actions and is dependent on the frame_skip parameter
|
||||
(default is 4), where the frametime is 0.002 - making the default
|
||||
*dt = 4 * 0.002 = 0.008*. This reward would be positive if the walker walks forward (right) as desired.
|
||||
- *ctrl_cost*: A cost for penalising the walker if it
|
||||
takes actions that are too large. It is measured as
|
||||
*`ctrl_cost_weight` * sum(action<sup>2</sup>)* where *`ctrl_cost_weight`* is
|
||||
a parameter set for the control and has a default value of 0.001
|
||||
|
||||
The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost* and `info` will also contain the individual reward terms
|
||||
|
||||
### Starting State
|
||||
All observations start in state
|
||||
(0.0, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
|
||||
with a uniform noise in the range of [-`reset_noise_scale`, `reset_noise_scale`] added to the values for stochasticity.
|
||||
|
||||
### Episode Termination
|
||||
The walker is said to be unhealthy if any of the following happens:
|
||||
|
||||
1. Any of the state space values is no longer finite
|
||||
2. The height of the walker is ***not*** in the closed interval specified by `healthy_z_range`
|
||||
3. The absolute value of the angle (`observation[1]` if `exclude_current_positions_from_observation=False`, else `observation[2]`) is ***not*** in the closed interval specified by `healthy_angle_range`
|
||||
|
||||
If `terminate_when_unhealthy=True` is passed during construction (which is the default),
|
||||
the episode terminates when any of the following happens:
|
||||
|
||||
1. The episode duration reaches 1000 timesteps
|
||||
2. The walker is unhealthy
|
||||
|
||||
If `terminate_when_unhealthy=False` is passed, the episode is terminated only when 1000 timesteps are exceeded.
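A minimal sketch of these health conditions (an illustrative helper, not the environment's own code; `z` and `angle` are the height and torso angle described above, and the defaults mirror the constructor arguments):

```
import numpy as np

def is_walker_healthy(observation, z, angle,
                      healthy_z_range=(0.8, 2.0),
                      healthy_angle_range=(-1.0, 1.0)):
    # Condition 1: every state value must be finite.
    finite = np.isfinite(observation).all()
    # Conditions 2 and 3: height and torso angle must stay inside their ranges.
    healthy_z = healthy_z_range[0] < z < healthy_z_range[1]
    healthy_angle = healthy_angle_range[0] < angle < healthy_angle_range[1]
    return bool(finite and healthy_z and healthy_angle)
```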
|
||||
|
||||
### Arguments
|
||||
|
||||
No additional arguments are currently supported in v2 and lower.
|
||||
|
||||
```
|
||||
env = gym.make('Walker2d-v2')
|
||||
```
|
||||
|
||||
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||
|
||||
```
|
||||
env = gym.make('Walker2d-v3', ctrl_cost_weight=0.1, ....)
|
||||
```
|
||||
|
||||
| Parameter | Type | Default |Description |
|
||||
|-------------------------|------------|--------------|-------------------------------|
|
||||
| `xml_file` | **str** | `"walker2d.xml"` | Path to a MuJoCo model |
|
||||
| `forward_reward_weight` | **float** | `1.0` | Weight for *forward_reward* term (see section on reward) |
|
||||
| `ctrl_cost_weight` | **float** | `1e-3` | Weight for *ctrl_cost* term (see section on reward) |
|
||||
| `healthy_reward` | **float** | `1.0` | Constant reward given if the walker is "healthy" after timestep |
|
||||
| `terminate_when_unhealthy` | **bool**| `True` | If true, issue a done signal if the z-coordinate of the walker is no longer healthy |
|
||||
| `healthy_z_range` | **tuple** | `(0.8, 2)` | The z-coordinate of the top of the walker must be in this range to be considered healthy |
|
||||
| `healthy_angle_range` | **tuple** | `(-1, 1)` | The angle must be in this range to be considered healthy|
|
||||
| `reset_noise_scale` | **float** | `5e-3` | Scale of random perturbations of initial position and velocity (see section on Starting State) |
|
||||
| `exclude_current_positions_from_observation`| **bool** | `True`| Whether or not to omit the x-coordinate from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies |
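For example, several of these parameters can be combined in one call (the values below are illustrative):

```
import gym

env = gym.make(
    'Walker2d-v3',
    ctrl_cost_weight=0.1,
    healthy_z_range=(0.5, 3.0),
    terminate_when_unhealthy=False,
)
```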
|
||||
|
||||
|
||||
### Version History
|
||||
|
||||
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
||||
* v0: Initial versions release (1.0.0)
|
||||
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
xml_file="walker2d.xml",
|
||||
@@ -172,7 +41,7 @@ class Walker2dEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
exclude_current_positions_from_observation
|
||||
)
|
||||
|
||||
mujoco_env.MujocoEnv.__init__(self, xml_file, 4)
|
||||
mujoco_env.MujocoEnv.__init__(self, xml_file, 4, mujoco_bindings="mujoco_py")
|
||||
|
||||
@property
|
||||
def healthy_reward(self):
|
||||
|
276
gym/envs/mujoco/walker2d_v4.py
Normal file
@@ -0,0 +1,276 @@
|
||||
import numpy as np
|
||||
|
||||
from gym import utils
|
||||
from gym.envs.mujoco import mujoco_env
|
||||
|
||||
DEFAULT_CAMERA_CONFIG = {
|
||||
"trackbodyid": 2,
|
||||
"distance": 4.0,
|
||||
"lookat": np.array((0.0, 0.0, 1.15)),
|
||||
"elevation": -20.0,
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
class Walker2dEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
"""
|
||||
### Description
|
||||
|
||||
This environment builds on the hopper environment based on the work done by Erez, Tassa, and Todorov
|
||||
in ["Infinite Horizon Model Predictive Control for Nonlinear Periodic Tasks"](http://www.roboticsproceedings.org/rss07/p10.pdf)
|
||||
by adding another set of legs making it possible for the robot to walk forward instead of
|
||||
hop. Like other Mujoco environments, this environment aims to increase the number of independent state
|
||||
and control variables as compared to the classic control environments. The walker is a
|
||||
two-dimensional two-legged figure that consists of four main body parts - a single torso at the top
|
||||
(with the two legs splitting after the torso), two thighs in the middle below the torso, two legs
|
||||
in the bottom below the thighs, and two feet attached to the legs on which the entire body rests.
|
||||
The goal is to coordinate both sets of feet, legs, and thighs to move in the forward (right)
|
||||
direction by applying torques on the six hinges connecting the six body parts.
|
||||
|
||||
### Action Space
|
||||
The agent takes a 6-element vector for actions.
|
||||
The action space is a continuous `(action, action, action, action, action, action)`
|
||||
all in `[-1, 1]`, where `action` represents the numerical torques applied at the hinge joints.
|
||||
|
||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||
|-----|----------------------------------------|-------------|-------------|----------------------------------|-------|--------------|
|
||||
| 0 | Torque applied on the thigh rotor | -1 | 1 | thigh_joint | hinge | torque (N m) |
|
||||
| 1 | Torque applied on the leg rotor | -1 | 1 | leg_joint | hinge | torque (N m) |
|
||||
| 2 | Torque applied on the foot rotor | -1 | 1 | foot_joint | hinge | torque (N m) |
|
||||
| 3 | Torque applied on the left thigh rotor | -1 | 1 | thigh_left_joint | hinge | torque (N m) |
|
||||
| 4 | Torque applied on the left leg rotor | -1 | 1 | leg_left_joint | hinge | torque (N m) |
|
||||
| 5 | Torque applied on the left foot rotor | -1 | 1 | foot_left_joint | hinge | torque (N m) |
|
||||
|
||||
### Observation Space
|
||||
|
||||
The state space consists of positional values of different body parts of the walker,
|
||||
followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities.
|
||||
|
||||
The full state is a `ndarray` with shape `(18,)` where the elements correspond to the following:
|
||||
|
||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|
||||
|-----|--------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
|
||||
| 0 | x-coordinate of the top | -Inf | Inf | rootx (torso) | slide | position (m) |
|
||||
| 1 | z-coordinate of the top (height of the walker) | -Inf | Inf | rootz (torso) | slide | position (m) |
|
||||
| 2 | angle of the top | -Inf | Inf | rooty (torso) | hinge | angle (rad) |
|
||||
| 3 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) |
|
||||
| 4 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) |
|
||||
| 5 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) |
|
||||
| 6 | angle of the left thigh joint | -Inf | Inf | thigh_left_joint | hinge | angle (rad) |
|
||||
| 7 | angle of the left leg joint | -Inf | Inf | leg_left_joint | hinge | angle (rad) |
|
||||
| 8 | angle of the left foot joint | -Inf | Inf | foot_left_joint | hinge | angle (rad) |
|
||||
| 9 | velocity of the x-coordinate of the top | -Inf | Inf | rootx | slide | velocity (m/s) |
|
||||
| 10 | velocity of the z-coordinate (height) of the top | -Inf | Inf | rootz | slide | velocity (m/s) |
|
||||
| 11 | angular velocity of the angle of the top | -Inf | Inf | rooty | hinge | angular velocity (rad/s) |
|
||||
| 12 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) |
|
||||
| 13 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) |
|
||||
| 14 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) |
|
||||
| 15 | angular velocity of the left thigh hinge | -Inf | Inf | thigh_left_joint | hinge | angular velocity (rad/s) |
|
||||
| 16 | angular velocity of the left leg hinge | -Inf | Inf | leg_left_joint | hinge | angular velocity (rad/s) |
|
||||
| 17 | angular velocity of the left foot hinge | -Inf | Inf | foot_left_joint | hinge | angular velocity (rad/s) |
|
||||
|
||||
|
||||
|
||||
**Note:**
|
||||
In practice (and Gym implementation), the first positional element is omitted from the
|
||||
state space since the reward function is calculated based on that value. This value is
|
||||
hidden from the algorithm, which in turn has to develop an abstract understanding of it
|
||||
from the observed rewards. Therefore, observation space has shape `(17,)`
|
||||
instead of `(18,)` and looks like:
|
||||
|
||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|
||||
|-----|--------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
|
||||
| 0 | z-coordinate of the top (height of the walker) | -Inf | Inf | rootz (torso) | slide | position (m) |
|
||||
| 1 | angle of the top | -Inf | Inf | rooty (torso) | hinge | angle (rad) |
|
||||
| 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) |
|
||||
| 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) |
|
||||
| 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) |
|
||||
| 5 | angle of the left thigh joint | -Inf | Inf | thigh_left_joint | hinge | angle (rad) |
|
||||
| 6 | angle of the left leg joint | -Inf | Inf | leg_left_joint | hinge | angle (rad) |
|
||||
| 7 | angle of the left foot joint | -Inf | Inf | foot_left_joint | hinge | angle (rad) |
|
||||
| 8 | velocity of the x-coordinate of the top | -Inf | Inf | rootx | slide | velocity (m/s) |
|
||||
| 9 | velocity of the z-coordinate (height) of the top | -Inf | Inf | rootz | slide | velocity (m/s) |
|
||||
| 10 | angular velocity of the angle of the top | -Inf | Inf | rooty | hinge | angular velocity (rad/s) |
|
||||
| 11 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) |
|
||||
| 12 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) |
|
||||
| 13 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) |
|
||||
| 14 | angular velocity of the left thigh hinge | -Inf | Inf | thigh_left_joint | hinge | angular velocity (rad/s) |
|
||||
| 15 | angular velocity of the left leg hinge | -Inf | Inf | leg_left_joint | hinge | angular velocity (rad/s) |
|
||||
| 16 | angular velocity of the left foot hinge | -Inf | Inf | foot_left_joint | hinge | angular velocity (rad/s) |
|
||||
|
||||
### Rewards
|
||||
The reward consists of three parts:
|
||||
- *alive bonus*: Every timestep that the walker is alive, it gets a reward of 1,
|
||||
- *reward_forward*: A reward of walking forward which is measured as
|
||||
*(x-coordinate after action - x-coordinate before action)/dt*.
|
||||
*dt* is the time between actions and is dependent on the frame_skip parameter
|
||||
(default is 4), where the *dt* for one frame is 0.002 - making the default
|
||||
*dt = 4 * 0.002 = 0.008*. This reward would be positive if the walker walks forward (right) as desired.
|
||||
- *reward_control*: A negative reward for penalising the walker if it
|
||||
takes actions that are too large. It is measured as
|
||||
*-coefficient **x** sum(action<sup>2</sup>)* where *coefficient* is
|
||||
a parameter set for the control and has a default value of 0.001
|
||||
|
||||
The total reward returned is ***reward*** *=* *alive bonus + reward_forward + reward_control*
|
||||
|
||||
### Starting State
|
||||
All observations start in state
|
||||
(0.0, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
|
||||
with a uniform noise in the range of [-0.005, 0.005] added to the values for stochasticity.
|
||||
|
||||
### Episode Termination
|
||||
The episode terminates when any of the following happens:
|
||||
|
||||
1. The episode duration reaches 1000 timesteps
|
||||
2. Any of the state space values is no longer finite
|
||||
3. The height of the walker (index 1) is ***not*** in the range `[0.8, 2]`
|
||||
4. The absolute value of the angle (index 2) is ***not*** in the range `[-1, 1]`
|
||||
|
||||
### Arguments
|
||||
|
||||
No additional arguments are currently supported (in v2 and lower),
|
||||
but modifications can be made to the XML file in the assets folder
|
||||
(or by changing the path to a modified XML file in another folder).
|
||||
|
||||
```
|
||||
env = gym.make('Walker2d-v2')
|
||||
```
|
||||
|
||||
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||
|
||||
```
|
||||
env = gym.make('Walker2d-v3', ctrl_cost_weight=0.1, ....)
|
||||
```
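A minimal rollout sketch under the step API in this file, reporting how far the walker travelled before the episode ended:

```
import gym

env = gym.make('Walker2d-v4')
obs = env.reset()
done = False
while not done:
    obs, reward, done, info = env.step(env.action_space.sample())
print("final x position:", info["x_position"])
env.close()
```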
|
||||
|
||||
### Version History
|
||||
|
||||
* v4: All MuJoCo environments now use the mujoco bindings in mujoco>=2.1.3
|
||||
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
||||
* v0: Initial versions release (1.0.0)
|
||||
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
xml_file="walker2d.xml",
|
||||
forward_reward_weight=1.0,
|
||||
ctrl_cost_weight=1e-3,
|
||||
healthy_reward=1.0,
|
||||
terminate_when_unhealthy=True,
|
||||
healthy_z_range=(0.8, 2.0),
|
||||
healthy_angle_range=(-1.0, 1.0),
|
||||
reset_noise_scale=5e-3,
|
||||
exclude_current_positions_from_observation=True,
|
||||
):
|
||||
utils.EzPickle.__init__(**locals())
|
||||
|
||||
self._forward_reward_weight = forward_reward_weight
|
||||
self._ctrl_cost_weight = ctrl_cost_weight
|
||||
|
||||
self._healthy_reward = healthy_reward
|
||||
self._terminate_when_unhealthy = terminate_when_unhealthy
|
||||
|
||||
self._healthy_z_range = healthy_z_range
|
||||
self._healthy_angle_range = healthy_angle_range
|
||||
|
||||
self._reset_noise_scale = reset_noise_scale
|
||||
|
||||
self._exclude_current_positions_from_observation = (
|
||||
exclude_current_positions_from_observation
|
||||
)
|
||||
|
||||
mujoco_env.MujocoEnv.__init__(self, xml_file, 4)
|
||||
|
||||
@property
|
||||
def healthy_reward(self):
|
||||
return (
|
||||
float(self.is_healthy or self._terminate_when_unhealthy)
|
||||
* self._healthy_reward
|
||||
)
|
||||
|
||||
def control_cost(self, action):
|
||||
control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
|
||||
return control_cost
|
||||
|
||||
@property
|
||||
def is_healthy(self):
|
||||
z, angle = self.data.qpos[1:3]
|
||||
|
||||
min_z, max_z = self._healthy_z_range
|
||||
min_angle, max_angle = self._healthy_angle_range
|
||||
|
||||
healthy_z = min_z < z < max_z
|
||||
healthy_angle = min_angle < angle < max_angle
|
||||
is_healthy = healthy_z and healthy_angle
|
||||
|
||||
return is_healthy
|
||||
|
||||
@property
|
||||
def done(self):
|
||||
done = not self.is_healthy if self._terminate_when_unhealthy else False
|
||||
return done
|
||||
|
||||
def _get_obs(self):
|
||||
position = self.data.qpos.flat.copy()
|
||||
velocity = np.clip(self.data.qvel.flat.copy(), -10, 10)
|
||||
|
||||
if self._exclude_current_positions_from_observation:
|
||||
position = position[1:]
|
||||
|
||||
observation = np.concatenate((position, velocity)).ravel()
|
||||
return observation
|
||||
|
||||
def step(self, action):
|
||||
x_position_before = self.data.qpos[0]
|
||||
self.do_simulation(action, self.frame_skip)
|
||||
x_position_after = self.data.qpos[0]
|
||||
x_velocity = (x_position_after - x_position_before) / self.dt
|
||||
|
||||
ctrl_cost = self.control_cost(action)
|
||||
|
||||
forward_reward = self._forward_reward_weight * x_velocity
|
||||
healthy_reward = self.healthy_reward
|
||||
|
||||
rewards = forward_reward + healthy_reward
|
||||
costs = ctrl_cost
|
||||
|
||||
observation = self._get_obs()
|
||||
reward = rewards - costs
|
||||
done = self.done
|
||||
info = {
|
||||
"x_position": x_position_after,
|
||||
"x_velocity": x_velocity,
|
||||
}
|
||||
|
||||
return observation, reward, done, info
|
||||
|
||||
def reset_model(self):
|
||||
noise_low = -self._reset_noise_scale
|
||||
noise_high = self._reset_noise_scale
|
||||
|
||||
qpos = self.init_qpos + self.np_random.uniform(
|
||||
low=noise_low, high=noise_high, size=self.model.nq
|
||||
)
|
||||
qvel = self.init_qvel + self.np_random.uniform(
|
||||
low=noise_low, high=noise_high, size=self.model.nv
|
||||
)
|
||||
|
||||
self.set_state(qpos, qvel)
|
||||
|
||||
observation = self._get_obs()
|
||||
return observation
|
||||
|
||||
def viewer_setup(self):
|
||||
for key, value in DEFAULT_CAMERA_CONFIG.items():
|
||||
if isinstance(value, np.ndarray):
|
||||
getattr(self.viewer.cam, key)[:] = value
|
||||
else:
|
||||
setattr(self.viewer.cam, key, value)
|
@@ -6,15 +6,14 @@ RUN apt-get -y update && apt-get install -y unzip libglu1-mesa-dev libgl1-mesa-d
|
||||
# Download mujoco
|
||||
RUN mkdir /root/.mujoco && \
|
||||
cd /root/.mujoco && \
|
||||
curl -O https://www.roboti.us/download/mjpro150_linux.zip && \
|
||||
unzip mjpro150_linux.zip && \
|
||||
echo DUMMY_KEY > /root/.mujoco/mjkey.txt
|
||||
wget https://github.com/deepmind/mujoco/releases/download/2.1.0/mujoco210-linux-x86_64.tar.gz &&\
|
||||
tar -xf mujoco210-linux-x86_64.tar.gz
|
||||
|
||||
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/root/.mujoco/mjpro150/bin
|
||||
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/root/.mujoco/mujoco210/bin
|
||||
|
||||
COPY . /usr/local/gym/
|
||||
WORKDIR /usr/local/gym/
|
||||
|
||||
RUN pip install .[nomujoco] && pip install -r test_requirements.txt
|
||||
RUN pip install .[noatari] && pip install -r test_requirements.txt
|
||||
|
||||
ENTRYPOINT ["/usr/local/gym/bin/docker_entrypoint"]
|
||||
|
@@ -1,7 +1,6 @@
|
||||
ale-py~=0.7
|
||||
opencv-python>=3.0
|
||||
box2d-py==2.3.5
|
||||
mujoco_py>=1.50, <2.0
|
||||
scipy>=1.4.1
|
||||
numpy>=1.18.0
|
||||
pyglet>=1.4.0
|
||||
|
10
setup.py
@@ -15,19 +15,25 @@ extras = {
    "accept-rom-license": ["autorom[accept-rom-license]~=0.4.2"],
    "box2d": ["box2d-py==2.3.5", "pygame==2.1.0"],
    "classic_control": ["pygame==2.1.0"],
    "mujoco": ["mujoco_py>=1.50, <2.0"],
    "mujoco_py": ["mujoco_py<2.2,>=2.1"],
    "mujoco": ["mujoco==2.1.5", "imageio>=2.14.1"],
    "toy_text": ["pygame==2.1.0", "scipy>=1.4.1"],
    "other": ["lz4>=3.1.0", "opencv-python>=3.0", "matplotlib>=3.0"],
}

# Meta dependency groups.
nomujoco_blacklist = {"mujoco", "accept-rom-license", "atari"}
nomujoco_blacklist = {"mujoco_py", "mujoco", "accept-rom-license", "atari"}
nomujoco_groups = set(extras.keys()) - nomujoco_blacklist

extras["nomujoco"] = list(
    itertools.chain.from_iterable(map(lambda group: extras[group], nomujoco_groups))
)

noatari_blacklist = {"accept-rom-license", "atari"}
noatari_groups = set(extras.keys()) - noatari_blacklist
extras["noatari"] = list(
    itertools.chain.from_iterable(map(lambda group: extras[group], noatari_groups))
)

all_blacklist = {"accept-rom-license"}
all_groups = set(extras.keys()) - all_blacklist
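The meta dependency groups above are built by blacklisting some extras and flattening the requirement lists of everything that remains. A toy sketch of that composition; the extras dict below is a reduced stand-in for illustration, not the real setup.py contents.

import itertools

# Reduced, hypothetical extras dict used only to show how the meta groups compose.
extras = {
    "mujoco": ["mujoco==2.1.5", "imageio>=2.14.1"],
    "mujoco_py": ["mujoco_py<2.2,>=2.1"],
    "toy_text": ["pygame==2.1.0", "scipy>=1.4.1"],
}

# Blacklist the MuJoCo groups, then flatten the remaining requirement lists.
nomujoco_blacklist = {"mujoco", "mujoco_py"}
nomujoco_groups = set(extras.keys()) - nomujoco_blacklist
extras["nomujoco"] = list(
    itertools.chain.from_iterable(map(lambda group: extras[group], nomujoco_groups))
)

print(extras["nomujoco"])  # -> ['pygame==2.1.0', 'scipy>=1.4.1']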
@@ -1,19 +1,14 @@
import os

from gym import envs, logger

SKIP_MUJOCO_WARNING_MESSAGE = (
    "Cannot run mujoco test (either license key not found or mujoco not"
    "installed properly)."
SKIP_MUJOCO_V3_WARNING_MESSAGE = (
    "Cannot run mujoco test because mujoco-py is not installed"
)


skip_mujoco = not (os.environ.get("MUJOCO_KEY"))
if not skip_mujoco:
    try:
skip_mujoco_v3 = False
try:
    import mujoco_py  # noqa:F401
    except ImportError:
        skip_mujoco = True
except ImportError:
    skip_mujoco_v3 = True


def should_skip_env_spec_for_tests(spec):
@@ -21,7 +16,7 @@ def should_skip_env_spec_for_tests(spec):
    # troublesome to run frequently
    ep = spec.entry_point
    # Skip mujoco tests for pull request CI
    if skip_mujoco and ep.startswith("gym.envs.mujoco"):
    if skip_mujoco_v3 and ep.startswith("gym.envs.mujoco"):
        return True
    try:
        import gym.envs.atari  # noqa:F401
@@ -48,8 +43,19 @@ def should_skip_env_spec_for_tests(spec):
    return False


def skip_mujoco_py_env_for_test(spec):
    ep = spec.entry_point
    version = spec.version
    if ep.startswith("gym.envs.mujoco") and version < 4:
        return True
    return False


spec_list = [
    spec
    for spec in sorted(envs.registry.values(), key=lambda x: x.id)
    if spec.entry_point is not None and not should_skip_env_spec_for_tests(spec)
]
spec_list_no_mujoco_py = [
    spec for spec in spec_list if not skip_mujoco_py_env_for_test(spec)
]
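The new filter keeps only MuJoCo specs whose version is 4 or higher (the `mujoco`-based ones) in spec_list_no_mujoco_py. A small sketch of what that filter does, using only the registry access pattern shown above; the specific IDs are examples, and the import path assumes the module layout in this diff.

from gym import envs
from tests.envs.spec_list import skip_mujoco_py_env_for_test  # path assumed from this diff

# Index the registered specs by id, reusing the .values() access used above.
specs = {spec.id: spec for spec in envs.registry.values()}

# Pre-v4 MuJoCo environments (mujoco_py-based) are skipped, v4 ones are kept.
assert skip_mujoco_py_env_for_test(specs["HalfCheetah-v2"])
assert not skip_mujoco_py_env_for_test(specs["HalfCheetah-v4"])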
@@ -8,17 +8,19 @@ from gym.envs.registration import EnvSpec
from gym.spaces.box import Box
from gym.spaces.discrete import Discrete
from gym.spaces.space import Space
from tests.envs.spec_list import SKIP_MUJOCO_WARNING_MESSAGE, skip_mujoco, spec_list
from tests.envs.spec_list import (
    SKIP_MUJOCO_V3_WARNING_MESSAGE,
    skip_mujoco_v3,
    spec_list,
)

ENVIRONMENT_IDS = ("HalfCheetah-v2",)


def make_envs_by_action_space_type(spec_list: List[EnvSpec], action_space: Space):
    """Make environments of a specific action_space type.

    This function returns a filtered list of environments from the
    spec_list that match the action_space type.

    Args:
        spec_list (list): list of registered environments' specifications
        action_space (gym.spaces.Space): action_space type
@@ -31,7 +33,7 @@ def make_envs_by_action_space_type(spec_list: List[EnvSpec], action_space: Space
    return filtered_envs


@pytest.mark.skipif(skip_mujoco, reason=SKIP_MUJOCO_WARNING_MESSAGE)
@pytest.mark.skipif(skip_mujoco_v3, reason=SKIP_MUJOCO_V3_WARNING_MESSAGE)
@pytest.mark.parametrize("environment_id", ENVIRONMENT_IDS)
def test_serialize_deserialize(environment_id):
    env = envs.make(environment_id)
@@ -47,10 +49,8 @@ def test_serialize_deserialize(environment_id):
@pytest.mark.parametrize("env", make_envs_by_action_space_type(spec_list, Discrete))
def test_discrete_actions_out_of_bound(env):
    """Test out-of-bound actions in a Discrete action_space.

    In discrete action_space environments, `out-of-bound`
    actions are not allowed and should raise an exception.

    Args:
        env (gym.Env): the gym environment
    """
@@ -69,11 +69,9 @@ def test_discrete_actions_out_of_bound(env):
)
def test_box_actions_out_of_bound(env, seed):
    """Test out-of-bound actions in a Box action_space.

    Environments with Box action spaces perform clipping inside `step`.
    The expected behaviour is that an `out-of-bound` action has the same effect
    as an action with value exactly at the upper (or lower) bound.

    Args:
        env (gym.Env): the gym environment
        seed (int): seed value for determinism
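The docstring above (the test body itself is not shown in this hunk) states that Box-action environments clip out-of-bound actions inside `step`. A minimal sketch of how that property can be checked; Pendulum-v1 is only a convenient Box-action example, not what the real test necessarily uses.

import gym
import numpy as np

# Sketch only: illustrates the clipping property described in the docstring,
# not the actual (truncated) test implementation.
env = gym.make("Pendulum-v1")

upper = env.action_space.high
oob_action = upper + 1.0  # deliberately out of bound

env.reset(seed=0)
obs_clipped, *_ = env.step(np.clip(oob_action, env.action_space.low, upper))

env.reset(seed=0)
obs_oob, *_ = env.step(oob_action)

# If the environment clips internally, both rollouts reach the same state.
np.testing.assert_allclose(obs_clipped, obs_oob, atol=1e-7)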
@@ -4,7 +4,7 @@ import pytest
from gym import envs
from gym.spaces import Box
from gym.utils.env_checker import check_env
from tests.envs.spec_list import spec_list
from tests.envs.spec_list import spec_list, spec_list_no_mujoco_py


# This runs a smoketest on each official registered env. We may want
@@ -13,7 +13,9 @@ from tests.envs.spec_list import spec_list
@pytest.mark.filterwarnings(
    "ignore:.*We recommend you to use a symmetric and normalized Box action space.*"
)
@pytest.mark.parametrize("spec", spec_list, ids=[spec.id for spec in spec_list])
@pytest.mark.parametrize(
    "spec", spec_list_no_mujoco_py, ids=[spec.id for spec in spec_list_no_mujoco_py]
)
def test_env(spec):
    # Capture warnings
    with pytest.warns(None) as warnings:
@@ -47,12 +49,14 @@ def test_env(spec):
    assert (
        observation.dtype == ob_space.dtype
    ), f"Step observation dtype: {ob.dtype}, expected: {ob_space.dtype}"

    for mode in env.metadata.get("render_modes", []):
        if not (mode == "human" and spec.entry_point.startswith("gym.envs.mujoco")):
            env.render(mode=mode)

    # Make sure we can render the environment after close.
    for mode in env.metadata.get("render_modes", []):
        if not (mode == "human" and spec.entry_point.startswith("gym.envs.mujoco")):
            env.render(mode=mode)

    env.close()
@@ -3,7 +3,7 @@ import unittest
import numpy as np

from gym import envs
from tests.envs.spec_list import SKIP_MUJOCO_WARNING_MESSAGE, skip_mujoco
from tests.envs.spec_list import SKIP_MUJOCO_V3_WARNING_MESSAGE, skip_mujoco_v3


def verify_environments_match(
@@ -31,7 +31,7 @@ def verify_environments_match(
        np.testing.assert_allclose(old_info[key], new_info[key], atol=eps)


@unittest.skipIf(skip_mujoco, SKIP_MUJOCO_WARNING_MESSAGE)
@unittest.skipIf(skip_mujoco_v3, SKIP_MUJOCO_V3_WARNING_MESSAGE)
class Mujocov2Tov3ConversionTest(unittest.TestCase):
    def test_environments_match(self):
        test_cases = (