mirror of
https://github.com/Farama-Foundation/Gymnasium.git
synced 2025-07-31 13:54:31 +00:00
Add new MuJoCo bindings (#2762)
This commit is contained in:
committed by
GitHub
parent
8f9b62f6a6
commit
3e006f3ea5
1
.github/workflows/build.yml
vendored
1
.github/workflows/build.yml
vendored
@@ -11,7 +11,6 @@ jobs:
|
|||||||
- uses: actions/checkout@v2
|
- uses: actions/checkout@v2
|
||||||
- run: |
|
- run: |
|
||||||
docker build -f py.Dockerfile \
|
docker build -f py.Dockerfile \
|
||||||
--build-arg MUJOCO_KEY=$MUJOCO_KEY \
|
|
||||||
--build-arg PYTHON_VERSION=${{ matrix.python-version }} \
|
--build-arg PYTHON_VERSION=${{ matrix.python-version }} \
|
||||||
--tag gym-docker .
|
--tag gym-docker .
|
||||||
- name: Run tests
|
- name: Run tests
|
||||||
|
@@ -9,7 +9,7 @@ repos:
|
|||||||
hooks:
|
hooks:
|
||||||
- id: codespell
|
- id: codespell
|
||||||
args:
|
args:
|
||||||
- --ignore-words-list=nd,reacher,thist,ths
|
- --ignore-words-list=nd,reacher,thist,ths, ure, referenc
|
||||||
- repo: https://gitlab.com/PyCQA/flake8
|
- repo: https://gitlab.com/PyCQA/flake8
|
||||||
rev: 4.0.1
|
rev: 4.0.1
|
||||||
hooks:
|
hooks:
|
||||||
|
@@ -46,6 +46,11 @@ env.close()
|
|||||||
|
|
||||||
Gym keeps strict versioning for reproducibility reasons. All environments end in a suffix like "\_v0". When changes are made to environments that might impact learning results, the number is increased by one to prevent potential confusion.
|
Gym keeps strict versioning for reproducibility reasons. All environments end in a suffix like "\_v0". When changes are made to environments that might impact learning results, the number is increased by one to prevent potential confusion.
|
||||||
|
|
||||||
|
## MuJoCo Environments
|
||||||
|
|
||||||
|
The latest "\_v4" and future versions of the MuJoCo environments will no longer depend on `mujoco-py`. Instead `mujoco` will be the required dependency for future gym MuJoCo environment versions. Old gym MuJoCo environment versions that depend on `mujoco-py` will still be kept but unmaintained.
|
||||||
|
To install the dependencies for the latest gym MuJoCo environments use `pip install gym[mujoco]`. Dependencies for old MuJoCo environments can still be installed by `pip install gym[mujoco_py]`.
|
||||||
|
|
||||||
## Citation
|
## Citation
|
||||||
|
|
||||||
A whitepaper from when Gym just came out is available https://arxiv.org/pdf/1606.01540, and can be cited with the following bibtex entry:
|
A whitepaper from when Gym just came out is available https://arxiv.org/pdf/1606.01540, and can be cited with the following bibtex entry:
|
||||||
|
@@ -162,6 +162,13 @@ register(
|
|||||||
reward_threshold=-3.75,
|
reward_threshold=-3.75,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
register(
|
||||||
|
id="Reacher-v4",
|
||||||
|
entry_point="gym.envs.mujoco.reacher_v4:ReacherEnv",
|
||||||
|
max_episode_steps=50,
|
||||||
|
reward_threshold=-3.75,
|
||||||
|
)
|
||||||
|
|
||||||
register(
|
register(
|
||||||
id="Pusher-v2",
|
id="Pusher-v2",
|
||||||
entry_point="gym.envs.mujoco:PusherEnv",
|
entry_point="gym.envs.mujoco:PusherEnv",
|
||||||
@@ -169,6 +176,13 @@ register(
|
|||||||
reward_threshold=0.0,
|
reward_threshold=0.0,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
register(
|
||||||
|
id="Pusher-v4",
|
||||||
|
entry_point="gym.envs.mujoco.pusher_v4:PusherEnv",
|
||||||
|
max_episode_steps=100,
|
||||||
|
reward_threshold=0.0,
|
||||||
|
)
|
||||||
|
|
||||||
register(
|
register(
|
||||||
id="InvertedPendulum-v2",
|
id="InvertedPendulum-v2",
|
||||||
entry_point="gym.envs.mujoco:InvertedPendulumEnv",
|
entry_point="gym.envs.mujoco:InvertedPendulumEnv",
|
||||||
@@ -176,6 +190,13 @@ register(
|
|||||||
reward_threshold=950.0,
|
reward_threshold=950.0,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
register(
|
||||||
|
id="InvertedPendulum-v4",
|
||||||
|
entry_point="gym.envs.mujoco.inverted_pendulum_v4:InvertedPendulumEnv",
|
||||||
|
max_episode_steps=1000,
|
||||||
|
reward_threshold=950.0,
|
||||||
|
)
|
||||||
|
|
||||||
register(
|
register(
|
||||||
id="InvertedDoublePendulum-v2",
|
id="InvertedDoublePendulum-v2",
|
||||||
entry_point="gym.envs.mujoco:InvertedDoublePendulumEnv",
|
entry_point="gym.envs.mujoco:InvertedDoublePendulumEnv",
|
||||||
@@ -183,6 +204,13 @@ register(
|
|||||||
reward_threshold=9100.0,
|
reward_threshold=9100.0,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
register(
|
||||||
|
id="InvertedDoublePendulum-v4",
|
||||||
|
entry_point="gym.envs.mujoco.inverted_double_pendulum_v4:InvertedDoublePendulumEnv",
|
||||||
|
max_episode_steps=1000,
|
||||||
|
reward_threshold=9100.0,
|
||||||
|
)
|
||||||
|
|
||||||
register(
|
register(
|
||||||
id="HalfCheetah-v2",
|
id="HalfCheetah-v2",
|
||||||
entry_point="gym.envs.mujoco:HalfCheetahEnv",
|
entry_point="gym.envs.mujoco:HalfCheetahEnv",
|
||||||
@@ -197,6 +225,13 @@ register(
|
|||||||
reward_threshold=4800.0,
|
reward_threshold=4800.0,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
register(
|
||||||
|
id="HalfCheetah-v4",
|
||||||
|
entry_point="gym.envs.mujoco.half_cheetah_v4:HalfCheetahEnv",
|
||||||
|
max_episode_steps=1000,
|
||||||
|
reward_threshold=4800.0,
|
||||||
|
)
|
||||||
|
|
||||||
register(
|
register(
|
||||||
id="Hopper-v2",
|
id="Hopper-v2",
|
||||||
entry_point="gym.envs.mujoco:HopperEnv",
|
entry_point="gym.envs.mujoco:HopperEnv",
|
||||||
@@ -211,6 +246,13 @@ register(
|
|||||||
reward_threshold=3800.0,
|
reward_threshold=3800.0,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
register(
|
||||||
|
id="Hopper-v4",
|
||||||
|
entry_point="gym.envs.mujoco.hopper_v4:HopperEnv",
|
||||||
|
max_episode_steps=1000,
|
||||||
|
reward_threshold=3800.0,
|
||||||
|
)
|
||||||
|
|
||||||
register(
|
register(
|
||||||
id="Swimmer-v2",
|
id="Swimmer-v2",
|
||||||
entry_point="gym.envs.mujoco:SwimmerEnv",
|
entry_point="gym.envs.mujoco:SwimmerEnv",
|
||||||
@@ -225,6 +267,13 @@ register(
|
|||||||
reward_threshold=360.0,
|
reward_threshold=360.0,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
register(
|
||||||
|
id="Swimmer-v4",
|
||||||
|
entry_point="gym.envs.mujoco.swimmer_v4:SwimmerEnv",
|
||||||
|
max_episode_steps=1000,
|
||||||
|
reward_threshold=360.0,
|
||||||
|
)
|
||||||
|
|
||||||
register(
|
register(
|
||||||
id="Walker2d-v2",
|
id="Walker2d-v2",
|
||||||
max_episode_steps=1000,
|
max_episode_steps=1000,
|
||||||
@@ -237,6 +286,12 @@ register(
|
|||||||
entry_point="gym.envs.mujoco.walker2d_v3:Walker2dEnv",
|
entry_point="gym.envs.mujoco.walker2d_v3:Walker2dEnv",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
register(
|
||||||
|
id="Walker2d-v4",
|
||||||
|
max_episode_steps=1000,
|
||||||
|
entry_point="gym.envs.mujoco.walker2d_v4:Walker2dEnv",
|
||||||
|
)
|
||||||
|
|
||||||
register(
|
register(
|
||||||
id="Ant-v2",
|
id="Ant-v2",
|
||||||
entry_point="gym.envs.mujoco:AntEnv",
|
entry_point="gym.envs.mujoco:AntEnv",
|
||||||
@@ -251,6 +306,13 @@ register(
|
|||||||
reward_threshold=6000.0,
|
reward_threshold=6000.0,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
register(
|
||||||
|
id="Ant-v4",
|
||||||
|
entry_point="gym.envs.mujoco.ant_v4:AntEnv",
|
||||||
|
max_episode_steps=1000,
|
||||||
|
reward_threshold=6000.0,
|
||||||
|
)
|
||||||
|
|
||||||
register(
|
register(
|
||||||
id="Humanoid-v2",
|
id="Humanoid-v2",
|
||||||
entry_point="gym.envs.mujoco:HumanoidEnv",
|
entry_point="gym.envs.mujoco:HumanoidEnv",
|
||||||
@@ -263,8 +325,20 @@ register(
|
|||||||
max_episode_steps=1000,
|
max_episode_steps=1000,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
register(
|
||||||
|
id="Humanoid-v4",
|
||||||
|
entry_point="gym.envs.mujoco.humanoid_v4:HumanoidEnv",
|
||||||
|
max_episode_steps=1000,
|
||||||
|
)
|
||||||
|
|
||||||
register(
|
register(
|
||||||
id="HumanoidStandup-v2",
|
id="HumanoidStandup-v2",
|
||||||
entry_point="gym.envs.mujoco:HumanoidStandupEnv",
|
entry_point="gym.envs.mujoco:HumanoidStandupEnv",
|
||||||
max_episode_steps=1000,
|
max_episode_steps=1000,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
register(
|
||||||
|
id="HumanoidStandup-v4",
|
||||||
|
entry_point="gym.envs.mujoco.humanoidstandup_v4:HumanoidStandupEnv",
|
||||||
|
max_episode_steps=1000,
|
||||||
|
)
|
||||||
|
@@ -9,6 +9,7 @@ from gym.envs.mujoco.humanoid import HumanoidEnv
|
|||||||
from gym.envs.mujoco.humanoidstandup import HumanoidStandupEnv
|
from gym.envs.mujoco.humanoidstandup import HumanoidStandupEnv
|
||||||
from gym.envs.mujoco.inverted_double_pendulum import InvertedDoublePendulumEnv
|
from gym.envs.mujoco.inverted_double_pendulum import InvertedDoublePendulumEnv
|
||||||
from gym.envs.mujoco.inverted_pendulum import InvertedPendulumEnv
|
from gym.envs.mujoco.inverted_pendulum import InvertedPendulumEnv
|
||||||
|
from gym.envs.mujoco.mujoco_rendering import RenderContextOffscreen, Viewer
|
||||||
from gym.envs.mujoco.pusher import PusherEnv
|
from gym.envs.mujoco.pusher import PusherEnv
|
||||||
from gym.envs.mujoco.reacher import ReacherEnv
|
from gym.envs.mujoco.reacher import ReacherEnv
|
||||||
from gym.envs.mujoco.swimmer import SwimmerEnv
|
from gym.envs.mujoco.swimmer import SwimmerEnv
|
||||||
|
@@ -6,7 +6,7 @@ from gym.envs.mujoco import mujoco_env
|
|||||||
|
|
||||||
class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
mujoco_env.MujocoEnv.__init__(self, "ant.xml", 5)
|
mujoco_env.MujocoEnv.__init__(self, "ant.xml", 5, mujoco_bindings="mujoco_py")
|
||||||
utils.EzPickle.__init__(self)
|
utils.EzPickle.__init__(self)
|
||||||
|
|
||||||
def step(self, a):
|
def step(self, a):
|
||||||
|
@@ -9,167 +9,6 @@ DEFAULT_CAMERA_CONFIG = {
|
|||||||
|
|
||||||
|
|
||||||
class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
"""
|
|
||||||
### Description
|
|
||||||
|
|
||||||
This environment is based on the environment introduced by Schulman,
|
|
||||||
Moritz, Levine, Jordan and Abbeel in ["High-Dimensional Continuous Control
|
|
||||||
Using Generalized Advantage Estimation"](https://arxiv.org/abs/1506.02438).
|
|
||||||
The ant is a 3D robot consisting of one torso (free rotational body) with
|
|
||||||
four legs attached to it with each leg having two links. The goal is to
|
|
||||||
coordinate the four legs to move in the forward (right) direction by applying
|
|
||||||
torques on the eight hinges connecting the two links of each leg and the torso
|
|
||||||
(nine parts and eight hinges).
|
|
||||||
|
|
||||||
### Action Space
|
|
||||||
The action space is a `Box(-1, 1, (8,), float32)`. An action represents the torques applied at the hinge joints.
|
|
||||||
|
|
||||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
|
||||||
|-----|----------------------|---------------|----------------|---------------------------------------|-------|------|
|
|
||||||
| 0 | Torque applied on the rotor between the torso and front left hip | -1 | 1 | hip_1 (front_left_leg) | hinge | torque (N m) |
|
|
||||||
| 1 | Torque applied on the rotor between the front left two links | -1 | 1 | angle_1 (front_left_leg) | hinge | torque (N m) |
|
|
||||||
| 2 | Torque applied on the rotor between the torso and front right hip | -1 | 1 | hip_2 (front_right_leg) | hinge | torque (N m) |
|
|
||||||
| 3 | Torque applied on the rotor between the front right two links | -1 | 1 | angle_2 (front_right_leg) | hinge | torque (N m) |
|
|
||||||
| 4 | Torque applied on the rotor between the torso and back left hip | -1 | 1 | hip_3 (back_leg) | hinge | torque (N m) |
|
|
||||||
| 5 | Torque applied on the rotor between the back left two links | -1 | 1 | angle_3 (back_leg) | hinge | torque (N m) |
|
|
||||||
| 6 | Torque applied on the rotor between the torso and back right hip | -1 | 1 | hip_4 (right_back_leg) | hinge | torque (N m) |
|
|
||||||
| 7 | Torque applied on the rotor between the back right two links | -1 | 1 | angle_4 (right_back_leg) | hinge | torque (N m) |
|
|
||||||
|
|
||||||
### Observation Space
|
|
||||||
|
|
||||||
Observations consist of positional values of different body parts of the ant,
|
|
||||||
followed by the velocities of those individual parts (their derivatives) with all
|
|
||||||
the positions ordered before all the velocities.
|
|
||||||
|
|
||||||
By default, observations do not include the x- and y-coordinates of the ant's torso. These may
|
|
||||||
be included by passing `exclude_current_positions_from_observation=False` during construction.
|
|
||||||
In that case, the observation space will have 113 dimensions where the first two dimensions
|
|
||||||
represent the x- and y- coordinates of the ant's torso.
|
|
||||||
Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates
|
|
||||||
of the torso will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively.
|
|
||||||
|
|
||||||
However, by default, an observation is a `ndarray` with shape `(111,)`
|
|
||||||
where the elements correspond to the following:
|
|
||||||
|
|
||||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|
|
||||||
|-----|-------------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
|
|
||||||
| 0 | z-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) |
|
|
||||||
| 1 | x-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
|
|
||||||
| 2 | y-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
|
|
||||||
| 3 | z-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
|
|
||||||
| 4 | w-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
|
|
||||||
| 5 | angle between torso and first link on front left | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) |
|
|
||||||
| 6 | angle between the two links on the front left | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) |
|
|
||||||
| 7 | angle between torso and first link on front right | -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) |
|
|
||||||
| 8 | angle between the two links on the front right | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) |
|
|
||||||
| 9 | angle between torso and first link on back left | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) |
|
|
||||||
| 10 | angle between the two links on the back left | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) |
|
|
||||||
| 11 | angle between torso and first link on back right | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) |
|
|
||||||
| 12 | angle between the two links on the back right | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) |
|
|
||||||
| 13 | x-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) |
|
|
||||||
| 14 | y-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) |
|
|
||||||
| 15 | z-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) |
|
|
||||||
| 16 | x-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) |
|
|
||||||
| 17 | y-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) |
|
|
||||||
| 18 | z-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) |
|
|
||||||
| 19 | angular velocity of angle between torso and front left link | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) |
|
|
||||||
| 20 | angular velocity of the angle between front left links | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) |
|
|
||||||
| 21 | angular velocity of angle between torso and front right link| -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) |
|
|
||||||
| 22 | angular velocity of the angle between front right links | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) |
|
|
||||||
| 23 | angular velocity of angle between torso and back left link | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) |
|
|
||||||
| 24 | angular velocity of the angle between back left links | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) |
|
|
||||||
| 25 | angular velocity of angle between torso and back right link | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) |
|
|
||||||
| 26 |angular velocity of the angle between back right links | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) |
|
|
||||||
|
|
||||||
|
|
||||||
The remaining 14*6 = 84 elements of the observation are contact forces
|
|
||||||
(external forces - force x, y, z and torque x, y, z) applied to the
|
|
||||||
center of mass of each of the links. The 14 links are: the ground link,
|
|
||||||
the torso link, and 3 links for each leg (1 + 1 + 12) with the 6 external forces.
|
|
||||||
|
|
||||||
The (x,y,z) coordinates are translational DOFs while the orientations are rotational
|
|
||||||
DOFs expressed as quaternions. One can read more about free joints on the [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html).
|
|
||||||
|
|
||||||
|
|
||||||
**Note:** There have been reported issues that using a Mujoco-Py version > 2.0 results
|
|
||||||
in the contact forces always being 0. As such we recommend to use a Mujoco-Py version < 2.0
|
|
||||||
when using the Ant environment if you would like to report results with contact forces (if
|
|
||||||
contact forces are not used in your experiments, you can use version > 2.0).
|
|
||||||
|
|
||||||
### Rewards
|
|
||||||
The reward consists of three parts:
|
|
||||||
- *healthy_reward*: Every timestep that the ant is healthy (see definition in section "Episode Termination"), it gets a reward of fixed value `healthy_reward`
|
|
||||||
- *forward_reward*: A reward of moving forward which is measured as
|
|
||||||
*(x-coordinate before action - x-coordinate after action)/dt*. *dt* is the time
|
|
||||||
between actions and is dependent on the `frame_skip` parameter (default is 5),
|
|
||||||
where the frametime is 0.01 - making the default *dt = 5 * 0.01 = 0.05*.
|
|
||||||
This reward would be positive if the ant moves forward (in positive x direction).
|
|
||||||
- *ctrl_cost*: A negative reward for penalising the ant if it takes actions
|
|
||||||
that are too large. It is measured as *`ctrl_cost_weight` * sum(action<sup>2</sup>)*
|
|
||||||
where *`ctr_cost_weight`* is a parameter set for the control and has a default value of 0.5.
|
|
||||||
- *contact_cost*: A negative reward for penalising the ant if the external contact
|
|
||||||
force is too large. It is calculated *`contact_cost_weight` * sum(clip(external contact
|
|
||||||
force to `contact_force_range`)<sup>2</sup>)*.
|
|
||||||
|
|
||||||
The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost - contact_cost* and `info` will also contain the individual reward terms.
|
|
||||||
|
|
||||||
### Starting State
|
|
||||||
All observations start in state
|
|
||||||
(0.0, 0.0, 0.75, 1.0, 0.0 ... 0.0) with a uniform noise in the range
|
|
||||||
of [-`reset_noise_scale`, `reset_noise_scale`] added to the positional values and standard normal noise
|
|
||||||
with mean 0 and standard deviation `reset_noise_scale` added to the velocity values for
|
|
||||||
stochasticity. Note that the initial z coordinate is intentionally selected
|
|
||||||
to be slightly high, thereby indicating a standing up ant. The initial orientation
|
|
||||||
is designed to make it face forward as well.
|
|
||||||
|
|
||||||
### Episode Termination
|
|
||||||
The ant is said to be unhealthy if any of the following happens:
|
|
||||||
|
|
||||||
1. Any of the state space values is no longer finite
|
|
||||||
2. The z-coordinate of the torso is **not** in the closed interval given by `healthy_z_range` (defaults to [0.2, 1.0])
|
|
||||||
|
|
||||||
If `terminate_when_unhealthy=True` is passed during construction (which is the default),
|
|
||||||
the episode terminates when any of the following happens:
|
|
||||||
|
|
||||||
1. The episode duration reaches a 1000 timesteps
|
|
||||||
2. The ant is unhealthy
|
|
||||||
|
|
||||||
If `terminate_when_unhealthy=False` is passed, the episode is terminated only when 1000 timesteps are exceeded.
|
|
||||||
|
|
||||||
### Arguments
|
|
||||||
|
|
||||||
No additional arguments are currently supported in v2 and lower.
|
|
||||||
|
|
||||||
```
|
|
||||||
env = gym.make('Ant-v2')
|
|
||||||
```
|
|
||||||
|
|
||||||
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
|
||||||
|
|
||||||
```
|
|
||||||
env = gym.make('Ant-v3', ctrl_cost_weight=0.1, ...)
|
|
||||||
```
|
|
||||||
|
|
||||||
| Parameter | Type | Default |Description |
|
|
||||||
|-------------------------|------------|--------------|-------------------------------|
|
|
||||||
| `xml_file` | **str** | `"ant.xml"` | Path to a MuJoCo model |
|
|
||||||
| `ctrl_cost_weight` | **float** | `0.5` | Weight for *ctrl_cost* term (see section on reward) |
|
|
||||||
| `contact_cost_weight` | **float** | `5e-4` | Weight for *contact_cost* term (see section on reward) |
|
|
||||||
| `healthy_reward` | **float** | `1` | Constant reward given if the ant is "healthy" after timestep |
|
|
||||||
| `terminate_when_unhealthy` | **bool**| `True` | If true, issue a done signal if the z-coordinate of the torso is no longer in the `healthy_z_range` |
|
|
||||||
| `healthy_z_range` | **tuple** | `(0.2, 1)` | The ant is considered healthy if the z-coordinate of the torso is in this range |
|
|
||||||
| `contact_force_range` | **tuple** | `(-1, 1)` | Contact forces are clipped to this range in the computation of *contact_cost* |
|
|
||||||
| `reset_noise_scale` | **float** | `0.1` | Scale of random perturbations of initial position and velocity (see section on Starting State) |
|
|
||||||
| `exclude_current_positions_from_observation`| **bool** | `True`| Whether or not to omit the x- and y-coordinates from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies |
|
|
||||||
|
|
||||||
### Version History
|
|
||||||
|
|
||||||
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
|
||||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
|
||||||
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
|
||||||
* v0: Initial versions release (1.0.0)
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
xml_file="ant.xml",
|
xml_file="ant.xml",
|
||||||
@@ -199,7 +38,7 @@ class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
|||||||
exclude_current_positions_from_observation
|
exclude_current_positions_from_observation
|
||||||
)
|
)
|
||||||
|
|
||||||
mujoco_env.MujocoEnv.__init__(self, xml_file, 5)
|
mujoco_env.MujocoEnv.__init__(self, xml_file, 5, mujoco_bindings="mujoco_py")
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def healthy_reward(self):
|
def healthy_reward(self):
|
||||||
|
308
gym/envs/mujoco/ant_v4.py
Normal file
308
gym/envs/mujoco/ant_v4.py
Normal file
@@ -0,0 +1,308 @@
|
|||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from gym import utils
|
||||||
|
from gym.envs.mujoco import mujoco_env
|
||||||
|
|
||||||
|
DEFAULT_CAMERA_CONFIG = {
|
||||||
|
"distance": 4.0,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
|
"""
|
||||||
|
### Description
|
||||||
|
|
||||||
|
This environment is based on the environment introduced by Schulman,
|
||||||
|
Moritz, Levine, Jordan and Abbeel in ["High-Dimensional Continuous Control
|
||||||
|
Using Generalized Advantage Estimation"](https://arxiv.org/abs/1506.02438).
|
||||||
|
The ant is a 3D robot consisting of one torso (free rotational body) with
|
||||||
|
four legs attached to it with each leg having two links. The goal is to
|
||||||
|
coordinate the four legs to move in the forward (right) direction by applying
|
||||||
|
torques on the eight hinges connecting the two links of each leg and the torso
|
||||||
|
(nine parts and eight hinges).
|
||||||
|
|
||||||
|
### Action Space
|
||||||
|
The agent take a 8-element vector for actions.
|
||||||
|
|
||||||
|
The action space is a continuous `(action, action, action, action, action, action,
|
||||||
|
action, action)` all in `[-1, 1]`, where `action` represents the numerical torques
|
||||||
|
applied at the hinge joints.
|
||||||
|
|
||||||
|
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||||
|
|-----|----------------------|---------------|----------------|---------------------------------------|-------|------|
|
||||||
|
| 0 | Torque applied on the rotor between the torso and front left hip | -1 | 1 | hip_1 (front_left_leg) | hinge | torque (N m) |
|
||||||
|
| 1 | Torque applied on the rotor between the front left two links | -1 | 1 | angle_1 (front_left_leg) | hinge | torque (N m) |
|
||||||
|
| 2 | Torque applied on the rotor between the torso and front right hip | -1 | 1 | hip_2 (front_right_leg) | hinge | torque (N m) |
|
||||||
|
| 3 | Torque applied on the rotor between the front right two links | -1 | 1 | angle_2 (front_right_leg) | hinge | torque (N m) |
|
||||||
|
| 4 | Torque applied on the rotor between the torso and back left hip | -1 | 1 | hip_3 (back_leg) | hinge | torque (N m) |
|
||||||
|
| 5 | Torque applied on the rotor between the back left two links | -1 | 1 | angle_3 (back_leg) | hinge | torque (N m) |
|
||||||
|
| 6 | Torque applied on the rotor between the torso and back right hip | -1 | 1 | hip_4 (right_back_leg) | hinge | torque (N m) |
|
||||||
|
| 7 | Torque applied on the rotor between the back right two links | -1 | 1 | angle_4 (right_back_leg) | hinge | torque (N m) |
|
||||||
|
|
||||||
|
### Observation Space
|
||||||
|
|
||||||
|
The state space consists of positional values of different body parts of the ant,
|
||||||
|
followed by the velocities of those individual parts (their derivatives) with all
|
||||||
|
the positions ordered before all the velocities.
|
||||||
|
|
||||||
|
The observation is a `ndarray` with shape `(111,)` where the elements correspond to the following:
|
||||||
|
|
||||||
|
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|
||||||
|
|-----|-------------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
|
||||||
|
| 0 | x-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) |
|
||||||
|
| 1 | y-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) |
|
||||||
|
| 2 | z-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) |
|
||||||
|
| 3 | x-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
|
||||||
|
| 4 | y-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
|
||||||
|
| 5 | z-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
|
||||||
|
| 6 | w-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
|
||||||
|
| 7 | angle between torso and first link on front left | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) |
|
||||||
|
| 8 | angle between the two links on the front left | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) |
|
||||||
|
| 9 | angle between torso and first link on front right | -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) |
|
||||||
|
| 10 | angle between the two links on the front right | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) |
|
||||||
|
| 11 | angle between torso and first link on back left | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) |
|
||||||
|
| 12 | angle between the two links on the back left | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) |
|
||||||
|
| 13 | angle between torso and first link on back right | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) |
|
||||||
|
| 14 | angle between the two links on the back right | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) |
|
||||||
|
| 15 | x-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) |
|
||||||
|
| 16 | y-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) |
|
||||||
|
| 17 | z-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) |
|
||||||
|
| 18 | x-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) |
|
||||||
|
| 19 | y-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) |
|
||||||
|
| 20 | z-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) |
|
||||||
|
| 21 | angular velocity of angle between torso and front left link | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) |
|
||||||
|
| 22 | angular velocity of the angle between front left links | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) |
|
||||||
|
| 23 | angular velocity of angle between torso and front right link| -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) |
|
||||||
|
| 24 | angular velocity of the angle between front right links | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) |
|
||||||
|
| 25 | angular velocity of angle between torso and back left link | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) |
|
||||||
|
| 26 | angular velocity of the angle between back left links | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) |
|
||||||
|
| 27 | angular velocity of angle between torso and back right link | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) |
|
||||||
|
| 28 |angular velocity of the angle between back right links | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) |
|
||||||
|
|
||||||
|
|
||||||
|
The remaining 14*6 = 84 elements in the state are contact forces
|
||||||
|
(external forces - force x, y, z and torque x, y, z) applied to the
|
||||||
|
center of mass of each of the links. The 14 links are: the ground link,
|
||||||
|
the torso link, and 3 links for each leg (1 + 1 + 12) with the 6 external forces.
|
||||||
|
|
||||||
|
The (x,y,z) coordinates are translational DOFs while the orientations are rotational
|
||||||
|
DOFs expressed as quaternions. One can read more about free joints on the [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html).
|
||||||
|
|
||||||
|
**Note:** There are 29 elements in the table above - giving rise to `(113,)` elements
|
||||||
|
in the state space. In practice (and Gym implementation), the first two positional
|
||||||
|
elements are omitted from the state space since the reward function is calculated based
|
||||||
|
on the x-coordinate value. This value is hidden from the algorithm, which in turn has to
|
||||||
|
develop an abstract understanding of it from the observed rewards. Therefore, observation
|
||||||
|
space has shape `(111,)` instead of `(113,)` and the table should not have the first two rows.
|
||||||
|
|
||||||
|
**Note:** Ant-v4 environment no longer has the following contact forces issue.
|
||||||
|
If using previous Ant versions from v4, there have been reported issues that using a Mujoco-Py version > 2.0 results
|
||||||
|
in the contact forces always being 0. As such we recommend to use a Mujoco-Py version < 2.0
|
||||||
|
when using the Ant environment if you would like to report results with contact forces (if
|
||||||
|
contact forces are not used in your experiments, you can use version > 2.0).
|
||||||
|
|
||||||
|
**Note:** Ant-v4 has the option of including contact forces in the observation space. To add contact forces set the argument
|
||||||
|
'use_contact_forces" to True. The default value is False. Also note that training including contact forces can perform worse
|
||||||
|
than not using them as shown in (https://github.com/openai/gym/pull/2762).
|
||||||
|
|
||||||
|
### Rewards
|
||||||
|
The reward consists of three parts:
|
||||||
|
- *survive_reward*: Every timestep that the ant is alive, it gets a reward of 1.
|
||||||
|
- *forward_reward*: A reward of moving forward which is measured as
|
||||||
|
*(x-coordinate before action - x-coordinate after action)/dt*. *dt* is the time
|
||||||
|
between actions and is dependent on the frame_skip parameter (default is 5),
|
||||||
|
where the *dt* for one frame is 0.01 - making the default *dt = 5 * 0.01 = 0.05*.
|
||||||
|
This reward would be positive if the ant moves forward (right) desired.
|
||||||
|
- *ctrl_cost*: A negative reward for penalising the ant if it takes actions
|
||||||
|
that are too large. It is measured as *coefficient **x** sum(action<sup>2</sup>)*
|
||||||
|
where *coefficient* is a parameter set for the control and has a default value of 0.5.
|
||||||
|
- *contact_cost*: A negative reward for penalising the ant if the external contact
|
||||||
|
force is too large. It is calculated *0.5 * 0.001 * sum(clip(external contact
|
||||||
|
force to [-1,1])<sup>2</sup>)*.
|
||||||
|
|
||||||
|
The total reward returned is ***reward*** *=* *alive survive_reward + forward_reward - ctrl_cost - contact_cost*
|
||||||
|
|
||||||
|
### Starting State
|
||||||
|
All observations start in state
|
||||||
|
(0.0, 0.0, 0.75, 1.0, 0.0 ... 0.0) with a uniform noise in the range
|
||||||
|
of [-0.1, 0.1] added to the positional values and standard normal noise
|
||||||
|
with 0 mean and 0.1 standard deviation added to the velocity values for
|
||||||
|
stochasticity. Note that the initial z coordinate is intentionally selected
|
||||||
|
to be slightly high, thereby indicating a standing up ant. The initial orientation
|
||||||
|
is designed to make it face forward as well.
|
||||||
|
|
||||||
|
### Episode Termination
|
||||||
|
The episode terminates when any of the following happens:
|
||||||
|
|
||||||
|
1. The episode duration reaches a 1000 timesteps
|
||||||
|
2. Any of the state space values is no longer finite
|
||||||
|
3. The y-orientation (index 2) in the state is **not** in the range `[0.2, 1.0]`
|
||||||
|
|
||||||
|
### Arguments
|
||||||
|
|
||||||
|
No additional arguments are currently supported (in v2 and lower), but modifications
|
||||||
|
can be made to the XML file in the assets folder (or by changing the path to a modified
|
||||||
|
XML file in another folder).
|
||||||
|
|
||||||
|
```
|
||||||
|
env = gym.make('Ant-v2')
|
||||||
|
```
|
||||||
|
|
||||||
|
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||||
|
|
||||||
|
```
|
||||||
|
env = gym.make('Ant-v3', ctrl_cost_weight=0.1, ...)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Version History
|
||||||
|
* v4: all mujoco environments now use the mujoco binidings in mujoco>=2.1.3
|
||||||
|
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
||||||
|
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||||
|
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
||||||
|
* v0: Initial versions release (1.0.0)
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
    self,
    xml_file="ant.xml",
    ctrl_cost_weight=0.5,
    use_contact_forces=False,
    contact_cost_weight=5e-4,
    healthy_reward=1.0,
    terminate_when_unhealthy=True,
    healthy_z_range=(0.2, 1.0),
    contact_force_range=(-1.0, 1.0),
    reset_noise_scale=0.1,
    exclude_current_positions_from_observation=True,
):
    """Create an Ant environment.

    The arguments mirror the reward/termination knobs described in the
    class docstring; they are stored on the instance and read by `step`,
    `is_healthy`, `_get_obs` and `reset_model`.
    """
    # EzPickle captures the constructor arguments so the env can be pickled.
    utils.EzPickle.__init__(**locals())

    # Reward / cost weights.
    self._ctrl_cost_weight = ctrl_cost_weight
    self._contact_cost_weight = contact_cost_weight
    self._healthy_reward = healthy_reward

    # Health / termination criteria.
    self._terminate_when_unhealthy = terminate_when_unhealthy
    self._healthy_z_range = healthy_z_range

    # Observation and reset configuration.
    self._contact_force_range = contact_force_range
    self._reset_noise_scale = reset_noise_scale
    self._use_contact_forces = use_contact_forces
    self._exclude_current_positions_from_observation = (
        exclude_current_positions_from_observation
    )

    mujoco_env.MujocoEnv.__init__(self, xml_file, 5)
|
||||||
|
|
||||||
|
@property
def healthy_reward(self):
    """Survival bonus.

    Paid while the ant is healthy; when `terminate_when_unhealthy` is
    disabled the bonus is paid unconditionally (the episode never ends
    for health reasons, matching the original expression).
    """
    alive = self.is_healthy or self._terminate_when_unhealthy
    return float(alive) * self._healthy_reward
|
||||||
|
|
||||||
|
def control_cost(self, action):
    """Return the quadratic actuation penalty for ``action``.

    Computed as ``ctrl_cost_weight * sum(action ** 2)``.
    """
    return self._ctrl_cost_weight * np.sum(np.square(action))
|
||||||
|
|
||||||
|
@property
def contact_forces(self):
    """External contact forces (``cfrc_ext``), elementwise clipped to
    ``contact_force_range``."""
    lower, upper = self._contact_force_range
    return np.clip(self.data.cfrc_ext, lower, upper)
|
||||||
|
|
||||||
|
@property
def contact_cost(self):
    """Penalty proportional to the sum of squared (clipped) contact forces."""
    squared = np.square(self.contact_forces)
    return self._contact_cost_weight * np.sum(squared)
|
||||||
|
|
||||||
|
@property
def is_healthy(self):
    """True while every state value is finite and the torso height
    (state index 2) lies within ``healthy_z_range``."""
    state = self.state_vector()
    if not np.isfinite(state).all():
        return False
    min_z, max_z = self._healthy_z_range
    return min_z <= state[2] <= max_z
|
||||||
|
|
||||||
|
@property
def done(self):
    """Episode-termination flag: an unhealthy state ends the episode only
    when ``terminate_when_unhealthy`` is enabled."""
    if self._terminate_when_unhealthy:
        return not self.is_healthy
    return False
|
||||||
|
|
||||||
|
def step(self, action):
    """Advance the simulation by one control step.

    Returns ``(observation, reward, done, info)`` with
    ``reward = forward_reward + healthy_reward - ctrl_cost``
    (minus ``contact_cost`` as well when ``use_contact_forces`` is set).
    Forward reward is the torso's x-velocity over the step.
    """
    xy_position_before = self.get_body_com("torso")[:2].copy()
    self.do_simulation(action, self.frame_skip)
    xy_position_after = self.get_body_com("torso")[:2].copy()

    xy_velocity = (xy_position_after - xy_position_before) / self.dt
    x_velocity, y_velocity = xy_velocity

    forward_reward = x_velocity
    healthy_reward = self.healthy_reward

    rewards = forward_reward + healthy_reward

    costs = ctrl_cost = self.control_cost(action)

    done = self.done
    observation = self._get_obs()
    info = {
        "reward_forward": forward_reward,
        "reward_ctrl": -ctrl_cost,
        "reward_survive": healthy_reward,
        "x_position": xy_position_after[0],
        "y_position": xy_position_after[1],
        "distance_from_origin": np.linalg.norm(xy_position_after, ord=2),
        "x_velocity": x_velocity,
        "y_velocity": y_velocity,
        "forward_reward": forward_reward,
    }
    if self._use_contact_forces:
        contact_cost = self.contact_cost
        costs += contact_cost
        # BUG FIX: this previously overwrote info["reward_ctrl"], silently
        # discarding the control-cost entry; record the contact penalty
        # under its own key instead.
        info["reward_contact"] = -contact_cost

    reward = rewards - costs

    return observation, reward, done, info
|
||||||
|
|
||||||
|
def _get_obs(self):
|
||||||
|
position = self.data.qpos.flat.copy()
|
||||||
|
velocity = self.data.qvel.flat.copy()
|
||||||
|
|
||||||
|
if self._exclude_current_positions_from_observation:
|
||||||
|
position = position[2:]
|
||||||
|
|
||||||
|
if self._use_contact_forces:
|
||||||
|
contact_force = self.contact_forces.flat.copy()
|
||||||
|
return np.concatenate((position, velocity, contact_force))
|
||||||
|
else:
|
||||||
|
return np.concatenate((position, velocity))
|
||||||
|
|
||||||
|
def reset_model(self):
    """Reset qpos/qvel to the initial pose plus small random noise and
    return the first observation.

    qpos receives uniform noise in [-reset_noise_scale, reset_noise_scale];
    qvel receives standard-normal noise scaled by reset_noise_scale.
    """
    scale = self._reset_noise_scale
    qpos = self.init_qpos + self.np_random.uniform(
        low=-scale, high=scale, size=self.model.nq
    )
    qvel = self.init_qvel + scale * self.np_random.standard_normal(self.model.nv)
    self.set_state(qpos, qvel)

    return self._get_obs()
|
||||||
|
|
||||||
|
def viewer_setup(self):
    """Apply DEFAULT_CAMERA_CONFIG to the viewer camera; array-valued
    entries are copied into the existing attribute in place, scalar
    entries are assigned directly."""
    camera = self.viewer.cam
    for name, value in DEFAULT_CAMERA_CONFIG.items():
        if isinstance(value, np.ndarray):
            getattr(camera, name)[:] = value
        else:
            setattr(camera, name, value)
|
@@ -22,6 +22,6 @@
|
|||||||
</body>
|
</body>
|
||||||
</worldbody>
|
</worldbody>
|
||||||
<actuator>
|
<actuator>
|
||||||
<motor gear="100" joint="slider" name="slide"/>
|
<motor ctrllimited="true" ctrlrange="-3 3" gear="100" joint="slider" name="slide"/>
|
||||||
</actuator>
|
</actuator>
|
||||||
</mujoco>
|
</mujoco>
|
@@ -6,7 +6,9 @@ from gym.envs.mujoco import mujoco_env
|
|||||||
|
|
||||||
class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
mujoco_env.MujocoEnv.__init__(self, "half_cheetah.xml", 5)
|
mujoco_env.MujocoEnv.__init__(
|
||||||
|
self, "half_cheetah.xml", 5, mujoco_bindings="mujoco_py"
|
||||||
|
)
|
||||||
utils.EzPickle.__init__(self)
|
utils.EzPickle.__init__(self)
|
||||||
|
|
||||||
def step(self, action):
|
def step(self, action):
|
||||||
|
@@ -10,125 +10,6 @@ DEFAULT_CAMERA_CONFIG = {
|
|||||||
|
|
||||||
|
|
||||||
class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
"""
|
|
||||||
### Description
|
|
||||||
|
|
||||||
This environment is based on the work by P. Wawrzyński in
|
|
||||||
["A Cat-Like Robot Real-Time Learning to Run"](http://staff.elka.pw.edu.pl/~pwawrzyn/pub-s/0812_LSCLRR.pdf).
|
|
||||||
The HalfCheetah is a 2-dimensional robot consisting of 9 links and 8
|
|
||||||
joints connecting them (including two paws). The goal is to apply a torque
|
|
||||||
on the joints to make the cheetah run forward (right) as fast as possible,
|
|
||||||
with a positive reward allocated based on the distance moved forward and a
|
|
||||||
negative reward allocated for moving backward. The torso and head of the
|
|
||||||
cheetah are fixed, and the torque can only be applied on the other 6 joints
|
|
||||||
over the front and back thighs (connecting to the torso), shins
|
|
||||||
(connecting to the thighs) and feet (connecting to the shins).
|
|
||||||
|
|
||||||
### Action Space
|
|
||||||
The action space is a `Box(-1, 1, (6,), float32)`. An action represents the torques applied between *links*.
|
|
||||||
|
|
||||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
|
||||||
|-------|--------------------------------------|---------------|----------------|---------------------------------------|-------|------|
|
|
||||||
| 0 | Torque applied on the back thigh rotor | -1 | 1 | bthigh | hinge | torque (N m) |
|
|
||||||
| 1 | Torque applied on the back shin rotor | -1 | 1 | bshin | hinge | torque (N m) |
|
|
||||||
| 2 | Torque applied on the back foot rotor | -1 | 1 | bfoot | hinge | torque (N m) |
|
|
||||||
| 3 | Torque applied on the front thigh rotor| -1 | 1 | fthigh | hinge | torque (N m) |
|
|
||||||
| 4 | Torque applied on the front shin rotor | -1 | 1 | fshin | hinge | torque (N m) |
|
|
||||||
| 5 | Torque applied on the front foot rotor | -1 | 1 | ffoot | hinge | torque (N m) |
|
|
||||||
|
|
||||||
### Observation Space
|
|
||||||
|
|
||||||
Observations consist of positional values of different body parts of the
|
|
||||||
cheetah, followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities.
|
|
||||||
|
|
||||||
By default, observations do not include the x-coordinate of the cheetah's center of mass. It may
|
|
||||||
be included by passing `exclude_current_positions_from_observation=False` during construction.
|
|
||||||
In that case, the observation space will have 18 dimensions where the first dimension
|
|
||||||
represents the x-coordinate of the cheetah's center of mass.
|
|
||||||
Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x-coordinate
|
|
||||||
will be returned in `info` with key `"x_position"`.
|
|
||||||
|
|
||||||
However, by default, the observation is a `ndarray` with shape `(17,)` where the elements correspond to the following:
|
|
||||||
|
|
||||||
|
|
||||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
|
||||||
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
|
||||||
| 0 | z-coordinate of the front tip | -Inf | Inf | rootz | slide | position (m) |
|
|
||||||
| 1 | angle of the front tip | -Inf | Inf | rooty | hinge | angle (rad) |
|
|
||||||
| 2 | angle of the second rotor | -Inf | Inf | bthigh | hinge | angle (rad) |
|
|
||||||
| 3 | angle of the second rotor | -Inf | Inf | bshin | hinge | angle (rad) |
|
|
||||||
| 4 | velocity of the tip along the x-axis | -Inf | Inf | bfoot | hinge | angle (rad) |
|
|
||||||
| 5 | velocity of the tip along the y-axis | -Inf | Inf | fthigh | hinge | angle (rad) |
|
|
||||||
| 6 | angular velocity of front tip | -Inf | Inf | fshin | hinge | angle (rad) |
|
|
||||||
| 7 | angular velocity of second rotor | -Inf | Inf | ffoot | hinge | angle (rad) |
|
|
||||||
| 8 | x-coordinate of the front tip | -Inf | Inf | rootx | slide | velocity (m/s) |
|
|
||||||
| 9 | y-coordinate of the front tip | -Inf | Inf | rootz | slide | velocity (m/s) |
|
|
||||||
| 10 | angle of the front tip | -Inf | Inf | rooty | hinge | angular velocity (rad/s) |
|
|
||||||
| 11 | angle of the second rotor | -Inf | Inf | bthigh | hinge | angular velocity (rad/s) |
|
|
||||||
| 12 | angle of the second rotor | -Inf | Inf | bshin | hinge | angular velocity (rad/s) |
|
|
||||||
| 13 | velocity of the tip along the x-axis | -Inf | Inf | bfoot | hinge | angular velocity (rad/s) |
|
|
||||||
| 14 | velocity of the tip along the y-axis | -Inf | Inf | fthigh | hinge |angular velocity (rad/s) |
|
|
||||||
| 15 | angular velocity of front tip | -Inf | Inf | fshin | hinge | angular velocity (rad/s) |
|
|
||||||
| 16 | angular velocity of second rotor | -Inf | Inf | ffoot | hinge | angular velocity (rad/s) |
|
|
||||||
|
|
||||||
### Rewards
|
|
||||||
The reward consists of two parts:
|
|
||||||
- *forward_reward*: A reward of moving forward which is measured
|
|
||||||
as *`forward_reward_weight` * (x-coordinate before action - x-coordinate after action)/dt*. *dt* is
|
|
||||||
the time between actions and is dependent on the frame_skip parameter
|
|
||||||
(fixed to 5), where the frametime is 0.01 - making the
|
|
||||||
default *dt = 5 * 0.01 = 0.05*. This reward would be positive if the cheetah
|
|
||||||
runs forward (right).
|
|
||||||
- *ctrl_cost*: A cost for penalising the cheetah if it takes
|
|
||||||
actions that are too large. It is measured as *`ctrl_cost_weight` *
|
|
||||||
sum(action<sup>2</sup>)* where *`ctrl_cost_weight`* is a parameter set for the
|
|
||||||
control and has a default value of 0.1
|
|
||||||
|
|
||||||
The total reward returned is ***reward*** *=* *forward_reward - ctrl_cost* and `info` will also contain the individual reward terms
|
|
||||||
|
|
||||||
### Starting State
|
|
||||||
All observations start in state (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
|
|
||||||
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,) with a noise added to the
|
|
||||||
initial state for stochasticity. As seen before, the first 8 values in the
|
|
||||||
state are positional and the last 9 values are velocity. A uniform noise in
|
|
||||||
the range of [-`reset_noise_scale`, `reset_noise_scale`] is added to the positional values while a standard
|
|
||||||
normal noise with a mean of 0 and standard deviation of `reset_noise_scale` is added to the
|
|
||||||
initial velocity values of all zeros.
|
|
||||||
|
|
||||||
### Episode Termination
|
|
||||||
The episode terminates when the episode length is greater than 1000.
|
|
||||||
|
|
||||||
### Arguments
|
|
||||||
|
|
||||||
No additional arguments are currently supported in v2 and lower.
|
|
||||||
|
|
||||||
```
|
|
||||||
env = gym.make('HalfCheetah-v2')
|
|
||||||
```
|
|
||||||
|
|
||||||
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
|
||||||
|
|
||||||
```
|
|
||||||
env = gym.make('HalfCheetah-v3', ctrl_cost_weight=0.1, ....)
|
|
||||||
```
|
|
||||||
|
|
||||||
| Parameter | Type | Default |Description |
|
|
||||||
|-------------------------|------------|----------------------|-------------------------------|
|
|
||||||
| `xml_file` | **str** | `"half_cheetah.xml"` | Path to a MuJoCo model |
|
|
||||||
| `forward_reward_weight` | **float** | `1.0` | Weight for *forward_reward* term (see section on reward) |
|
|
||||||
| `ctrl_cost_weight` | **float** | `0.1` | Weight for *ctrl_cost* weight (see section on reward) |
|
|
||||||
| `reset_noise_scale` | **float** | `0.1` | Scale of random perturbations of initial position and velocity (see section on Starting State) |
|
|
||||||
| `exclude_current_positions_from_observation`| **bool** | `True` | Whether or not to omit the x-coordinate from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies |
|
|
||||||
|
|
||||||
|
|
||||||
### Version History
|
|
||||||
|
|
||||||
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
|
||||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
|
||||||
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
|
||||||
* v0: Initial versions release (1.0.0)
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
xml_file="half_cheetah.xml",
|
xml_file="half_cheetah.xml",
|
||||||
@@ -149,7 +30,7 @@ class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
|||||||
exclude_current_positions_from_observation
|
exclude_current_positions_from_observation
|
||||||
)
|
)
|
||||||
|
|
||||||
mujoco_env.MujocoEnv.__init__(self, xml_file, 5)
|
mujoco_env.MujocoEnv.__init__(self, xml_file, 5, mujoco_bindings="mujoco_py")
|
||||||
|
|
||||||
def control_cost(self, action):
|
def control_cost(self, action):
|
||||||
control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
|
control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
|
||||||
|
230
gym/envs/mujoco/half_cheetah_v4.py
Normal file
230
gym/envs/mujoco/half_cheetah_v4.py
Normal file
@@ -0,0 +1,230 @@
|
|||||||
|
__credits__ = ["Rushiv Arora"]
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from gym import utils
|
||||||
|
from gym.envs.mujoco import mujoco_env
|
||||||
|
|
||||||
|
# Camera settings applied onto `viewer.cam` attributes by
# `HalfCheetahEnv.viewer_setup`.
DEFAULT_CAMERA_CONFIG = {"distance": 4.0}
|
||||||
|
|
||||||
|
|
||||||
|
class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle):
    """
    ### Description

    This environment is based on the work by P. Wawrzyński in
    ["A Cat-Like Robot Real-Time Learning to Run"](http://staff.elka.pw.edu.pl/~pwawrzyn/pub-s/0812_LSCLRR.pdf).
    The HalfCheetah is a 2-dimensional robot consisting of 9 links and 8
    joints connecting them (including two paws). The goal is to apply a torque
    on the joints to make the cheetah run forward (right) as fast as possible,
    with a positive reward allocated based on the distance moved forward and a
    negative reward allocated for moving backward. The torso and head of the
    cheetah are fixed, and the torque can only be applied on the other 6 joints
    over the front and back thighs (connecting to the torso), shins
    (connecting to the thighs) and feet (connecting to the shins).

    ### Action Space
    The action space is a `Box(-1, 1, (6,), float32)`. An action represents the
    torques applied between *links*.

    | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
    |-------|--------------------------------------|---------------|----------------|---------------------------------------|-------|------|
    | 0 | Torque applied on the back thigh rotor | -1 | 1 | bthigh | hinge | torque (N m) |
    | 1 | Torque applied on the back shin rotor | -1 | 1 | bshin | hinge | torque (N m) |
    | 2 | Torque applied on the back foot rotor | -1 | 1 | bfoot | hinge | torque (N m) |
    | 3 | Torque applied on the front thigh rotor| -1 | 1 | fthigh | hinge | torque (N m) |
    | 4 | Torque applied on the front shin rotor | -1 | 1 | fshin | hinge | torque (N m) |
    | 5 | Torque applied on the front foot rotor | -1 | 1 | ffoot | hinge | torque (N m) |

    ### Observation Space

    Observations consist of positional values of different body parts of the
    cheetah, followed by the velocities of those individual parts (their
    derivatives), with all the positions ordered before all the velocities.

    By default (`exclude_current_positions_from_observation=True`) the
    x-coordinate of the front tip (`rootx`) is omitted from the observation,
    which is then a `ndarray` with shape `(17,)`. Passing
    `exclude_current_positions_from_observation=False` prepends that
    x-coordinate, giving shape `(18,)`. The x-coordinate is always reported
    in `info` under the key `"x_position"`.

    With the default settings the elements correspond to the following:

    | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
    |-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
    | 0 | z-coordinate of the front tip | -Inf | Inf | rootz | slide | position (m) |
    | 1 | angle of the front tip | -Inf | Inf | rooty | hinge | angle (rad) |
    | 2 | angle of the back thigh | -Inf | Inf | bthigh | hinge | angle (rad) |
    | 3 | angle of the back shin | -Inf | Inf | bshin | hinge | angle (rad) |
    | 4 | angle of the back foot | -Inf | Inf | bfoot | hinge | angle (rad) |
    | 5 | angle of the front thigh | -Inf | Inf | fthigh | hinge | angle (rad) |
    | 6 | angle of the front shin | -Inf | Inf | fshin | hinge | angle (rad) |
    | 7 | angle of the front foot | -Inf | Inf | ffoot | hinge | angle (rad) |
    | 8 | velocity of the front tip along the x-axis | -Inf | Inf | rootx | slide | velocity (m/s) |
    | 9 | velocity of the front tip along the z-axis | -Inf | Inf | rootz | slide | velocity (m/s) |
    | 10 | angular velocity of the front tip | -Inf | Inf | rooty | hinge | angular velocity (rad/s) |
    | 11 | angular velocity of the back thigh | -Inf | Inf | bthigh | hinge | angular velocity (rad/s) |
    | 12 | angular velocity of the back shin | -Inf | Inf | bshin | hinge | angular velocity (rad/s) |
    | 13 | angular velocity of the back foot | -Inf | Inf | bfoot | hinge | angular velocity (rad/s) |
    | 14 | angular velocity of the front thigh | -Inf | Inf | fthigh | hinge | angular velocity (rad/s) |
    | 15 | angular velocity of the front shin | -Inf | Inf | fshin | hinge | angular velocity (rad/s) |
    | 16 | angular velocity of the front foot | -Inf | Inf | ffoot | hinge | angular velocity (rad/s) |

    ### Rewards
    The reward consists of two parts:
    - *forward_reward*: A reward for moving forward, measured as
    *`forward_reward_weight` * (x-coordinate after action - x-coordinate before action)/dt*.
    *dt* is the time between actions and depends on the frame_skip parameter
    (fixed to 5), where the frametime is 0.01 - making the
    default *dt = 5 * 0.01 = 0.05*. This reward is positive when the cheetah
    runs forward (right).
    - *ctrl_cost*: A cost penalising the cheetah for taking actions that are
    too large. It is measured as *`ctrl_cost_weight` * sum(action<sup>2</sup>)*
    where *`ctrl_cost_weight`* is a parameter set for the control and has a
    default value of 0.1.

    The total reward returned is ***reward*** *=* *forward_reward - ctrl_cost*
    and `info` also contains the individual reward terms
    (`"reward_run"` and `"reward_ctrl"`).

    ### Starting State
    All observations start in state (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
    0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,) with a noise added to the
    initial state for stochasticity. As seen before, the first 8 values in the
    state are positional and the last 9 values are velocity. A uniform noise in
    the range of [-`reset_noise_scale`, `reset_noise_scale`] is added to the
    positional values while a standard normal noise with a mean of 0 and
    standard deviation of `reset_noise_scale` is added to the initial velocity
    values of all zeros.

    ### Episode Termination
    The episode terminates when the episode length is greater than 1000.

    ### Arguments

    No additional arguments are currently supported (in v2 and lower), but
    modifications can be made to the XML file in the assets folder at
    `gym/envs/mujoco/assets/half_cheetah.xml` (or by changing the path to a
    modified XML file in another folder).

    ```
    env = gym.make('HalfCheetah-v2')
    ```

    v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.

    ```
    env = gym.make('HalfCheetah-v3', ctrl_cost_weight=0.1, ...)
    ```

    ### Version History

    * v4: all mujoco environments now use the mujoco bindings in mujoco>=2.1.3
    * v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
    * v2: All continuous control environments now use mujoco_py >= 1.50
    * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
    * v0: Initial versions release (1.0.0)
    """

    def __init__(
        self,
        xml_file="half_cheetah.xml",
        forward_reward_weight=1.0,
        ctrl_cost_weight=0.1,
        reset_noise_scale=0.1,
        exclude_current_positions_from_observation=True,
    ):
        """Create a HalfCheetah environment; arguments are the reward and
        reset knobs described in the class docstring."""
        # EzPickle records the constructor arguments so the env can be pickled.
        utils.EzPickle.__init__(**locals())

        self._forward_reward_weight = forward_reward_weight
        self._ctrl_cost_weight = ctrl_cost_weight
        self._reset_noise_scale = reset_noise_scale
        self._exclude_current_positions_from_observation = (
            exclude_current_positions_from_observation
        )

        mujoco_env.MujocoEnv.__init__(self, xml_file, 5)

    def control_cost(self, action):
        """Quadratic actuation penalty: ``ctrl_cost_weight * sum(action**2)``."""
        control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
        return control_cost

    def step(self, action):
        """Advance the simulation by one control step.

        Returns ``(observation, reward, done, info)`` where
        ``reward = forward_reward - ctrl_cost`` and ``done`` is always
        False (episodes end only via the external time limit).
        """
        x_position_before = self.data.qpos[0]
        self.do_simulation(action, self.frame_skip)
        x_position_after = self.data.qpos[0]
        x_velocity = (x_position_after - x_position_before) / self.dt

        ctrl_cost = self.control_cost(action)

        forward_reward = self._forward_reward_weight * x_velocity

        observation = self._get_obs()
        reward = forward_reward - ctrl_cost
        done = False
        info = {
            "x_position": x_position_after,
            "x_velocity": x_velocity,
            "reward_run": forward_reward,
            "reward_ctrl": -ctrl_cost,
        }

        return observation, reward, done, info

    def _get_obs(self):
        """Concatenate qpos (optionally without the leading x-coordinate)
        and qvel into a flat observation vector."""
        position = self.data.qpos.flat.copy()
        velocity = self.data.qvel.flat.copy()

        if self._exclude_current_positions_from_observation:
            position = position[1:]

        observation = np.concatenate((position, velocity)).ravel()
        return observation

    def reset_model(self):
        """Reset qpos/qvel to the initial pose plus small random noise
        (uniform on positions, Gaussian on velocities) and return the
        first observation."""
        noise_low = -self._reset_noise_scale
        noise_high = self._reset_noise_scale

        qpos = self.init_qpos + self.np_random.uniform(
            low=noise_low, high=noise_high, size=self.model.nq
        )
        qvel = (
            self.init_qvel
            + self._reset_noise_scale * self.np_random.standard_normal(self.model.nv)
        )

        self.set_state(qpos, qvel)

        observation = self._get_obs()
        return observation

    def viewer_setup(self):
        """Apply DEFAULT_CAMERA_CONFIG to the viewer camera; array-valued
        entries are copied in place, scalars assigned directly."""
        for key, value in DEFAULT_CAMERA_CONFIG.items():
            if isinstance(value, np.ndarray):
                getattr(self.viewer.cam, key)[:] = value
            else:
                setattr(self.viewer.cam, key, value)
|
@@ -6,7 +6,9 @@ from gym.envs.mujoco import mujoco_env
|
|||||||
|
|
||||||
class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
mujoco_env.MujocoEnv.__init__(self, "hopper.xml", 4)
|
mujoco_env.MujocoEnv.__init__(
|
||||||
|
self, "hopper.xml", 4, mujoco_bindings="mujoco_py"
|
||||||
|
)
|
||||||
utils.EzPickle.__init__(self)
|
utils.EzPickle.__init__(self)
|
||||||
|
|
||||||
def step(self, a):
|
def step(self, a):
|
||||||
|
@@ -14,130 +14,6 @@ DEFAULT_CAMERA_CONFIG = {
|
|||||||
|
|
||||||
|
|
||||||
class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
"""
|
|
||||||
### Description
|
|
||||||
|
|
||||||
This environment is based on the work done by Erez, Tassa, and Todorov in
|
|
||||||
["Infinite Horizon Model Predictive Control for Nonlinear Periodic Tasks"](http://www.roboticsproceedings.org/rss07/p10.pdf). The environment aims to
|
|
||||||
increase the number of independent state and control variables as compared to
|
|
||||||
the classic control environments. The hopper is a two-dimensional
|
|
||||||
one-legged figure that consist of four main body parts - the torso at the
|
|
||||||
top, the thigh in the middle, the leg in the bottom, and a single foot on
|
|
||||||
which the entire body rests. The goal is to make hops that move in the
|
|
||||||
forward (right) direction by applying torques on the three hinges
|
|
||||||
connecting the four body parts.
|
|
||||||
|
|
||||||
### Action Space
|
|
||||||
The action space is a `Box(-1, 1, (3,), float32)`. An action represents the torques applied between *links*
|
|
||||||
|
|
||||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
|
||||||
|-----|------------------------------------|-------------|-------------|----------------------------------|-------|--------------|
|
|
||||||
| 0 | Torque applied on the thigh rotor | -1 | 1 | thigh_joint | hinge | torque (N m) |
|
|
||||||
| 1 | Torque applied on the leg rotor | -1 | 1 | leg_joint | hinge | torque (N m) |
|
|
||||||
| 3 | Torque applied on the foot rotor | -1 | 1 | foot_joint | hinge | torque (N m) |
|
|
||||||
|
|
||||||
### Observation Space
|
|
||||||
|
|
||||||
Observations consist of positional values of different body parts of the
|
|
||||||
hopper, followed by the velocities of those individual parts
|
|
||||||
(their derivatives) with all the positions ordered before all the velocities.
|
|
||||||
|
|
||||||
By default, observations do not include the x-coordinate of the hopper. It may
|
|
||||||
be included by passing `exclude_current_positions_from_observation=False` during construction.
|
|
||||||
In that case, the observation space will have 12 dimensions where the first dimension
|
|
||||||
represents the x-coordinate of the hopper.
|
|
||||||
Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x-coordinate
|
|
||||||
will be returned in `info` with key `"x_position"`.
|
|
||||||
|
|
||||||
However, by default, the observation is a `ndarray` with shape `(11,)` where the elements
|
|
||||||
correspond to the following:
|
|
||||||
|
|
||||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
|
||||||
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
|
||||||
| 0 | z-coordinate of the top (height of hopper) | -Inf | Inf | rootz | slide | position (m) |
|
|
||||||
| 1 | angle of the top | -Inf | Inf | rooty | hinge | angle (rad) |
|
|
||||||
| 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) |
|
|
||||||
| 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) |
|
|
||||||
| 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) |
|
|
||||||
| 5 | velocity of the x-coordinate of the top | -Inf | Inf | rootx | slide | velocity (m/s) |
|
|
||||||
| 6 | velocity of the z-coordinate (height) of the top | -Inf | Inf | rootz | slide | velocity (m/s) |
|
|
||||||
| 7 | angular velocity of the angle of the top | -Inf | Inf | rooty | hinge | angular velocity (rad/s) |
|
|
||||||
| 8 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) |
|
|
||||||
| 9 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) |
|
|
||||||
| 10 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) |
|
|
||||||
|
|
||||||
### Rewards
|
|
||||||
The reward consists of three parts:
|
|
||||||
- *healthy_reward*: Every timestep that the hopper is healthy (see definition in section "Episode Termination"), it gets a reward of fixed value `healthy_reward`.
|
|
||||||
- *forward_reward*: A reward of hopping forward which is measured
|
|
||||||
as *`forward_reward_weight` * (x-coordinate before action - x-coordinate after action)/dt*. *dt* is
|
|
||||||
the time between actions and is dependent on the frame_skip parameter
|
|
||||||
(fixed to 4), where the frametime is 0.002 - making the
|
|
||||||
default *dt = 4 * 0.002 = 0.008*. This reward would be positive if the hopper
|
|
||||||
hops forward (positive x direction).
|
|
||||||
- *ctrl_cost*: A cost for penalising the hopper if it takes
|
|
||||||
actions that are too large. It is measured as *`ctrl_cost_weight` *
|
|
||||||
sum(action<sup>2</sup>)* where *`ctrl_cost_weight`* is a parameter set for the
|
|
||||||
control and has a default value of 0.001
|
|
||||||
|
|
||||||
The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost* and `info` will also contain the individual reward terms
|
|
||||||
|
|
||||||
### Starting State
|
|
||||||
All observations start in state
|
|
||||||
(0.0, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) with a uniform noise
|
|
||||||
in the range of [-`reset_noise_scale`, `reset_noise_scale`] added to the values for stochasticity.
|
|
||||||
|
|
||||||
### Episode Termination
|
|
||||||
The hopper is said to be unhealthy if any of the following happens:
|
|
||||||
|
|
||||||
1. An element of `observation[1:]` (if `exclude_current_positions_from_observation=True`, else `observation[2:]`) is no longer contained in the closed interval specified by the argument `healthy_state_range`
|
|
||||||
2. The height of the hopper (`observation[0]` if `exclude_current_positions_from_observation=True`, else `observation[1]`) is no longer contained in the closed interval specified by the argument `healthy_z_range` (usually meaning that it has fallen)
|
|
||||||
3. The angle (`observation[1]` if `exclude_current_positions_from_observation=True`, else `observation[2]`) is no longer contained in the closed interval specified by the argument `healthy_angle_range`
|
|
||||||
|
|
||||||
If `terminate_when_unhealthy=True` is passed during construction (which is the default),
|
|
||||||
the episode terminates when any of the following happens:
|
|
||||||
|
|
||||||
1. The episode duration reaches a 1000 timesteps
|
|
||||||
2. The hopper is unhealthy
|
|
||||||
|
|
||||||
If `terminate_when_unhealthy=False` is passed, the episode is terminated only when 1000 timesteps are exceeded.
|
|
||||||
|
|
||||||
### Arguments
|
|
||||||
|
|
||||||
No additional arguments are currently supported in v2 and lower.
|
|
||||||
|
|
||||||
```
|
|
||||||
env = gym.make('Hopper-v2')
|
|
||||||
```
|
|
||||||
|
|
||||||
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
|
||||||
|
|
||||||
```
|
|
||||||
env = gym.make('Hopper-v3', ctrl_cost_weight=0.1, ....)
|
|
||||||
```
|
|
||||||
|
|
||||||
| Parameter | Type | Default |Description |
|
|
||||||
|-------------------------|------------|----------------|-------------------------------|
|
|
||||||
| `xml_file` | **str** | `"hopper.xml"` | Path to a MuJoCo model |
|
|
||||||
| `forward_reward_weight` | **float** | `1.0` | Weight for *forward_reward* term (see section on reward) |
|
|
||||||
| `ctrl_cost_weight` | **float** | `0.001` | Weight for *ctrl_cost* reward (see section on reward) |
|
|
||||||
| `healthy_reward` | **float** | `1` | Constant reward given if the ant is "healthy" after timestep |
|
|
||||||
| `terminate_when_unhealthy` | **bool**| `True` | If true, issue a done signal if the hopper is no longer healthy |
|
|
||||||
| `healthy_state_range` | **tuple** | `(-100, 100)` | The elements of `observation[1:]` (if `exclude_current_positions_from_observation=True`, else `observation[2:]`) must be in this range for the hopper to be considered healthy |
|
|
||||||
| `healthy_z_range` | **tuple** | `(0.7, float("inf"))` | The z-coordinate must be in this range for the hopper to be considered healthy |
|
|
||||||
| `healthy_angle_range` | **tuple** | `(-0.2, 0.2)` | The angle given by `observation[1]` (if `exclude_current_positions_from_observation=True`, else `observation[2]`) must be in this range for the hopper to be considered healthy |
|
|
||||||
| `reset_noise_scale` | **float** | `5e-3` | Scale of random perturbations of initial position and velocity (see section on Starting State) |
|
|
||||||
| `exclude_current_positions_from_observation`| **bool** | `True`| Whether or not to omit the x-coordinate from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies |
|
|
||||||
|
|
||||||
|
|
||||||
### Version History
|
|
||||||
|
|
||||||
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
|
||||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
|
||||||
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
|
||||||
* v0: Initial versions release (1.0.0)
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
xml_file="hopper.xml",
|
xml_file="hopper.xml",
|
||||||
@@ -170,7 +46,7 @@ class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
|||||||
exclude_current_positions_from_observation
|
exclude_current_positions_from_observation
|
||||||
)
|
)
|
||||||
|
|
||||||
mujoco_env.MujocoEnv.__init__(self, xml_file, 4)
|
mujoco_env.MujocoEnv.__init__(self, xml_file, 4, mujoco_bindings="mujoco_py")
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def healthy_reward(self):
|
def healthy_reward(self):
|
||||||
|
263
gym/envs/mujoco/hopper_v4.py
Normal file
263
gym/envs/mujoco/hopper_v4.py
Normal file
@@ -0,0 +1,263 @@
|
|||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from gym import utils
|
||||||
|
from gym.envs.mujoco import mujoco_env
|
||||||
|
|
||||||
|
DEFAULT_CAMERA_CONFIG = {
|
||||||
|
"trackbodyid": 2,
|
||||||
|
"distance": 3.0,
|
||||||
|
"lookat": np.array((0.0, 0.0, 1.15)),
|
||||||
|
"elevation": -20.0,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
|
"""
|
||||||
|
### Description
|
||||||
|
|
||||||
|
This environment is based on the work done by Erez, Tassa, and Todorov in
|
||||||
|
["Infinite Horizon Model Predictive Control for Nonlinear Periodic Tasks"](http://www.roboticsproceedings.org/rss07/p10.pdf). The environment aims to
|
||||||
|
increase the number of independent state and control variables as compared to
|
||||||
|
the classic control environments. The hopper is a two-dimensional
|
||||||
|
one-legged figure that consist of four main body parts - the torso at the
|
||||||
|
top, the thigh in the middle, the leg in the bottom, and a single foot on
|
||||||
|
which the entire body rests. The goal is to make hops that move in the
|
||||||
|
forward (right) direction by applying torques on the three hinges
|
||||||
|
connecting the four body parts.
|
||||||
|
|
||||||
|
### Action Space
|
||||||
|
The agent take a 3-element vector for actions.
|
||||||
|
The action space is a continuous `(action, action, action)` all in `[-1, 1]`
|
||||||
|
, where `action` represents the numerical torques applied between *links*
|
||||||
|
|
||||||
|
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||||
|
|-----|------------------------------------|-------------|-------------|----------------------------------|-------|--------------|
|
||||||
|
| 0 | Torque applied on the thigh rotor | -1 | 1 | thigh_joint | hinge | torque (N m) |
|
||||||
|
| 1 | Torque applied on the leg rotor | -1 | 1 | leg_joint | hinge | torque (N m) |
|
||||||
|
| 3 | Torque applied on the foot rotor | -1 | 1 | foot_joint | hinge | torque (N m) |
|
||||||
|
|
||||||
|
### Observation Space
|
||||||
|
|
||||||
|
The state space consists of positional values of different body parts of the
|
||||||
|
hopper, followed by the velocities of those individual parts
|
||||||
|
(their derivatives) with all the positions ordered before all the velocities.
|
||||||
|
|
||||||
|
The observation is a `ndarray` with shape `(11,)` where the elements
|
||||||
|
correspond to the following:
|
||||||
|
|
||||||
|
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
||||||
|
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
||||||
|
| 0 | x-coordinate of the top | -Inf | Inf | rootx | slide | position (m) |
|
||||||
|
| 1 | z-coordinate of the top (height of hopper) | -Inf | Inf | rootz | slide | position (m) |
|
||||||
|
| 2 | angle of the top | -Inf | Inf | rooty | hinge | angle (rad) |
|
||||||
|
| 3 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) |
|
||||||
|
| 4 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) |
|
||||||
|
| 5 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) |
|
||||||
|
| 6 | velocity of the x-coordinate of the top | -Inf | Inf | rootx | slide | velocity (m/s) |
|
||||||
|
| 7 | velocity of the z-coordinate (height) of the top | -Inf | Inf | rootz | slide | velocity (m/s) |
|
||||||
|
| 8 | angular velocity of the angle of the top | -Inf | Inf | rooty | hinge | angular velocity (rad/s) |
|
||||||
|
| 9 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 10 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 11 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) |
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
**Note:**
|
||||||
|
In practice (and Gym implementation), the first positional element is
|
||||||
|
omitted from the state space since the reward function is calculated based
|
||||||
|
on that value. This value is hidden from the algorithm, which in turn has
|
||||||
|
to develop an abstract understanding of it from the observed rewards.
|
||||||
|
Therefore, observation space has shape `(11,)` instead of `(12,)` and looks like:
|
||||||
|
|
||||||
|
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
||||||
|
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
||||||
|
| 0 | z-coordinate of the top (height of hopper) | -Inf | Inf | rootz | slide | position (m) |
|
||||||
|
| 1 | angle of the top | -Inf | Inf | rooty | hinge | angle (rad) |
|
||||||
|
| 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) |
|
||||||
|
| 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) |
|
||||||
|
| 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) |
|
||||||
|
| 5 | velocity of the x-coordinate of the top | -Inf | Inf | rootx | slide | velocity (m/s) |
|
||||||
|
| 6 | velocity of the z-coordinate (height) of the top | -Inf | Inf | rootz | slide | velocity (m/s) |
|
||||||
|
| 7 | angular velocity of the angle of the top | -Inf | Inf | rooty | hinge | angular velocity (rad/s) |
|
||||||
|
| 8 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 9 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 10 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) |
|
||||||
|
|
||||||
|
### Rewards
|
||||||
|
The reward consists of three parts:
|
||||||
|
- *alive bonus*: Every timestep that the hopper is alive, it gets a reward of 1,
|
||||||
|
- *reward_forward*: A reward of hopping forward which is measured
|
||||||
|
as *(x-coordinate before action - x-coordinate after action)/dt*. *dt* is
|
||||||
|
the time between actions and is dependeent on the frame_skip parameter
|
||||||
|
(default is 4), where the *dt* for one frame is 0.002 - making the
|
||||||
|
default *dt = 4*0.002 = 0.008*. This reward would be positive if the hopper
|
||||||
|
hops forward (right) desired.
|
||||||
|
- *reward_control*: A negative reward for penalising the hopper if it takes
|
||||||
|
actions that are too large. It is measured as *-coefficient **x**
|
||||||
|
sum(action<sup>2</sup>)* where *coefficient* is a parameter set for the
|
||||||
|
control and has a default value of 0.001
|
||||||
|
|
||||||
|
The total reward returned is ***reward*** *=* *alive bonus + reward_forward + reward_control*
|
||||||
|
|
||||||
|
### Starting State
|
||||||
|
All observations start in state
|
||||||
|
(0.0, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) with a uniform noise
|
||||||
|
in the range of [-0.005, 0.005] added to the values for stochasticity.
|
||||||
|
|
||||||
|
### Episode Termination
|
||||||
|
The episode terminates when any of the following happens:
|
||||||
|
|
||||||
|
1. The episode duration reaches a 1000 timesteps
|
||||||
|
2. Any of the state space values is no longer finite
|
||||||
|
3. The absolute value of any of the state variable indexed (angle and beyond) is greater than 100
|
||||||
|
4. The height of the hopper becomes greater than 0.7 metres (hopper has hopped too high).
|
||||||
|
5. The absolute value of the angle (index 2) is less than 0.2 radians (hopper has fallen down).
|
||||||
|
|
||||||
|
### Arguments
|
||||||
|
|
||||||
|
No additional arguments are currently supported (in v2 and lower), but
|
||||||
|
modifications can be made to the XML file in the assets folder
|
||||||
|
(or by changing the path to a modified XML file in another folder).
|
||||||
|
|
||||||
|
```
|
||||||
|
env = gym.make('Hopper-v2')
|
||||||
|
```
|
||||||
|
|
||||||
|
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||||
|
|
||||||
|
```
|
||||||
|
env = gym.make('Hopper-v3', ctrl_cost_weight=0.1, ....)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Version History
|
||||||
|
|
||||||
|
* v4: all mujoco environments now use the mujoco binidings in mujoco>=2.1.3
|
||||||
|
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
||||||
|
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||||
|
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
||||||
|
* v0: Initial versions release (1.0.0)
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
xml_file="hopper.xml",
|
||||||
|
forward_reward_weight=1.0,
|
||||||
|
ctrl_cost_weight=1e-3,
|
||||||
|
healthy_reward=1.0,
|
||||||
|
terminate_when_unhealthy=True,
|
||||||
|
healthy_state_range=(-100.0, 100.0),
|
||||||
|
healthy_z_range=(0.7, float("inf")),
|
||||||
|
healthy_angle_range=(-0.2, 0.2),
|
||||||
|
reset_noise_scale=5e-3,
|
||||||
|
exclude_current_positions_from_observation=True,
|
||||||
|
):
|
||||||
|
utils.EzPickle.__init__(**locals())
|
||||||
|
|
||||||
|
self._forward_reward_weight = forward_reward_weight
|
||||||
|
|
||||||
|
self._ctrl_cost_weight = ctrl_cost_weight
|
||||||
|
|
||||||
|
self._healthy_reward = healthy_reward
|
||||||
|
self._terminate_when_unhealthy = terminate_when_unhealthy
|
||||||
|
|
||||||
|
self._healthy_state_range = healthy_state_range
|
||||||
|
self._healthy_z_range = healthy_z_range
|
||||||
|
self._healthy_angle_range = healthy_angle_range
|
||||||
|
|
||||||
|
self._reset_noise_scale = reset_noise_scale
|
||||||
|
|
||||||
|
self._exclude_current_positions_from_observation = (
|
||||||
|
exclude_current_positions_from_observation
|
||||||
|
)
|
||||||
|
|
||||||
|
mujoco_env.MujocoEnv.__init__(self, xml_file, 4)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def healthy_reward(self):
|
||||||
|
return (
|
||||||
|
float(self.is_healthy or self._terminate_when_unhealthy)
|
||||||
|
* self._healthy_reward
|
||||||
|
)
|
||||||
|
|
||||||
|
def control_cost(self, action):
|
||||||
|
control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
|
||||||
|
return control_cost
|
||||||
|
|
||||||
|
@property
|
||||||
|
def is_healthy(self):
|
||||||
|
z, angle = self.data.qpos[1:3]
|
||||||
|
state = self.state_vector()[2:]
|
||||||
|
|
||||||
|
min_state, max_state = self._healthy_state_range
|
||||||
|
min_z, max_z = self._healthy_z_range
|
||||||
|
min_angle, max_angle = self._healthy_angle_range
|
||||||
|
|
||||||
|
healthy_state = np.all(np.logical_and(min_state < state, state < max_state))
|
||||||
|
healthy_z = min_z < z < max_z
|
||||||
|
healthy_angle = min_angle < angle < max_angle
|
||||||
|
|
||||||
|
is_healthy = all((healthy_state, healthy_z, healthy_angle))
|
||||||
|
|
||||||
|
return is_healthy
|
||||||
|
|
||||||
|
@property
|
||||||
|
def done(self):
|
||||||
|
done = not self.is_healthy if self._terminate_when_unhealthy else False
|
||||||
|
return done
|
||||||
|
|
||||||
|
def _get_obs(self):
|
||||||
|
position = self.data.qpos.flat.copy()
|
||||||
|
velocity = np.clip(self.data.qvel.flat.copy(), -10, 10)
|
||||||
|
|
||||||
|
if self._exclude_current_positions_from_observation:
|
||||||
|
position = position[1:]
|
||||||
|
|
||||||
|
observation = np.concatenate((position, velocity)).ravel()
|
||||||
|
return observation
|
||||||
|
|
||||||
|
def step(self, action):
|
||||||
|
x_position_before = self.data.qpos[0]
|
||||||
|
self.do_simulation(action, self.frame_skip)
|
||||||
|
x_position_after = self.data.qpos[0]
|
||||||
|
x_velocity = (x_position_after - x_position_before) / self.dt
|
||||||
|
|
||||||
|
ctrl_cost = self.control_cost(action)
|
||||||
|
|
||||||
|
forward_reward = self._forward_reward_weight * x_velocity
|
||||||
|
healthy_reward = self.healthy_reward
|
||||||
|
|
||||||
|
rewards = forward_reward + healthy_reward
|
||||||
|
costs = ctrl_cost
|
||||||
|
|
||||||
|
observation = self._get_obs()
|
||||||
|
reward = rewards - costs
|
||||||
|
done = self.done
|
||||||
|
info = {
|
||||||
|
"x_position": x_position_after,
|
||||||
|
"x_velocity": x_velocity,
|
||||||
|
}
|
||||||
|
|
||||||
|
return observation, reward, done, info
|
||||||
|
|
||||||
|
def reset_model(self):
|
||||||
|
noise_low = -self._reset_noise_scale
|
||||||
|
noise_high = self._reset_noise_scale
|
||||||
|
|
||||||
|
qpos = self.init_qpos + self.np_random.uniform(
|
||||||
|
low=noise_low, high=noise_high, size=self.model.nq
|
||||||
|
)
|
||||||
|
qvel = self.init_qvel + self.np_random.uniform(
|
||||||
|
low=noise_low, high=noise_high, size=self.model.nv
|
||||||
|
)
|
||||||
|
|
||||||
|
self.set_state(qpos, qvel)
|
||||||
|
|
||||||
|
observation = self._get_obs()
|
||||||
|
return observation
|
||||||
|
|
||||||
|
def viewer_setup(self):
|
||||||
|
for key, value in DEFAULT_CAMERA_CONFIG.items():
|
||||||
|
if isinstance(value, np.ndarray):
|
||||||
|
getattr(self.viewer.cam, key)[:] = value
|
||||||
|
else:
|
||||||
|
setattr(self.viewer.cam, key, value)
|
@@ -12,7 +12,9 @@ def mass_center(model, sim):
|
|||||||
|
|
||||||
class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
mujoco_env.MujocoEnv.__init__(self, "humanoid.xml", 5)
|
mujoco_env.MujocoEnv.__init__(
|
||||||
|
self, "humanoid.xml", 5, mujoco_bindings="mujoco_py"
|
||||||
|
)
|
||||||
utils.EzPickle.__init__(self)
|
utils.EzPickle.__init__(self)
|
||||||
|
|
||||||
def _get_obs(self):
|
def _get_obs(self):
|
||||||
|
@@ -18,198 +18,6 @@ def mass_center(model, sim):
|
|||||||
|
|
||||||
|
|
||||||
class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
"""
|
|
||||||
### Description
|
|
||||||
|
|
||||||
This environment is based on the environment introduced by Tassa, Erez and Todorov
|
|
||||||
in ["Synthesis and stabilization of complex behaviors through online trajectory optimization"](https://ieeexplore.ieee.org/document/6386025).
|
|
||||||
The 3D bipedal robot is designed to simulate a human. It has a torso (abdomen) with a pair of
|
|
||||||
legs and arms. The legs each consist of two links, and so the arms (representing the knees and
|
|
||||||
elbows respectively). The goal of the environment is to walk forward as fast as possible without falling over.
|
|
||||||
|
|
||||||
### Action Space
|
|
||||||
The action space is a `Box(-1, 1, (17,), float32)`. An action represents the torques applied at the hinge joints.
|
|
||||||
|
|
||||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
|
||||||
|-----|----------------------|---------------|----------------|---------------------------------------|-------|------|
|
|
||||||
| 0 | Torque applied on the hinge in the y-coordinate of the abdomen | -0.4 | 0.4 | hip_1 (front_left_leg) | hinge | torque (N m) |
|
|
||||||
| 1 | Torque applied on the hinge in the z-coordinate of the abdomen | -0.4 | 0.4 | angle_1 (front_left_leg) | hinge | torque (N m) |
|
|
||||||
| 2 | Torque applied on the hinge in the x-coordinate of the abdomen | -0.4 | 0.4 | hip_2 (front_right_leg) | hinge | torque (N m) |
|
|
||||||
| 3 | Torque applied on the rotor between torso/abdomen and the right hip (x-coordinate) | -0.4 | 0.4 | right_hip_x (right_thigh) | hinge | torque (N m) |
|
|
||||||
| 4 | Torque applied on the rotor between torso/abdomen and the right hip (z-coordinate) | -0.4 | 0.4 | right_hip_z (right_thigh) | hinge | torque (N m) |
|
|
||||||
| 5 | Torque applied on the rotor between torso/abdomen and the right hip (y-coordinate) | -0.4 | 0.4 | right_hip_y (right_thigh) | hinge | torque (N m) |
|
|
||||||
| 6 | Torque applied on the rotor between the right hip/thigh and the right shin | -0.4 | 0.4 | right_knee | hinge | torque (N m) |
|
|
||||||
| 7 | Torque applied on the rotor between torso/abdomen and the left hip (x-coordinate) | -0.4 | 0.4 | left_hip_x (left_thigh) | hinge | torque (N m) |
|
|
||||||
| 8 | Torque applied on the rotor between torso/abdomen and the left hip (z-coordinate) | -0.4 | 0.4 | left_hip_z (left_thigh) | hinge | torque (N m) |
|
|
||||||
| 9 | Torque applied on the rotor between torso/abdomen and the left hip (y-coordinate) | -0.4 | 0.4 | left_hip_y (left_thigh) | hinge | torque (N m) |
|
|
||||||
| 10 | Torque applied on the rotor between the left hip/thigh and the left shin | -0.4 | 0.4 | left_knee | hinge | torque (N m) |
|
|
||||||
| 11 | Torque applied on the rotor between the torso and right upper arm (coordinate -1) | -0.4 | 0.4 | right_shoulder1 | hinge | torque (N m) |
|
|
||||||
| 12 | Torque applied on the rotor between the torso and right upper arm (coordinate -2) | -0.4 | 0.4 | right_shoulder2 | hinge | torque (N m) |
|
|
||||||
| 13 | Torque applied on the rotor between the right upper arm and right lower arm | -0.4 | 0.4 | right_elbow | hinge | torque (N m) |
|
|
||||||
| 14 | Torque applied on the rotor between the torso and left upper arm (coordinate -1) | -0.4 | 0.4 | left_shoulder1 | hinge | torque (N m) |
|
|
||||||
| 15 | Torque applied on the rotor between the torso and left upper arm (coordinate -2) | -0.4 | 0.4 | left_shoulder2 | hinge | torque (N m) |
|
|
||||||
| 16 | Torque applied on the rotor between the left upper arm and left lower arm | -0.4 | 0.4 | left_elbow | hinge | torque (N m) |
|
|
||||||
|
|
||||||
### Observation Space
|
|
||||||
|
|
||||||
Observations consist of positional values of different body parts of the Humanoid,
|
|
||||||
followed by the velocities of those individual parts (their derivatives) with all the
|
|
||||||
positions ordered before all the velocities.
|
|
||||||
|
|
||||||
By default, observations do not include the x- and y-coordinates of the torso. These may
|
|
||||||
be included by passing `exclude_current_positions_from_observation=False` during construction.
|
|
||||||
In that case, the observation space will have 378 dimensions where the first two dimensions
|
|
||||||
represent the x- and y-coordinates of the torso.
|
|
||||||
Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates
|
|
||||||
will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively.
|
|
||||||
|
|
||||||
However, by default, the observation is a `ndarray` with shape `(376,)` where the elements correspond to the following:
|
|
||||||
|
|
||||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|
|
||||||
|-----|---------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
|
|
||||||
| 0 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) |
|
|
||||||
| 1 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
|
||||||
| 2 | y-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
|
||||||
| 3 | z-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
|
||||||
| 4 | w-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
|
||||||
| 5 | z-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | angle (rad) |
|
|
||||||
| 6 | y-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | angle (rad) |
|
|
||||||
| 7 | x-angle of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angle (rad) |
|
|
||||||
| 8 | x-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | angle (rad) |
|
|
||||||
| 9 | z-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | angle (rad) |
|
|
||||||
| 10 | y-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | angle (rad) |
|
|
||||||
| 11 | angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | angle (rad) |
|
|
||||||
| 12 | x-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | angle (rad) |
|
|
||||||
| 13 | z-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | angle (rad) |
|
|
||||||
| 14 | y-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | angle (rad) |
|
|
||||||
| 15 | angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | angle (rad) |
|
|
||||||
| 16 | coordinate-1 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | angle (rad) |
|
|
||||||
| 17 | coordinate-2 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | angle (rad) |
|
|
||||||
| 18 | angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | angle (rad) |
|
|
||||||
| 19 | coordinate-1 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | angle (rad) |
|
|
||||||
| 20 | coordinate-2 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | angle (rad) |
|
|
||||||
| 21 | angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angle (rad) |
|
|
||||||
| 22 | x-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
|
||||||
| 23 | y-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
|
||||||
| 24 | z-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
|
||||||
| 25 | x-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
|
|
||||||
| 26 | y-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
|
|
||||||
| 27 | z-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
|
|
||||||
| 28 | z-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | anglular velocity (rad/s) |
|
|
||||||
| 29 | y-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | anglular velocity (rad/s) |
|
|
||||||
| 30 | x-coordinate of angular velocity of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angular velocity (rad/s) |
|
|
||||||
| 31 | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | anglular velocity (rad/s) |
|
|
||||||
| 32 | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | anglular velocity (rad/s) |
|
|
||||||
| 33 | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | anglular velocity (rad/s) |
|
|
||||||
| 34 | angular velocity of the angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | anglular velocity (rad/s) |
|
|
||||||
| 35 | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | anglular velocity (rad/s) |
|
|
||||||
| 36 | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | anglular velocity (rad/s) |
|
|
||||||
| 37 | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | anglular velocity (rad/s) |
|
|
||||||
| 38 | angular velocity of the angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | anglular velocity (rad/s) |
|
|
||||||
| 39 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | anglular velocity (rad/s) |
|
|
||||||
| 40 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | anglular velocity (rad/s) |
|
|
||||||
| 41 | angular velocity of the angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | anglular velocity (rad/s) |
|
|
||||||
| 42 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | anglular velocity (rad/s) |
|
|
||||||
| 43 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | anglular velocity (rad/s) |
|
|
||||||
| 44 | angular velocity of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angular velocity (rad/s) |
|
|
||||||
|
|
||||||
Additionally, after all the positional and velocity based values in the table,
|
|
||||||
the observation contains (in order):
|
|
||||||
- *cinert:* Mass and inertia of a single rigid body relative to the center of mass
|
|
||||||
(this is an intermediate result of transition). It has shape 14*10 (*nbody * 10*)
|
|
||||||
and hence adds to another 140 elements in the state space.
|
|
||||||
- *cvel:* Center of mass based velocity. It has shape 14 * 6 (*nbody * 6*) and hence
|
|
||||||
adds another 84 elements in the state space
|
|
||||||
- *qfrc_actuator:* Constraint force generated as the actuator force. This has shape
|
|
||||||
`(23,)` *(nv * 1)* and hence adds another 23 elements to the state space.
|
|
||||||
- *cfrc_ext:* This is the center of mass based external force on the body. It has shape
|
|
||||||
14 * 6 (*nbody * 6*) and hence adds to another 84 elements in the state space.
|
|
||||||
where *nbody* stands for the number of bodies in the robot and *nv* stands for the
|
|
||||||
number of degrees of freedom (*= dim(qvel)*)
|
|
||||||
|
|
||||||
The (x,y,z) coordinates are translational DOFs while the orientations are rotational
|
|
||||||
DOFs expressed as quaternions. One can read more about free joints on the
|
|
||||||
[Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html).
|
|
||||||
|
|
||||||
**Note:** There have been reported issues that using a Mujoco-Py version > 2.0
|
|
||||||
results in the contact forces always being 0. As such we recommend to use a Mujoco-Py
|
|
||||||
version < 2.0 when using the Humanoid environment if you would like to report results
|
|
||||||
with contact forces (if contact forces are not used in your experiments, you can use
|
|
||||||
version > 2.0).
|
|
||||||
|
|
||||||
### Rewards
|
|
||||||
The reward consists of three parts:
|
|
||||||
- *healthy_reward*: Every timestep that the humanoid is alive (see section Episode Termination for definition), it gets a reward of fixed value `healthy_reward`
|
|
||||||
- *forward_reward*: A reward of walking forward which is measured as *`forward_reward_weight` *
|
|
||||||
(average center of mass before action - average center of mass after action)/dt*.
|
|
||||||
*dt* is the time between actions and is dependent on the frame_skip parameter
|
|
||||||
(default is 5), where the frametime is 0.003 - making the default *dt = 5 * 0.003 = 0.015*.
|
|
||||||
This reward would be positive if the humanoid walks forward (in positive x-direction). The calculation
|
|
||||||
for the center of mass is defined in the `.py` file for the Humanoid.
|
|
||||||
- *ctrl_cost*: A negative reward for penalising the humanoid if it has too
|
|
||||||
large of a control force. If there are *nu* actuators/controls, then the control has
|
|
||||||
shape `nu x 1`. It is measured as *`ctrl_cost_weight` * sum(control<sup>2</sup>)*.
|
|
||||||
- *contact_cost*: A negative reward for penalising the humanoid if the external
|
|
||||||
contact force is too large. It is calculated by clipping
|
|
||||||
*`contact_cost_weight` * sum(external contact force<sup>2</sup>)* to the interval specified by `contact_cost_range`.
|
|
||||||
|
|
||||||
The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost - contact_cost* and `info` will also contain the individual reward terms
|
|
||||||
|
|
||||||
### Starting State
|
|
||||||
All observations start in state
|
|
||||||
(0.0, 0.0, 1.4, 1.0, 0.0 ... 0.0) with a uniform noise in the range
|
|
||||||
of [-`reset_noise_scale`, `reset_noise_scale`] added to the positional and velocity values (values in the table)
|
|
||||||
for stochasticity. Note that the initial z coordinate is intentionally
|
|
||||||
selected to be high, thereby indicating a standing up humanoid. The initial
|
|
||||||
orientation is designed to make it face forward as well.
|
|
||||||
|
|
||||||
### Episode Termination
|
|
||||||
The humanoid is said to be unhealthy if the z-position of the torso is no longer contained in the
|
|
||||||
closed interval specified by the argument `healthy_z_range`.
|
|
||||||
|
|
||||||
If `terminate_when_unhealthy=True` is passed during construction (which is the default),
|
|
||||||
the episode terminates when any of the following happens:
|
|
||||||
|
|
||||||
1. The episode duration reaches a 1000 timesteps
|
|
||||||
2. The humanoid is unhealthy
|
|
||||||
|
|
||||||
If `terminate_when_unhealthy=False` is passed, the episode is terminated only when 1000 timesteps are exceeded.
|
|
||||||
|
|
||||||
|
|
||||||
### Arguments
|
|
||||||
|
|
||||||
No additional arguments are currently supported in v2 and lower.
|
|
||||||
|
|
||||||
```
|
|
||||||
env = gym.make('Humanoid-v2')
|
|
||||||
```
|
|
||||||
|
|
||||||
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
|
||||||
|
|
||||||
```
|
|
||||||
env = gym.make('Humanoid-v3', ctrl_cost_weight=0.1, ....)
|
|
||||||
```
|
|
||||||
|
|
||||||
| Parameter | Type | Default |Description |
|
|
||||||
|-------------------------|------------|-------------------|-------------------------------|
|
|
||||||
| `xml_file` | **str** | `"humanoid.xml"` | Path to a MuJoCo model |
|
|
||||||
| `forward_reward_weight` | **float** | `1.25` | Weight for *forward_reward* term (see section on reward) |
|
|
||||||
| `ctrl_cost_weight` | **float** | `0.1` | Weight for *ctrl_cost* term (see section on reward) |
|
|
||||||
| `contact_cost_weight` | **float** | `5e-7` | Weight for *contact_cost* term (see section on reward) |
|
|
||||||
| `healthy_reward` | **float** | `5.0` | Constant reward given if the humanoid is "healthy" after timestep |
|
|
||||||
| `terminate_when_unhealthy` | **bool**| `True` | If true, issue a done signal if the z-coordinate of the torso is no longer in the `healthy_z_range` |
|
|
||||||
| `healthy_z_range` | **tuple** | `(1.0, 2.0)` | The humanoid is considered healthy if the z-coordinate of the torso is in this range |
|
|
||||||
| `reset_noise_scale` | **float** | `1e-2` | Scale of random perturbations of initial position and velocity (see section on Starting State) |
|
|
||||||
| `exclude_current_positions_from_observation`| **bool** | `True`| Whether or not to omit the x- and y-coordinates from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies |
|
|
||||||
|
|
||||||
### Version History
|
|
||||||
|
|
||||||
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
|
||||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
|
||||||
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
|
||||||
* v0: Initial versions release (1.0.0)
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
xml_file="humanoid.xml",
|
xml_file="humanoid.xml",
|
||||||
@@ -239,7 +47,7 @@ class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
|||||||
exclude_current_positions_from_observation
|
exclude_current_positions_from_observation
|
||||||
)
|
)
|
||||||
|
|
||||||
mujoco_env.MujocoEnv.__init__(self, xml_file, 5)
|
mujoco_env.MujocoEnv.__init__(self, xml_file, 5, mujoco_bindings="mujoco_py")
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def healthy_reward(self):
|
def healthy_reward(self):
|
||||||
|
331
gym/envs/mujoco/humanoid_v4.py
Normal file
331
gym/envs/mujoco/humanoid_v4.py
Normal file
@@ -0,0 +1,331 @@
|
|||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from gym import utils
|
||||||
|
from gym.envs.mujoco import mujoco_env
|
||||||
|
|
||||||
|
# Camera settings applied to the viewer in `viewer_setup`: track body id 1
# from a distance of 4.0, look at the point (0, 0, 2), with elevation -20.0.
DEFAULT_CAMERA_CONFIG = {
    "trackbodyid": 1,
    "distance": 4.0,
    "lookat": np.array((0.0, 0.0, 2.0)),
    "elevation": -20.0,
}
|
||||||
|
|
||||||
|
|
||||||
|
def mass_center(model, data):
    """Return the (x, y) coordinates of the whole-model center of mass.

    Computes the body-mass-weighted average of the per-body center-of-mass
    positions (`data.xipos`) and returns a copy of its first two components.
    """
    body_mass = np.expand_dims(model.body_mass, axis=1)
    com = np.sum(body_mass * data.xipos, axis=0) / np.sum(body_mass)
    return com[0:2].copy()
|
||||||
|
|
||||||
|
|
||||||
|
class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
|
"""
|
||||||
|
### Description
|
||||||
|
|
||||||
|
This environment is based on the environment introduced by Tassa, Erez and Todorov
|
||||||
|
in ["Synthesis and stabilization of complex behaviors through online trajectory optimization"](https://ieeexplore.ieee.org/document/6386025).
|
||||||
|
The 3D bipedal robot is designed to simulate a human. It has a torso (abdomen) with a pair of
|
||||||
|
legs and arms. The legs each consist of two links, and so the arms (representing the knees and
|
||||||
|
elbows respectively). The goal of the environment is to walk forward as fast as possible without falling over.
|
||||||
|
|
||||||
|
### Action Space
|
||||||
|
The agent take a 17-element vector for actions.
|
||||||
|
The action space is a continuous `(action, ...)` all in `[-1, 1]`, where `action`
|
||||||
|
represents the numerical torques applied at the hinge joints.
|
||||||
|
|
||||||
|
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||||
|
|-----|----------------------|---------------|----------------|---------------------------------------|-------|------|
|
||||||
|
| 0 | Torque applied on the hinge in the y-coordinate of the abdomen | -0.4 | 0.4 | hip_1 (front_left_leg) | hinge | torque (N m) |
|
||||||
|
| 1 | Torque applied on the hinge in the z-coordinate of the abdomen | -0.4 | 0.4 | angle_1 (front_left_leg) | hinge | torque (N m) |
|
||||||
|
| 2 | Torque applied on the hinge in the x-coordinate of the abdomen | -0.4 | 0.4 | hip_2 (front_right_leg) | hinge | torque (N m) |
|
||||||
|
| 3 | Torque applied on the rotor between torso/abdomen and the right hip (x-coordinate) | -0.4 | 0.4 | right_hip_x (right_thigh) | hinge | torque (N m) |
|
||||||
|
| 4 | Torque applied on the rotor between torso/abdomen and the right hip (z-coordinate) | -0.4 | 0.4 | right_hip_z (right_thigh) | hinge | torque (N m) |
|
||||||
|
| 5 | Torque applied on the rotor between torso/abdomen and the right hip (y-coordinate) | -0.4 | 0.4 | right_hip_y (right_thigh) | hinge | torque (N m) |
|
||||||
|
| 6 | Torque applied on the rotor between the right hip/thigh and the right shin | -0.4 | 0.4 | right_knee | hinge | torque (N m) |
|
||||||
|
| 7 | Torque applied on the rotor between torso/abdomen and the left hip (x-coordinate) | -0.4 | 0.4 | left_hip_x (left_thigh) | hinge | torque (N m) |
|
||||||
|
| 8 | Torque applied on the rotor between torso/abdomen and the left hip (z-coordinate) | -0.4 | 0.4 | left_hip_z (left_thigh) | hinge | torque (N m) |
|
||||||
|
| 9 | Torque applied on the rotor between torso/abdomen and the left hip (y-coordinate) | -0.4 | 0.4 | left_hip_y (left_thigh) | hinge | torque (N m) |
|
||||||
|
| 10 | Torque applied on the rotor between the left hip/thigh and the left shin | -0.4 | 0.4 | left_knee | hinge | torque (N m) |
|
||||||
|
| 11 | Torque applied on the rotor between the torso and right upper arm (coordinate -1) | -0.4 | 0.4 | right_shoulder1 | hinge | torque (N m) |
|
||||||
|
| 12 | Torque applied on the rotor between the torso and right upper arm (coordinate -2) | -0.4 | 0.4 | right_shoulder2 | hinge | torque (N m) |
|
||||||
|
| 13 | Torque applied on the rotor between the right upper arm and right lower arm | -0.4 | 0.4 | right_elbow | hinge | torque (N m) |
|
||||||
|
| 14 | Torque applied on the rotor between the torso and left upper arm (coordinate -1) | -0.4 | 0.4 | left_shoulder1 | hinge | torque (N m) |
|
||||||
|
| 15 | Torque applied on the rotor between the torso and left upper arm (coordinate -2) | -0.4 | 0.4 | left_shoulder2 | hinge | torque (N m) |
|
||||||
|
| 16 | Torque applied on the rotor between the left upper arm and left lower arm | -0.4 | 0.4 | left_elbow | hinge | torque (N m) |
|
||||||
|
|
||||||
|
### Observation Space
|
||||||
|
|
||||||
|
The state space consists of positional values of different body parts of the Humanoid,
|
||||||
|
followed by the velocities of those individual parts (their derivatives) with all the
|
||||||
|
positions ordered before all the velocities.
|
||||||
|
|
||||||
|
The observation is a `ndarray` with shape `(376,)` where the elements correspond to the following:
|
||||||
|
|
||||||
|
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|
||||||
|
|-----|---------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
|
||||||
|
| 0 | x-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) |
|
||||||
|
| 1 | y-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) |
|
||||||
|
| 2 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) |
|
||||||
|
| 3 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
||||||
|
| 4 | y-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
||||||
|
| 5 | z-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
||||||
|
| 6 | w-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
||||||
|
| 7 | z-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | angle (rad) |
|
||||||
|
| 8 | y-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | angle (rad) |
|
||||||
|
| 9 | x-angle of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angle (rad) |
|
||||||
|
| 10 | x-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | angle (rad) |
|
||||||
|
| 11 | z-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | angle (rad) |
|
||||||
|
| 12 | y-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | angle (rad) |
|
||||||
|
| 13 | angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | angle (rad) |
|
||||||
|
| 14 | x-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | angle (rad) |
|
||||||
|
| 15 | z-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | angle (rad) |
|
||||||
|
| 16 | y-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | angle (rad) |
|
||||||
|
| 17 | angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | angle (rad) |
|
||||||
|
| 18 | coordinate-1 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | angle (rad) |
|
||||||
|
| 19 | coordinate-2 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | angle (rad) |
|
||||||
|
| 20 | angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | angle (rad) |
|
||||||
|
| 21 | coordinate-1 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | angle (rad) |
|
||||||
|
| 22 | coordinate-2 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | angle (rad) |
|
||||||
|
| 23 | angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angle (rad) |
|
||||||
|
| 24 | x-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
||||||
|
| 25 | y-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
||||||
|
| 26 | z-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
||||||
|
| 27 | x-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
|
||||||
|
| 28 | y-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
|
||||||
|
| 29 | z-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
|
||||||
|
| 30 | z-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | anglular velocity (rad/s) |
|
||||||
|
| 31 | y-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | anglular velocity (rad/s) |
|
||||||
|
| 32 | x-coordinate of angular velocity of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angular velocity (rad/s) |
|
||||||
|
| 33 | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | anglular velocity (rad/s) |
|
||||||
|
| 34 | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | anglular velocity (rad/s) |
|
||||||
|
| 35 | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | anglular velocity (rad/s) |
|
||||||
|
| 36 | angular velocity of the angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | anglular velocity (rad/s) |
|
||||||
|
| 37 | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | anglular velocity (rad/s) |
|
||||||
|
| 38 | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | anglular velocity (rad/s) |
|
||||||
|
| 39 | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | anglular velocity (rad/s) |
|
||||||
|
| 40 | angular velocity of the angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | anglular velocity (rad/s) |
|
||||||
|
| 41 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | anglular velocity (rad/s) |
|
||||||
|
| 42 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | anglular velocity (rad/s) |
|
||||||
|
| 43 | angular velocity of the angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | anglular velocity (rad/s) |
|
||||||
|
| 44 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | anglular velocity (rad/s) |
|
||||||
|
| 45 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | anglular velocity (rad/s) |
|
||||||
|
| 46 | angular velocity of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angular velocity (rad/s) |
|
||||||
|
|
||||||
|
Additionally, after all the positional and velocity based values in the table,
|
||||||
|
the state_space consists of (in order):
|
||||||
|
- *cinert:* Mass and inertia of a single rigid body relative to the center of mass
|
||||||
|
(this is an intermediate result of transition). It has shape 14*10 (*nbody * 10*)
|
||||||
|
and hence adds to another 140 elements in the state space.
|
||||||
|
- *cvel:* Center of mass based velocity. It has shape 14 * 6 (*nbody * 6*) and hence
|
||||||
|
adds another 84 elements in the state space
|
||||||
|
- *qfrc_actuator:* Constraint force generated as the actuator force. This has shape
|
||||||
|
`(23,)` *(nv * 1)* and hence adds another 23 elements to the state space.
|
||||||
|
- *cfrc_ext:* This is the center of mass based external force on the body. It has shape
|
||||||
|
14 * 6 (*nbody * 6*) and hence adds to another 84 elements in the state space.
|
||||||
|
where *nbody* stands for the number of bodies in the robot and *nv* stands for the
|
||||||
|
number of degrees of freedom (*= dim(qvel)*)
|
||||||
|
|
||||||
|
The (x,y,z) coordinates are translational DOFs while the orientations are rotational
|
||||||
|
DOFs expressed as quaternions. One can read more about free joints on the
|
||||||
|
[Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html).
|
||||||
|
|
||||||
|
**Note:** There are 47 elements in the table above - giving rise to `(378,)`
|
||||||
|
elements in the state space. In practice (and Gym implementation), the first two
|
||||||
|
positional elements are omitted from the state space since the reward function is
|
||||||
|
calculated based on the x-coordinate value. This value is hidden from the algorithm,
|
||||||
|
which in turn has to develop an abstract understanding of it from the observed rewards.
|
||||||
|
Therefore, observation space has shape `(376,)` instead of `(378,)` and the table should
|
||||||
|
not have the first two rows.
|
||||||
|
|
||||||
|
**Note:** Humanoid-v4 environment no longer has the following contact forces issue.
|
||||||
|
If using previous Humanoid versions from v4, there have been reported issues that using
|
||||||
|
a Mujoco-Py version > 2.0 results in the contact forces always being 0. As such we recommend
|
||||||
|
to use a Mujoco-Py version < 2.0 when using the Humanoid environment if you would like to report
|
||||||
|
results with contact forces (if contact forces are not used in your experiments, you can use
|
||||||
|
version > 2.0).
|
||||||
|
|
||||||
|
### Rewards
|
||||||
|
The reward consists of three parts:
|
||||||
|
- *alive_bonus*: Every timestep that the humanoid is alive, it gets a reward of 5.
|
||||||
|
- *lin_vel_cost*: A reward of walking forward which is measured as *1.25 *
|
||||||
|
(average center of mass before action - average center of mass after action)/dt*.
|
||||||
|
*dt* is the time between actions and is dependent on the frame_skip parameter
|
||||||
|
(default is 5), where the *dt* for one frame is 0.003 - making the default *dt = 5 * 0.003 = 0.015*.
|
||||||
|
This reward would be positive if the humanoid walks forward (in the positive x-direction) as desired. The calculation
|
||||||
|
for the center of mass is defined in the `.py` file for the Humanoid.
|
||||||
|
- *quad_ctrl_cost*: A negative reward for penalising the humanoid if it has too
|
||||||
|
large of a control force. If there are *nu* actuators/controls, then the control has
|
||||||
|
shape `nu x 1`. It is measured as *0.1 **x** sum(control<sup>2</sup>)*.
|
||||||
|
- *quad_impact_cost*: A negative reward for penalising the humanoid if the external
|
||||||
|
contact force is too large. It is calculated as
|
||||||
|
*min(0.5 * 0.000001 * sum(external contact force<sup>2</sup>), 10)*.
|
||||||
|
|
||||||
|
The total reward returned is ***reward*** *=* *alive_bonus + lin_vel_cost - quad_ctrl_cost - quad_impact_cost*
|
||||||
|
|
||||||
|
### Starting State
|
||||||
|
All observations start in state
|
||||||
|
(0.0, 0.0, 1.4, 1.0, 0.0 ... 0.0) with a uniform noise in the range
|
||||||
|
of [-0.01, 0.01] added to the positional and velocity values (values in the table)
|
||||||
|
for stochasticity. Note that the initial z coordinate is intentionally
|
||||||
|
selected to be high, thereby indicating a standing up humanoid. The initial
|
||||||
|
orientation is designed to make it face forward as well.
|
||||||
|
|
||||||
|
### Episode Termination
|
||||||
|
The episode terminates when any of the following happens:
|
||||||
|
|
||||||
|
1. The episode duration reaches a 1000 timesteps
|
||||||
|
2. Any of the state space values is no longer finite
|
||||||
|
3. The z-coordinate of the torso (index 0 in state space OR index 2 in the table) is **not** in the range `[1.0, 2.0]` (the humanoid has fallen or is about to fall beyond recovery).
|
||||||
|
|
||||||
|
### Arguments
|
||||||
|
|
||||||
|
No additional arguments are currently supported (in v2 and lower), but
|
||||||
|
modifications can be made to the XML file in the assets folder (or by changing
|
||||||
|
the path to a modified XML file in another folder).
|
||||||
|
|
||||||
|
```
|
||||||
|
env = gym.make('Humanoid-v2')
|
||||||
|
```
|
||||||
|
|
||||||
|
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||||
|
|
||||||
|
```
|
||||||
|
env = gym.make('Humanoid-v3', ctrl_cost_weight=0.1, ...)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Version History
|
||||||
|
|
||||||
|
* v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3
|
||||||
|
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
||||||
|
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||||
|
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
||||||
|
* v0: Initial versions release (1.0.0)
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
    self,
    xml_file="humanoid.xml",
    forward_reward_weight=1.25,
    ctrl_cost_weight=0.1,
    healthy_reward=5.0,
    terminate_when_unhealthy=True,
    healthy_z_range=(1.0, 2.0),
    reset_noise_scale=1e-2,
    exclude_current_positions_from_observation=True,
):
    """Configure the Humanoid environment.

    Args:
        xml_file: MuJoCo model to load.
        forward_reward_weight: scale applied to the forward (x) velocity reward.
        ctrl_cost_weight: scale applied to the quadratic control penalty.
        healthy_reward: alive bonus granted each healthy step.
        terminate_when_unhealthy: end the episode when the torso leaves
            the healthy height range.
        healthy_z_range: (min, max) torso z-height considered healthy.
        reset_noise_scale: half-width of the uniform noise added on reset.
        exclude_current_positions_from_observation: drop the absolute x/y
            torso position from the observation (position-agnostic policies).
    """
    # EzPickle records the constructor arguments (captured via locals())
    # so the environment can be pickled and reconstructed identically.
    utils.EzPickle.__init__(**locals())

    # Reward shaping knobs.
    self._forward_reward_weight = forward_reward_weight
    self._ctrl_cost_weight = ctrl_cost_weight
    self._healthy_reward = healthy_reward

    # Termination / health criteria.
    self._terminate_when_unhealthy = terminate_when_unhealthy
    self._healthy_z_range = healthy_z_range

    self._reset_noise_scale = reset_noise_scale
    self._exclude_current_positions_from_observation = (
        exclude_current_positions_from_observation
    )

    # frame_skip=5: each env step advances the simulation 5 frames.
    mujoco_env.MujocoEnv.__init__(self, xml_file, 5)
|
||||||
|
|
||||||
|
@property
def healthy_reward(self):
    """Alive bonus for the current step.

    The bonus is paid whenever the humanoid is healthy, and also when
    ``terminate_when_unhealthy`` is disabled (the episode keeps running,
    so the bonus is granted unconditionally in that mode).
    """
    is_rewarded = self.is_healthy or self._terminate_when_unhealthy
    return self._healthy_reward * float(is_rewarded)
|
||||||
|
|
||||||
|
def control_cost(self, action):
    """Quadratic penalty on the actuator controls.

    NOTE: ``action`` is intentionally unused here; the cost is computed
    from the simulator's current control vector (``self.data.ctrl``),
    i.e. the torques actually applied after ``do_simulation``.
    """
    squared_controls = np.square(self.data.ctrl)
    return self._ctrl_cost_weight * np.sum(squared_controls)
|
||||||
|
|
||||||
|
@property
def is_healthy(self):
    """True while the torso height (``qpos[2]``) lies strictly inside the healthy z-range."""
    lower, upper = self._healthy_z_range
    torso_height = self.data.qpos[2]
    return lower < torso_height < upper
|
||||||
|
|
||||||
|
@property
def done(self):
    """Episode-termination flag.

    An unhealthy state ends the episode only when
    ``terminate_when_unhealthy`` is enabled; otherwise the episode never
    terminates from this check.
    """
    if not self._terminate_when_unhealthy:
        return False
    return not self.is_healthy
|
||||||
|
|
||||||
|
def _get_obs(self):
|
||||||
|
position = self.data.qpos.flat.copy()
|
||||||
|
velocity = self.data.qvel.flat.copy()
|
||||||
|
|
||||||
|
com_inertia = self.data.cinert.flat.copy()
|
||||||
|
com_velocity = self.data.cvel.flat.copy()
|
||||||
|
|
||||||
|
actuator_forces = self.data.qfrc_actuator.flat.copy()
|
||||||
|
external_contact_forces = self.data.cfrc_ext.flat.copy()
|
||||||
|
|
||||||
|
if self._exclude_current_positions_from_observation:
|
||||||
|
position = position[2:]
|
||||||
|
|
||||||
|
return np.concatenate(
|
||||||
|
(
|
||||||
|
position,
|
||||||
|
velocity,
|
||||||
|
com_inertia,
|
||||||
|
com_velocity,
|
||||||
|
actuator_forces,
|
||||||
|
external_contact_forces,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
def step(self, action):
    """Advance the simulation by one control step.

    The reward is forward progress (x-velocity of the center of mass,
    scaled) plus the alive bonus, minus the quadratic control cost.

    Returns:
        (observation, reward, done, info) in the classic Gym API shape.
    """
    com_before = mass_center(self.model, self.data)
    self.do_simulation(action, self.frame_skip)
    com_after = mass_center(self.model, self.data)

    # Finite-difference velocity of the center of mass over the step.
    x_velocity, y_velocity = (com_after - com_before) / self.dt

    ctrl_cost = self.control_cost(action)
    forward_reward = self._forward_reward_weight * x_velocity
    healthy_reward = self.healthy_reward

    observation = self._get_obs()
    reward = forward_reward + healthy_reward - ctrl_cost
    info = {
        "reward_linvel": forward_reward,
        "reward_quadctrl": -ctrl_cost,
        "reward_alive": healthy_reward,
        "x_position": com_after[0],
        "y_position": com_after[1],
        "distance_from_origin": np.linalg.norm(com_after, ord=2),
        "x_velocity": x_velocity,
        "y_velocity": y_velocity,
        "forward_reward": forward_reward,
    }

    return observation, reward, self.done, info
|
||||||
|
|
||||||
|
def reset_model(self):
    """Reset joint positions/velocities to the initial pose plus uniform noise.

    Noise is drawn i.i.d. from ``U(-reset_noise_scale, reset_noise_scale)``
    for every qpos and qvel entry, then the new state is pushed into the
    simulator. Returns the first observation of the new episode.
    """
    scale = self._reset_noise_scale

    qpos = self.init_qpos + self.np_random.uniform(
        low=-scale, high=scale, size=self.model.nq
    )
    qvel = self.init_qvel + self.np_random.uniform(
        low=-scale, high=scale, size=self.model.nv
    )
    self.set_state(qpos, qvel)

    return self._get_obs()
|
||||||
|
|
||||||
|
def viewer_setup(self):
    """Apply ``DEFAULT_CAMERA_CONFIG`` to the viewer camera.

    Array-valued entries are copied element-wise into the existing
    camera attribute (the underlying buffer is fixed-size, so it must be
    written in place); scalar entries are assigned directly.
    """
    camera = self.viewer.cam
    for attr_name, attr_value in DEFAULT_CAMERA_CONFIG.items():
        if isinstance(attr_value, np.ndarray):
            getattr(camera, attr_name)[:] = attr_value
        else:
            setattr(camera, attr_name, attr_value)
|
@@ -5,178 +5,10 @@ from gym.envs.mujoco import mujoco_env
|
|||||||
|
|
||||||
|
|
||||||
class HumanoidStandupEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
class HumanoidStandupEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
"""
|
|
||||||
### Description
|
|
||||||
|
|
||||||
This environment is based on the environment introduced by Tassa, Erez and Todorov
|
|
||||||
in ["Synthesis and stabilization of complex behaviors through online trajectory optimization"](https://ieeexplore.ieee.org/document/6386025).
|
|
||||||
The 3D bipedal robot is designed to simulate a human. It has a torso (abdomen) with a
|
|
||||||
pair of legs and arms. The legs each consist of two links, and so the arms (representing the
|
|
||||||
knees and elbows respectively). The environment starts with the humanoid lying on the ground,
|
|
||||||
and then the goal of the environment is to make the humanoid standup and then keep it standing
|
|
||||||
by applying torques on the various hinges.
|
|
||||||
|
|
||||||
### Action Space
|
|
||||||
The agent take a 17-element vector for actions.
|
|
||||||
|
|
||||||
The action space is a continuous `(action, ...)` all in `[-1, 1]`, where `action`
|
|
||||||
represents the numerical torques applied at the hinge joints.
|
|
||||||
|
|
||||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
|
||||||
|-----|----------------------|---------------|----------------|---------------------------------------|-------|------|
|
|
||||||
| 0 | Torque applied on the hinge in the y-coordinate of the abdomen | -0.4 | 0.4 | hip_1 (front_left_leg) | hinge | torque (N m) |
|
|
||||||
| 1 | Torque applied on the hinge in the z-coordinate of the abdomen | -0.4 | 0.4 | angle_1 (front_left_leg) | hinge | torque (N m) |
|
|
||||||
| 2 | Torque applied on the hinge in the x-coordinate of the abdomen | -0.4 | 0.4 | hip_2 (front_right_leg) | hinge | torque (N m) |
|
|
||||||
| 3 | Torque applied on the rotor between torso/abdomen and the right hip (x-coordinate) | -0.4 | 0.4 | right_hip_x (right_thigh) | hinge | torque (N m) |
|
|
||||||
| 4 | Torque applied on the rotor between torso/abdomen and the right hip (z-coordinate) | -0.4 | 0.4 | right_hip_z (right_thigh) | hinge | torque (N m) |
|
|
||||||
| 5 | Torque applied on the rotor between torso/abdomen and the right hip (y-coordinate) | -0.4 | 0.4 | right_hip_y (right_thigh) | hinge | torque (N m) |
|
|
||||||
| 6 | Torque applied on the rotor between the right hip/thigh and the right shin | -0.4 | 0.4 | right_knee | hinge | torque (N m) |
|
|
||||||
| 7 | Torque applied on the rotor between torso/abdomen and the left hip (x-coordinate) | -0.4 | 0.4 | left_hip_x (left_thigh) | hinge | torque (N m) |
|
|
||||||
| 8 | Torque applied on the rotor between torso/abdomen and the left hip (z-coordinate) | -0.4 | 0.4 | left_hip_z (left_thigh) | hinge | torque (N m) |
|
|
||||||
| 9 | Torque applied on the rotor between torso/abdomen and the left hip (y-coordinate) | -0.4 | 0.4 | left_hip_y (left_thigh) | hinge | torque (N m) |
|
|
||||||
| 10 | Torque applied on the rotor between the left hip/thigh and the left shin | -0.4 | 0.4 | left_knee | hinge | torque (N m) |
|
|
||||||
| 11 | Torque applied on the rotor between the torso and right upper arm (coordinate -1) | -0.4 | 0.4 | right_shoulder1 | hinge | torque (N m) |
|
|
||||||
| 12 | Torque applied on the rotor between the torso and right upper arm (coordinate -2) | -0.4 | 0.4 | right_shoulder2 | hinge | torque (N m) |
|
|
||||||
| 13 | Torque applied on the rotor between the right upper arm and right lower arm | -0.4 | 0.4 | right_elbow | hinge | torque (N m) |
|
|
||||||
| 14 | Torque applied on the rotor between the torso and left upper arm (coordinate -1) | -0.4 | 0.4 | left_shoulder1 | hinge | torque (N m) |
|
|
||||||
| 15 | Torque applied on the rotor between the torso and left upper arm (coordinate -2) | -0.4 | 0.4 | left_shoulder2 | hinge | torque (N m) |
|
|
||||||
| 16 | Torque applied on the rotor between the left upper arm and left lower arm | -0.4 | 0.4 | left_elbow | hinge | torque (N m) |
|
|
||||||
|
|
||||||
### Observation Space
|
|
||||||
|
|
||||||
The state space consists of positional values of different body parts of the Humanoid,
|
|
||||||
followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities.
|
|
||||||
|
|
||||||
**Note:** The x- and y-coordinates of the torso are being omitted to produce position-agnostic behavior in policies
|
|
||||||
|
|
||||||
The observation is a `ndarray` with shape `(376,)` where the elements correspond to the following:
|
|
||||||
|
|
||||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|
|
||||||
|-----|---------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
|
|
||||||
| 0 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) |
|
|
||||||
| 1 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
|
||||||
| 2 | y-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
|
||||||
| 3 | z-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
|
||||||
| 4 | w-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
|
||||||
| 5 | z-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | angle (rad) |
|
|
||||||
| 6 | y-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | angle (rad) |
|
|
||||||
| 7 | x-angle of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angle (rad) |
|
|
||||||
| 8 | x-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | angle (rad) |
|
|
||||||
| 9 | z-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | angle (rad) |
|
|
||||||
| 10 | y-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | angle (rad) |
|
|
||||||
| 11 | angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | angle (rad) |
|
|
||||||
| 12 | x-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | angle (rad) |
|
|
||||||
| 13 | z-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | angle (rad) |
|
|
||||||
| 14 | y-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | angle (rad) |
|
|
||||||
| 15 | angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | angle (rad) |
|
|
||||||
| 16 | coordinate-1 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | angle (rad) |
|
|
||||||
| 17 | coordinate-2 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | angle (rad) |
|
|
||||||
| 18 | angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | angle (rad) |
|
|
||||||
| 19 | coordinate-1 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | angle (rad) |
|
|
||||||
| 20 | coordinate-2 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | angle (rad) |
|
|
||||||
| 21 | angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angle (rad) |
|
|
||||||
| 22 | x-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
|
||||||
| 23 | y-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
|
||||||
| 24 | z-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
|
||||||
| 25 | x-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
|
|
||||||
| 26 | y-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
|
|
||||||
| 27 | z-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
|
|
||||||
| 28 | z-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | anglular velocity (rad/s) |
|
|
||||||
| 29 | y-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | anglular velocity (rad/s) |
|
|
||||||
| 30 | x-coordinate of angular velocity of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angular velocity (rad/s) |
|
|
||||||
| 31 | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | anglular velocity (rad/s) |
|
|
||||||
| 32 | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | anglular velocity (rad/s) |
|
|
||||||
| 33 | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | anglular velocity (rad/s) |
|
|
||||||
| 35 | angular velocity of the angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | anglular velocity (rad/s) |
|
|
||||||
| 36 | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | anglular velocity (rad/s) |
|
|
||||||
| 37 | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | anglular velocity (rad/s) |
|
|
||||||
| 38 | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | anglular velocity (rad/s) |
|
|
||||||
| 39 | angular velocity of the angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | anglular velocity (rad/s) |
|
|
||||||
| 40 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | anglular velocity (rad/s) |
|
|
||||||
| 41 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | anglular velocity (rad/s) |
|
|
||||||
| 42 | angular velocity of the angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | anglular velocity (rad/s) |
|
|
||||||
| 43 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | anglular velocity (rad/s) |
|
|
||||||
| 44 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | anglular velocity (rad/s) |
|
|
||||||
| 45 | angular velocity of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angular velocity (rad/s) |
|
|
||||||
|
|
||||||
|
|
||||||
Additionally, after all the positional and velocity based values in the table,
|
|
||||||
the state_space consists of (in order):
|
|
||||||
- *cinert:* Mass and inertia of a single rigid body relative to the center of mass
|
|
||||||
(this is an intermediate result of transition). It has shape 14*10 (*nbody * 10*)
|
|
||||||
and hence adds to another 140 elements in the state space.
|
|
||||||
- *cvel:* Center of mass based velocity. It has shape 14 * 6 (*nbody * 6*) and hence
|
|
||||||
adds another 84 elements in the state space
|
|
||||||
- *qfrc_actuator:* Constraint force generated as the actuator force. This has shape
|
|
||||||
`(23,)` *(nv * 1)* and hence adds another 23 elements to the state space.
|
|
||||||
- *cfrc_ext:* This is the center of mass based external force on the body. It has shape
|
|
||||||
14 * 6 (*nbody * 6*) and hence adds to another 84 elements in the state space.
|
|
||||||
where *nbody* stands for the number of bodies in the robot and *nv* stands for the number
|
|
||||||
of degrees of freedom (*= dim(qvel)*)
|
|
||||||
|
|
||||||
The (x,y,z) coordinates are translational DOFs while the orientations are rotational
|
|
||||||
DOFs expressed as quaternions. One can read more about free joints on the
|
|
||||||
[Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html).
|
|
||||||
|
|
||||||
**Note:** There have been reported issues that using a Mujoco-Py version > 2.0 results
|
|
||||||
in the contact forces always being 0. As such we recommend to use a Mujoco-Py version < 2.0
|
|
||||||
when using the Humanoid environment if you would like to report results with contact forces
|
|
||||||
(if contact forces are not used in your experiments, you can use version > 2.0).
|
|
||||||
|
|
||||||
### Rewards
|
|
||||||
The reward consists of three parts:
|
|
||||||
- *uph_cost*: A reward for moving upward (in an attempt to stand up). This is not a relative
|
|
||||||
reward which measures how much upward it has moved from the last timestep, but it is an
|
|
||||||
absolute reward which measures how much upward the Humanoid has moved overall. It is
|
|
||||||
measured as *(z coordinate after action - 0)/(atomic timestep)*, where *z coordinate after
|
|
||||||
action* is index 0 in the state/index 2 in the table, and *atomic timestep* is the time for
|
|
||||||
one frame of movement even though the simulation has a framerate of 5 (done in order to inflate
|
|
||||||
rewards a little for faster learning).
|
|
||||||
- *quad_ctrl_cost*: A negative reward for penalising the humanoid if it has too large of
|
|
||||||
a control force. If there are *nu* actuators/controls, then the control has shape `nu x 1`.
|
|
||||||
It is measured as *0.1 **x** sum(control<sup>2</sup>)*.
|
|
||||||
- *quad_impact_cost*: A negative reward for penalising the humanoid if the external
|
|
||||||
contact force is too large. It is calculated as *min(0.5 * 0.000001 * sum(external
|
|
||||||
contact force<sup>2</sup>), 10)*.
|
|
||||||
|
|
||||||
The total reward returned is ***reward*** *=* *uph_cost + 1 - quad_ctrl_cost - quad_impact_cost*
|
|
||||||
|
|
||||||
### Starting State
|
|
||||||
All observations start in state
|
|
||||||
(0.0, 0.0, 0.105, 1.0, 0.0 ... 0.0) with a uniform noise in the range of
|
|
||||||
[-0.01, 0.01] added to the positional and velocity values (values in the table)
|
|
||||||
for stochasticity. Note that the initial z coordinate is intentionally selected
|
|
||||||
to be low, thereby indicating a laying down humanoid. The initial orientation is
|
|
||||||
designed to make it face forward as well.
|
|
||||||
|
|
||||||
### Episode Termination
|
|
||||||
The episode terminates when any of the following happens:
|
|
||||||
|
|
||||||
1. The episode duration reaches a 1000 timesteps
|
|
||||||
2. Any of the state space values is no longer finite
|
|
||||||
|
|
||||||
### Arguments
|
|
||||||
|
|
||||||
No additional arguments are currently supported.
|
|
||||||
|
|
||||||
```
|
|
||||||
env = gym.make('HumanoidStandup-v2')
|
|
||||||
```
|
|
||||||
|
|
||||||
There is no v3 for HumanoidStandup, unlike the robot environments where a v3 and
|
|
||||||
beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
|
||||||
|
|
||||||
|
|
||||||
### Version History
|
|
||||||
|
|
||||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
|
||||||
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
|
||||||
* v0: Initial versions release (1.0.0)
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self):
    """Build the HumanoidStandup environment.

    Explicitly requests the legacy ``mujoco_py`` bindings; the new
    ``mujoco``-based version of this environment lives in
    ``humanoidstandup_v4.py``.
    """
    # frame_skip=5: each env step advances the simulation 5 frames.
    mujoco_env.MujocoEnv.__init__(
        self, "humanoidstandup.xml", 5, mujoco_bindings="mujoco_py"
    )
    # No constructor arguments to record beyond self.
    utils.EzPickle.__init__(self)
|
||||||
|
|
||||||
def _get_obs(self):
|
def _get_obs(self):
|
||||||
|
247
gym/envs/mujoco/humanoidstandup_v4.py
Normal file
247
gym/envs/mujoco/humanoidstandup_v4.py
Normal file
@@ -0,0 +1,247 @@
|
|||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from gym import utils
|
||||||
|
from gym.envs.mujoco import mujoco_env
|
||||||
|
|
||||||
|
|
||||||
|
class HumanoidStandupEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
|
"""
|
||||||
|
### Description
|
||||||
|
|
||||||
|
This environment is based on the environment introduced by Tassa, Erez and Todorov
|
||||||
|
in ["Synthesis and stabilization of complex behaviors through online trajectory optimization"](https://ieeexplore.ieee.org/document/6386025).
|
||||||
|
The 3D bipedal robot is designed to simulate a human. It has a torso (abdomen) with a
|
||||||
|
pair of legs and arms. The legs each consist of two links, and so the arms (representing the
|
||||||
|
knees and elbows respectively). The environment starts with the humanoid lying on the ground,
|
||||||
|
and then the goal of the environment is to make the humanoid standup and then keep it standing
|
||||||
|
by applying torques on the various hinges.
|
||||||
|
|
||||||
|
### Action Space
|
||||||
|
The agent take a 17-element vector for actions.
|
||||||
|
|
||||||
|
The action space is a continuous `(action, ...)` all in `[-1, 1]`, where `action`
|
||||||
|
represents the numerical torques applied at the hinge joints.
|
||||||
|
|
||||||
|
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||||
|
|-----|----------------------|---------------|----------------|---------------------------------------|-------|------|
|
||||||
|
| 0 | Torque applied on the hinge in the y-coordinate of the abdomen | -0.4 | 0.4 | hip_1 (front_left_leg) | hinge | torque (N m) |
|
||||||
|
| 1 | Torque applied on the hinge in the z-coordinate of the abdomen | -0.4 | 0.4 | angle_1 (front_left_leg) | hinge | torque (N m) |
|
||||||
|
| 2 | Torque applied on the hinge in the x-coordinate of the abdomen | -0.4 | 0.4 | hip_2 (front_right_leg) | hinge | torque (N m) |
|
||||||
|
| 3 | Torque applied on the rotor between torso/abdomen and the right hip (x-coordinate) | -0.4 | 0.4 | right_hip_x (right_thigh) | hinge | torque (N m) |
|
||||||
|
| 4 | Torque applied on the rotor between torso/abdomen and the right hip (z-coordinate) | -0.4 | 0.4 | right_hip_z (right_thigh) | hinge | torque (N m) |
|
||||||
|
| 5 | Torque applied on the rotor between torso/abdomen and the right hip (y-coordinate) | -0.4 | 0.4 | right_hip_y (right_thigh) | hinge | torque (N m) |
|
||||||
|
| 6 | Torque applied on the rotor between the right hip/thigh and the right shin | -0.4 | 0.4 | right_knee | hinge | torque (N m) |
|
||||||
|
| 7 | Torque applied on the rotor between torso/abdomen and the left hip (x-coordinate) | -0.4 | 0.4 | left_hip_x (left_thigh) | hinge | torque (N m) |
|
||||||
|
| 8 | Torque applied on the rotor between torso/abdomen and the left hip (z-coordinate) | -0.4 | 0.4 | left_hip_z (left_thigh) | hinge | torque (N m) |
|
||||||
|
| 9 | Torque applied on the rotor between torso/abdomen and the left hip (y-coordinate) | -0.4 | 0.4 | left_hip_y (left_thigh) | hinge | torque (N m) |
|
||||||
|
| 10 | Torque applied on the rotor between the left hip/thigh and the left shin | -0.4 | 0.4 | left_knee | hinge | torque (N m) |
|
||||||
|
| 11 | Torque applied on the rotor between the torso and right upper arm (coordinate -1) | -0.4 | 0.4 | right_shoulder1 | hinge | torque (N m) |
|
||||||
|
| 12 | Torque applied on the rotor between the torso and right upper arm (coordinate -2) | -0.4 | 0.4 | right_shoulder2 | hinge | torque (N m) |
|
||||||
|
| 13 | Torque applied on the rotor between the right upper arm and right lower arm | -0.4 | 0.4 | right_elbow | hinge | torque (N m) |
|
||||||
|
| 14 | Torque applied on the rotor between the torso and left upper arm (coordinate -1) | -0.4 | 0.4 | left_shoulder1 | hinge | torque (N m) |
|
||||||
|
| 15 | Torque applied on the rotor between the torso and left upper arm (coordinate -2) | -0.4 | 0.4 | left_shoulder2 | hinge | torque (N m) |
|
||||||
|
| 16 | Torque applied on the rotor between the left upper arm and left lower arm | -0.4 | 0.4 | left_elbow | hinge | torque (N m) |
|
||||||
|
|
||||||
|
### Observation Space
|
||||||
|
|
||||||
|
The state space consists of positional values of different body parts of the Humanoid,
|
||||||
|
followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities.
|
||||||
|
|
||||||
|
The observation is a `ndarray` with shape `(376,)` where the elements correspond to the following:
|
||||||
|
|
||||||
|
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|
||||||
|
|-----|---------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
|
||||||
|
| 0 | x-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) |
|
||||||
|
| 1 | y-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) |
|
||||||
|
| 2 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) |
|
||||||
|
| 3 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
||||||
|
| 4 | y-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
||||||
|
| 5 | z-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
||||||
|
| 6 | w-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
|
||||||
|
| 7 | z-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | angle (rad) |
|
||||||
|
| 8 | y-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | angle (rad) |
|
||||||
|
| 9 | x-angle of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angle (rad) |
|
||||||
|
| 10 | x-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | angle (rad) |
|
||||||
|
| 11 | z-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | angle (rad) |
|
||||||
|
| 12 | y-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | angle (rad) |
|
||||||
|
| 13 | angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | angle (rad) |
|
||||||
|
| 14 | x-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | angle (rad) |
|
||||||
|
| 15 | z-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | angle (rad) |
|
||||||
|
| 16 | y-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | angle (rad) |
|
||||||
|
| 17 | angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | angle (rad) |
|
||||||
|
| 18 | coordinate-1 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | angle (rad) |
|
||||||
|
| 19 | coordinate-2 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | angle (rad) |
|
||||||
|
| 20 | angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | angle (rad) |
|
||||||
|
| 21 | coordinate-1 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | angle (rad) |
|
||||||
|
| 22 | coordinate-2 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | angle (rad) |
|
||||||
|
| 23 | angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angle (rad) |
|
||||||
|
| 24 | x-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
||||||
|
| 25 | y-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
||||||
|
| 26 | z-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
|
||||||
|
| 27 | x-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
|
||||||
|
| 28 | y-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
|
||||||
|
| 29 | z-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
|
||||||
|
| 30 | z-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | anglular velocity (rad/s) |
|
||||||
|
| 31 | y-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | anglular velocity (rad/s) |
|
||||||
|
| 32 | x-coordinate of angular velocity of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angular velocity (rad/s) |
|
||||||
|
| 33 | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | anglular velocity (rad/s) |
|
||||||
|
| 34 | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | anglular velocity (rad/s) |
|
||||||
|
| 35 | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | anglular velocity (rad/s) |
|
||||||
|
| 36 | angular velocity of the angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | anglular velocity (rad/s) |
|
||||||
|
| 37 | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | anglular velocity (rad/s) |
|
||||||
|
| 38 | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | anglular velocity (rad/s) |
|
||||||
|
| 39 | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | anglular velocity (rad/s) |
|
||||||
|
| 40 | angular velocity of the angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | anglular velocity (rad/s) |
|
||||||
|
| 41 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | anglular velocity (rad/s) |
|
||||||
|
| 42 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | anglular velocity (rad/s) |
|
||||||
|
| 43 | angular velocity of the angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | anglular velocity (rad/s) |
|
||||||
|
| 44 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | anglular velocity (rad/s) |
|
||||||
|
| 45 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | anglular velocity (rad/s) |
|
||||||
|
| 46 | angular velocity of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angular velocity (rad/s) |
|
||||||
|
|
||||||
|
|
||||||
|
Additionally, after all the positional and velocity based values in the table,
|
||||||
|
the state_space consists of (in order):
|
||||||
|
- *cinert:* Mass and inertia of a single rigid body relative to the center of mass
|
||||||
|
(this is an intermediate result of transition). It has shape 14*10 (*nbody * 10*)
|
||||||
|
and hence adds to another 140 elements in the state space.
|
||||||
|
- *cvel:* Center of mass based velocity. It has shape 14 * 6 (*nbody * 6*) and hence
|
||||||
|
adds another 84 elements in the state space
|
||||||
|
- *qfrc_actuator:* Constraint force generated as the actuator force. This has shape
|
||||||
|
`(23,)` *(nv * 1)* and hence adds another 23 elements to the state space.
|
||||||
|
- *cfrc_ext:* This is the center of mass based external force on the body. It has shape
|
||||||
|
14 * 6 (*nbody * 6*) and hence adds to another 84 elements in the state space.
|
||||||
|
where *nbody* stands for the number of bodies in the robot and *nv* stands for the number
|
||||||
|
of degrees of freedom (*= dim(qvel)*)
|
||||||
|
|
||||||
|
The (x,y,z) coordinates are translational DOFs while the orientations are rotational
|
||||||
|
DOFs expressed as quaternions. One can read more about free joints on the
|
||||||
|
[Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html).
|
||||||
|
|
||||||
|
**Note:** There are 47 elements in the table above - giving rise to `(378,)`
|
||||||
|
elements in the state space. In practice (and Gym implementation), the first two
|
||||||
|
positional elements are omitted from the state space since the reward function is
|
||||||
|
calculated based on the x-coordinate value. This value is hidden from the algorithm,
|
||||||
|
which in turn has to develop an abstract understanding of it from the observed rewards.
|
||||||
|
Therefore, observation space has shape `(376,)` instead of `(378,)` and the table should not have the first two rows.
|
||||||
|
|
||||||
|
**Note:** HumanoidStandup-v4 environment no longer has the following contact forces issue.
|
||||||
|
If using previous HumanoidStandup versions from v4, there have been reported issues that using a Mujoco-Py version > 2.0 results
|
||||||
|
in the contact forces always being 0. As such we recommend to use a Mujoco-Py version < 2.0
|
||||||
|
when using the Humanoid environment if you would like to report results with contact forces
|
||||||
|
(if contact forces are not used in your experiments, you can use version > 2.0).
|
||||||
|
|
||||||
|
### Rewards
|
||||||
|
The reward consists of three parts:
|
||||||
|
- *uph_cost*: A reward for moving upward (in an attempt to stand up). This is not a relative
|
||||||
|
reward which measures how much upward it has moved from the last timestep, but it is an
|
||||||
|
absolute reward which measures how much upward the Humanoid has moved overall. It is
|
||||||
|
measured as *(z coordinate after action - 0)/(atomic timestep)*, where *z coordinate after
|
||||||
|
action* is index 0 in the state/index 2 in the table, and *atomic timestep* is the time for
|
||||||
|
one frame of movement even though the simulation has a framerate of 5 (done in order to inflate
|
||||||
|
rewards a little for faster learning).
|
||||||
|
- *quad_ctrl_cost*: A negative reward for penalising the humanoid if it has too large of
|
||||||
|
a control force. If there are *nu* actuators/controls, then the control has shape `nu x 1`.
|
||||||
|
It is measured as *0.1 **x** sum(control<sup>2</sup>)*.
|
||||||
|
- *quad_impact_cost*: A negative reward for penalising the humanoid if the external
|
||||||
|
contact force is too large. It is calculated as *min(0.5 * 0.000001 * sum(external
|
||||||
|
contact force<sup>2</sup>), 10)*.
|
||||||
|
|
||||||
|
The total reward returned is ***reward*** *=* *uph_cost + 1 - quad_ctrl_cost - quad_impact_cost*
|
||||||
|
|
||||||
|
### Starting State
|
||||||
|
All observations start in state
|
||||||
|
(0.0, 0.0, 0.105, 1.0, 0.0 ... 0.0) with a uniform noise in the range of
|
||||||
|
[-0.01, 0.01] added to the positional and velocity values (values in the table)
|
||||||
|
for stochasticity. Note that the initial z coordinate is intentionally selected
|
||||||
|
to be low, thereby indicating a laying down humanoid. The initial orientation is
|
||||||
|
designed to make it face forward as well.
|
||||||
|
|
||||||
|
### Episode Termination
|
||||||
|
The episode terminates when any of the following happens:
|
||||||
|
|
||||||
|
1. The episode duration reaches a 1000 timesteps
|
||||||
|
2. Any of the state space values is no longer finite
|
||||||
|
|
||||||
|
### Arguments
|
||||||
|
|
||||||
|
No additional arguments are currently supported (in v2 and lower), but
|
||||||
|
modifications can be made to the XML file in the assets folder
|
||||||
|
(or by changing the path to a modified XML file in another folder).
|
||||||
|
|
||||||
|
```
|
||||||
|
env = gym.make('HumanoidStandup-v2')
|
||||||
|
```
|
||||||
|
|
||||||
|
There is no v3 for HumanoidStandup, unlike the robot environments where a v3 and
|
||||||
|
beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||||
|
|
||||||
|
|
||||||
|
### Version History
|
||||||
|
|
||||||
|
* v4: all mujoco environments now use the mujoco bindings in mujoco>=2.1.3
|
||||||
|
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
||||||
|
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||||
|
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
||||||
|
* v0: Initial versions release (1.0.0)
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self):
    """Load the humanoid-standup MJCF model with a frame skip of 5.

    Uses the new `mujoco` bindings (v4 environment — no `mujoco_py`).
    EzPickle is initialised with no arguments since the environment
    takes no constructor parameters.
    """
    mujoco_env.MujocoEnv.__init__(self, "humanoidstandup.xml", 5)
    utils.EzPickle.__init__(self)
|
||||||
|
|
||||||
|
def _get_obs(self):
|
||||||
|
data = self.data
|
||||||
|
return np.concatenate(
|
||||||
|
[
|
||||||
|
data.qpos.flat[2:],
|
||||||
|
data.qvel.flat,
|
||||||
|
data.cinert.flat,
|
||||||
|
data.cvel.flat,
|
||||||
|
data.qfrc_actuator.flat,
|
||||||
|
data.cfrc_ext.flat,
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
def step(self, a):
|
||||||
|
self.do_simulation(a, self.frame_skip)
|
||||||
|
pos_after = self.data.qpos[2]
|
||||||
|
data = self.data
|
||||||
|
uph_cost = (pos_after - 0) / self.model.opt.timestep
|
||||||
|
|
||||||
|
quad_ctrl_cost = 0.1 * np.square(data.ctrl).sum()
|
||||||
|
quad_impact_cost = 0.5e-6 * np.square(data.cfrc_ext).sum()
|
||||||
|
quad_impact_cost = min(quad_impact_cost, 10)
|
||||||
|
reward = uph_cost - quad_ctrl_cost - quad_impact_cost + 1
|
||||||
|
|
||||||
|
done = bool(False)
|
||||||
|
return (
|
||||||
|
self._get_obs(),
|
||||||
|
reward,
|
||||||
|
done,
|
||||||
|
dict(
|
||||||
|
reward_linup=uph_cost,
|
||||||
|
reward_quadctrl=-quad_ctrl_cost,
|
||||||
|
reward_impact=-quad_impact_cost,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
def reset_model(self):
|
||||||
|
c = 0.01
|
||||||
|
self.set_state(
|
||||||
|
self.init_qpos + self.np_random.uniform(low=-c, high=c, size=self.model.nq),
|
||||||
|
self.init_qvel
|
||||||
|
+ self.np_random.uniform(
|
||||||
|
low=-c,
|
||||||
|
high=c,
|
||||||
|
size=self.model.nv,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
return self._get_obs()
|
||||||
|
|
||||||
|
def viewer_setup(self):
|
||||||
|
self.viewer.cam.trackbodyid = 1
|
||||||
|
self.viewer.cam.distance = self.model.stat.extent * 1.0
|
||||||
|
self.viewer.cam.lookat[2] = 0.8925
|
||||||
|
self.viewer.cam.elevation = -20
|
@@ -5,113 +5,10 @@ from gym.envs.mujoco import mujoco_env
|
|||||||
|
|
||||||
|
|
||||||
class InvertedDoublePendulumEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
class InvertedDoublePendulumEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
"""
|
|
||||||
### Description
|
|
||||||
|
|
||||||
This environment originates from control theory and builds on the cartpole
|
|
||||||
environment based on the work done by Barto, Sutton, and Anderson in
|
|
||||||
["Neuronlike adaptive elements that can solve difficult learning control problems"](https://ieeexplore.ieee.org/document/6313077),
|
|
||||||
powered by the Mujoco physics simulator - allowing for more complex experiments
|
|
||||||
(such as varying the effects of gravity or constraints). This environment involves a cart that can
|
|
||||||
be moved linearly, with a pole fixed on it and a second pole fixed on the other end of the first one
|
|
||||||
(leaving the second pole as the only one with one free end). The cart can be pushed left or right,
|
|
||||||
and the goal is to balance the second pole on top of the first pole, which is in turn on top of the
|
|
||||||
cart, by applying continuous forces on the cart.
|
|
||||||
|
|
||||||
### Action Space
|
|
||||||
The agent take a 1-element vector for actions.
|
|
||||||
The action space is a continuous `(action)` in `[-1, 1]`, where `action` represents the
|
|
||||||
numerical force applied to the cart (with magnitude representing the amount of force and
|
|
||||||
sign representing the direction)
|
|
||||||
|
|
||||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
|
||||||
|-----|---------------------------|-------------|-------------|----------------------------------|-------|-----------|
|
|
||||||
| 0 | Force applied on the cart | -1 | 1 | slider | slide | Force (N) |
|
|
||||||
|
|
||||||
### Observation Space
|
|
||||||
|
|
||||||
The state space consists of positional values of different body parts of the pendulum system,
|
|
||||||
followed by the velocities of those individual parts (their derivatives) with all the
|
|
||||||
positions ordered before all the velocities.
|
|
||||||
|
|
||||||
The observation is a `ndarray` with shape `(11,)` where the elements correspond to the following:
|
|
||||||
|
|
||||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
|
||||||
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
|
||||||
| 0 | position of the cart along the linear surface | -Inf | Inf | slider | slide | position (m) |
|
|
||||||
| 1 | sine of the angle between the cart and the first pole | -Inf | Inf | sin(hinge) | hinge | unitless |
|
|
||||||
| 2 | sine of the angle between the two poles | -Inf | Inf | sin(hinge2) | hinge | unitless |
|
|
||||||
| 3 | cosine of the angle between the cart and the first pole | -Inf | Inf | cos(hinge) | hinge | unitless |
|
|
||||||
| 4 | cosine of the angle between the two poles | -Inf | Inf | cos(hinge2) | hinge | unitless |
|
|
||||||
| 5 | velocity of the cart | -Inf | Inf | slider | slide | velocity (m/s) |
|
|
||||||
| 6 | angular velocity of the angle between the cart and the first pole | -Inf | Inf | hinge | hinge | angular velocity (rad/s) |
|
|
||||||
| 7 | angular velocity of the angle between the two poles | -Inf | Inf | hinge2 | hinge | angular velocity (rad/s) |
|
|
||||||
| 8 | constraint force - 1 | -Inf | Inf | | | Force (N) |
|
|
||||||
| 9 | constraint force - 2 | -Inf | Inf | | | Force (N) |
|
|
||||||
| 10 | constraint force - 3 | -Inf | Inf | | | Force (N) |
|
|
||||||
|
|
||||||
|
|
||||||
There is physical contact between the robots and their environment - and Mujoco
|
|
||||||
attempts at getting realistic physics simulations for the possible physical contact
|
|
||||||
dynamics by aiming for physical accuracy and computational efficiency.
|
|
||||||
|
|
||||||
There is one constraint force for contacts for each degree of freedom (3).
|
|
||||||
The approach and handling of constraints by Mujoco is unique to the simulator
|
|
||||||
and is based on their research. One can find more information in their
|
|
||||||
[*documentation*](https://mujoco.readthedocs.io/en/latest/computation.html)
|
|
||||||
or in their paper
|
|
||||||
["Analytically-invertible dynamics with contacts and constraints: Theory and implementation in MuJoCo"](https://homes.cs.washington.edu/~todorov/papers/TodorovICRA14.pdf).
|
|
||||||
|
|
||||||
|
|
||||||
### Rewards
|
|
||||||
|
|
||||||
The reward consists of two parts:
|
|
||||||
- *alive_bonus*: The goal is to make the second inverted pendulum stand upright
|
|
||||||
(within a certain angle limit) as long as possible - as such a reward of +10 is awarded
|
|
||||||
for each timestep that the second pole is upright.
|
|
||||||
- *distance_penalty*: This reward is a measure of how far the *tip* of the second pendulum
|
|
||||||
(the only free end) moves, and it is calculated as
|
|
||||||
*0.01 * x<sup>2</sup> + (y - 2)<sup>2</sup>*, where *x* is the x-coordinate of the tip
|
|
||||||
and *y* is the y-coordinate of the tip of the second pole.
|
|
||||||
- *velocity_penalty*: A negative reward for penalising the agent if it moves too
|
|
||||||
fast *0.001 * v<sub>1</sub><sup>2</sup> + 0.005 * v<sub>2</sub> <sup>2</sup>*
|
|
||||||
|
|
||||||
The total reward returned is ***reward*** *=* *alive_bonus - distance_penalty - velocity_penalty*
|
|
||||||
|
|
||||||
### Starting State
|
|
||||||
All observations start in state
|
|
||||||
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) with a uniform noise in the range
|
|
||||||
of [-0.1, 0.1] added to the positional values (cart position and pole angles) and standard
|
|
||||||
normal force with a standard deviation of 0.1 added to the velocity values for stochasticity.
|
|
||||||
|
|
||||||
### Episode Termination
|
|
||||||
The episode terminates when any of the following happens:
|
|
||||||
|
|
||||||
1. The episode duration reaches 1000 timesteps.
|
|
||||||
2. Any of the state space values is no longer finite.
|
|
||||||
3. The y_coordinate of the tip of the second pole *is less than or equal* to 1. The maximum standing height of the system is 1.196 m when all the parts are perpendicularly vertical on top of each other).
|
|
||||||
|
|
||||||
### Arguments
|
|
||||||
|
|
||||||
No additional arguments are currently supported.
|
|
||||||
|
|
||||||
```
|
|
||||||
env = gym.make('InvertedDoublePendulum-v2')
|
|
||||||
```
|
|
||||||
There is no v3 for InvertedPendulum, unlike the robot environments where a v3 and
|
|
||||||
beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
|
||||||
|
|
||||||
|
|
||||||
### Version History
|
|
||||||
|
|
||||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
|
||||||
* v1: max_time_steps raised to 1000 for robot based tasks (including inverted pendulum)
|
|
||||||
* v0: Initial versions release (1.0.0)
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
mujoco_env.MujocoEnv.__init__(self, "inverted_double_pendulum.xml", 5)
|
mujoco_env.MujocoEnv.__init__(
|
||||||
|
self, "inverted_double_pendulum.xml", 5, mujoco_bindings="mujoco_py"
|
||||||
|
)
|
||||||
utils.EzPickle.__init__(self)
|
utils.EzPickle.__init__(self)
|
||||||
|
|
||||||
def step(self, action):
|
def step(self, action):
|
||||||
|
156
gym/envs/mujoco/inverted_double_pendulum_v4.py
Normal file
156
gym/envs/mujoco/inverted_double_pendulum_v4.py
Normal file
@@ -0,0 +1,156 @@
|
|||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from gym import utils
|
||||||
|
from gym.envs.mujoco import mujoco_env
|
||||||
|
|
||||||
|
|
||||||
|
class InvertedDoublePendulumEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
|
"""
|
||||||
|
### Description
|
||||||
|
|
||||||
|
This environment originates from control theory and builds on the cartpole
|
||||||
|
environment based on the work done by Barto, Sutton, and Anderson in
|
||||||
|
["Neuronlike adaptive elements that can solve difficult learning control problems"](https://ieeexplore.ieee.org/document/6313077),
|
||||||
|
powered by the Mujoco physics simulator - allowing for more complex experiments
|
||||||
|
(such as varying the effects of gravity or constraints). This environment involves a cart that can
|
||||||
|
be moved linearly, with a pole fixed on it and a second pole fixed on the other end of the first one
|
||||||
|
(leaving the second pole as the only one with one free end). The cart can be pushed left or right,
|
||||||
|
and the goal is to balance the second pole on top of the first pole, which is in turn on top of the
|
||||||
|
cart, by applying continuous forces on the cart.
|
||||||
|
|
||||||
|
### Action Space
|
||||||
|
The agent take a 1-element vector for actions.
|
||||||
|
The action space is a continuous `(action)` in `[-1, 1]`, where `action` represents the
|
||||||
|
numerical force applied to the cart (with magnitude representing the amount of force and
|
||||||
|
sign representing the direction)
|
||||||
|
|
||||||
|
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||||
|
|-----|---------------------------|-------------|-------------|----------------------------------|-------|-----------|
|
||||||
|
| 0 | Force applied on the cart | -1 | 1 | slider | slide | Force (N) |
|
||||||
|
|
||||||
|
### Observation Space
|
||||||
|
|
||||||
|
The state space consists of positional values of different body parts of the pendulum system,
|
||||||
|
followed by the velocities of those individual parts (their derivatives) with all the
|
||||||
|
positions ordered before all the velocities.
|
||||||
|
|
||||||
|
The observation is a `ndarray` with shape `(11,)` where the elements correspond to the following:
|
||||||
|
|
||||||
|
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
||||||
|
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
||||||
|
| 0 | position of the cart along the linear surface | -Inf | Inf | slider | slide | position (m) |
|
||||||
|
| 1 | sine of the angle between the cart and the first pole | -Inf | Inf | sin(hinge) | hinge | unitless |
|
||||||
|
| 2 | sine of the angle between the two poles | -Inf | Inf | sin(hinge2) | hinge | unitless |
|
||||||
|
| 3 | cosine of the angle between the cart and the first pole | -Inf | Inf | cos(hinge) | hinge | unitless |
|
||||||
|
| 4 | cosine of the angle between the two poles | -Inf | Inf | cos(hinge2) | hinge | unitless |
|
||||||
|
| 5 | velocity of the cart | -Inf | Inf | slider | slide | velocity (m/s) |
|
||||||
|
| 6 | angular velocity of the angle between the cart and the first pole | -Inf | Inf | hinge | hinge | angular velocity (rad/s) |
|
||||||
|
| 7 | angular velocity of the angle between the two poles | -Inf | Inf | hinge2 | hinge | angular velocity (rad/s) |
|
||||||
|
| 8 | constraint force - 1 | -Inf | Inf | | | Force (N) |
|
||||||
|
| 9 | constraint force - 2 | -Inf | Inf | | | Force (N) |
|
||||||
|
| 10 | constraint force - 3 | -Inf | Inf | | | Force (N) |
|
||||||
|
|
||||||
|
|
||||||
|
There is physical contact between the robots and their environment - and Mujoco
|
||||||
|
attempts at getting realistic physics simulations for the possible physical contact
|
||||||
|
dynamics by aiming for physical accuracy and computational efficiency.
|
||||||
|
|
||||||
|
There is one constraint force for contacts for each degree of freedom (3).
|
||||||
|
The approach and handling of constraints by Mujoco is unique to the simulator
|
||||||
|
and is based on their research. One can find more information in their
|
||||||
|
[*documentation*](https://mujoco.readthedocs.io/en/latest/computation.html)
|
||||||
|
or in their paper
|
||||||
|
["Analytically-invertible dynamics with contacts and constraints: Theory and implementation in MuJoCo"](https://homes.cs.washington.edu/~todorov/papers/TodorovICRA14.pdf).
|
||||||
|
|
||||||
|
|
||||||
|
### Rewards
|
||||||
|
|
||||||
|
The reward consists of two parts:
|
||||||
|
- *alive_bonus*: The goal is to make the second inverted pendulum stand upright
|
||||||
|
(within a certain angle limit) as long as possible - as such a reward of +10 is awarded
|
||||||
|
for each timestep that the second pole is upright.
|
||||||
|
- *distance_penalty*: This reward is a measure of how far the *tip* of the second pendulum
|
||||||
|
(the only free end) moves, and it is calculated as
|
||||||
|
*0.01 * x<sup>2</sup> + (y - 2)<sup>2</sup>*, where *x* is the x-coordinate of the tip
|
||||||
|
and *y* is the y-coordinate of the tip of the second pole.
|
||||||
|
- *velocity_penalty*: A negative reward for penalising the agent if it moves too
|
||||||
|
fast *0.001 * v<sub>1</sub><sup>2</sup> + 0.005 * v<sub>2</sub> <sup>2</sup>*
|
||||||
|
|
||||||
|
The total reward returned is ***reward*** *=* *alive_bonus - distance_penalty - velocity_penalty*
|
||||||
|
|
||||||
|
### Starting State
|
||||||
|
All observations start in state
|
||||||
|
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) with a uniform noise in the range
|
||||||
|
of [-0.1, 0.1] added to the positional values (cart position and pole angles) and standard
|
||||||
|
normal force with a standard deviation of 0.1 added to the velocity values for stochasticity.
|
||||||
|
|
||||||
|
### Episode Termination
|
||||||
|
The episode terminates when any of the following happens:
|
||||||
|
|
||||||
|
1. The episode duration reaches 1000 timesteps.
|
||||||
|
2. Any of the state space values is no longer finite.
|
||||||
|
3. The y_coordinate of the tip of the second pole *is less than or equal* to 1. The maximum standing height of the system is 1.196 m when all the parts are perpendicularly vertical on top of each other).
|
||||||
|
|
||||||
|
### Arguments
|
||||||
|
|
||||||
|
No additional arguments are currently supported (in v2 and lower), but modifications can
|
||||||
|
be made to the XML file in the assets folder (or by changing the path to a modified XML
|
||||||
|
file in another folder).
|
||||||
|
|
||||||
|
```
|
||||||
|
env = gym.make('InvertedDoublePendulum-v2')
|
||||||
|
```
|
||||||
|
There is no v3 for InvertedPendulum, unlike the robot environments where a v3 and
|
||||||
|
beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||||
|
|
||||||
|
|
||||||
|
### Version History
|
||||||
|
|
||||||
|
* v4: all mujoco environments now use the mujoco bindings in mujoco>=2.1.3
|
||||||
|
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
||||||
|
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||||
|
* v1: max_time_steps raised to 1000 for robot based tasks (including inverted pendulum)
|
||||||
|
* v0: Initial versions release (1.0.0)
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
mujoco_env.MujocoEnv.__init__(self, "inverted_double_pendulum.xml", 5)
|
||||||
|
utils.EzPickle.__init__(self)
|
||||||
|
|
||||||
|
def step(self, action):
|
||||||
|
self.do_simulation(action, self.frame_skip)
|
||||||
|
ob = self._get_obs()
|
||||||
|
x, _, y = self.data.site_xpos[0]
|
||||||
|
dist_penalty = 0.01 * x**2 + (y - 2) ** 2
|
||||||
|
v1, v2 = self.data.qvel[1:3]
|
||||||
|
vel_penalty = 1e-3 * v1**2 + 5e-3 * v2**2
|
||||||
|
alive_bonus = 10
|
||||||
|
r = alive_bonus - dist_penalty - vel_penalty
|
||||||
|
done = bool(y <= 1)
|
||||||
|
return ob, r, done, {}
|
||||||
|
|
||||||
|
def _get_obs(self):
|
||||||
|
return np.concatenate(
|
||||||
|
[
|
||||||
|
self.data.qpos[:1], # cart x pos
|
||||||
|
np.sin(self.data.qpos[1:]), # link angles
|
||||||
|
np.cos(self.data.qpos[1:]),
|
||||||
|
np.clip(self.data.qvel, -10, 10),
|
||||||
|
np.clip(self.data.qfrc_constraint, -10, 10),
|
||||||
|
]
|
||||||
|
).ravel()
|
||||||
|
|
||||||
|
def reset_model(self):
|
||||||
|
self.set_state(
|
||||||
|
self.init_qpos
|
||||||
|
+ self.np_random.uniform(low=-0.1, high=0.1, size=self.model.nq),
|
||||||
|
self.init_qvel + self.np_random.standard_normal(self.model.nv) * 0.1,
|
||||||
|
)
|
||||||
|
return self._get_obs()
|
||||||
|
|
||||||
|
def viewer_setup(self):
|
||||||
|
v = self.viewer
|
||||||
|
v.cam.trackbodyid = 0
|
||||||
|
v.cam.distance = self.model.stat.extent * 0.5
|
||||||
|
v.cam.lookat[2] = 0.12250000000000005 # v.model.stat.center[2]
|
@@ -5,85 +5,11 @@ from gym.envs.mujoco import mujoco_env
|
|||||||
|
|
||||||
|
|
||||||
class InvertedPendulumEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
class InvertedPendulumEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
"""
|
|
||||||
### Description
|
|
||||||
|
|
||||||
This environment is the cartpole environment based on the work done by
|
|
||||||
Barto, Sutton, and Anderson in ["Neuronlike adaptive elements that can
|
|
||||||
solve difficult learning control problems"](https://ieeexplore.ieee.org/document/6313077),
|
|
||||||
just like in the classic environments but now powered by the Mujoco physics simulator -
|
|
||||||
allowing for more complex experiments (such as varying the effects of gravity).
|
|
||||||
This environment involves a cart that can be moved linearly, with a pole fixed on it
|
|
||||||
at one end and having another end free. The cart can be pushed left or right, and the
|
|
||||||
goal is to balance the pole on the top of the cart by applying forces on the cart.
|
|
||||||
|
|
||||||
### Action Space
|
|
||||||
The agent takes a 1-element vector for actions.
|
|
||||||
|
|
||||||
The action space is a continuous `(action)` in `[-3, 3]`, where `action` represents
|
|
||||||
the numerical force applied to the cart (with magnitude representing the amount of
|
|
||||||
force and sign representing the direction)
|
|
||||||
|
|
||||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
|
||||||
|-----|---------------------------|-------------|-------------|----------------------------------|-------|-----------|
|
|
||||||
| 0 | Force applied on the cart | -3 | 3 | slider | slide | Force (N) |
|
|
||||||
|
|
||||||
### Observation Space
|
|
||||||
|
|
||||||
The state space consists of positional values of different body parts of
|
|
||||||
the pendulum system, followed by the velocities of those individual parts (their derivatives)
|
|
||||||
with all the positions ordered before all the velocities.
|
|
||||||
|
|
||||||
The observation is a `ndarray` with shape `(4,)` where the elements correspond to the following:
|
|
||||||
|
|
||||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
|
||||||
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
|
||||||
| 0 | position of the cart along the linear surface | -Inf | Inf | slider | slide | position (m) |
|
|
||||||
| 1 | vertical angle of the pole on the cart | -Inf | Inf | hinge | hinge | angle (rad) |
|
|
||||||
| 2 | linear velocity of the cart | -Inf | Inf | slider | slide | velocity (m/s) |
|
|
||||||
| 3 | angular velocity of the pole on the cart | -Inf | Inf | hinge | hinge | angular velocity (rad/s) |
|
|
||||||
|
|
||||||
|
|
||||||
### Rewards
|
|
||||||
|
|
||||||
The goal is to make the inverted pendulum stand upright (within a certain angle limit)
|
|
||||||
as long as possible - as such a reward of +1 is awarded for each timestep that
|
|
||||||
the pole is upright.
|
|
||||||
|
|
||||||
### Starting State
|
|
||||||
All observations start in state
|
|
||||||
(0.0, 0.0, 0.0, 0.0) with a uniform noise in the range
|
|
||||||
of [-0.01, 0.01] added to the values for stochasticity.
|
|
||||||
|
|
||||||
### Episode Termination
|
|
||||||
The episode terminates when any of the following happens:
|
|
||||||
|
|
||||||
1. The episode duration reaches 1000 timesteps.
|
|
||||||
2. Any of the state space values is no longer finite.
|
|
||||||
3. The absolute value of the vertical angle between the pole and the cart is greater than 0.2 radians.
|
|
||||||
|
|
||||||
### Arguments
|
|
||||||
|
|
||||||
No additional arguments are currently supported.
|
|
||||||
|
|
||||||
```
|
|
||||||
env = gym.make('InvertedPendulum-v2')
|
|
||||||
```
|
|
||||||
There is no v3 for InvertedPendulum, unlike the robot environments where a
|
|
||||||
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
|
||||||
|
|
||||||
|
|
||||||
### Version History
|
|
||||||
|
|
||||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
|
||||||
* v1: max_time_steps raised to 1000 for robot based tasks (including inverted pendulum)
|
|
||||||
* v0: Initial versions release (1.0.0)
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
utils.EzPickle.__init__(self)
|
utils.EzPickle.__init__(self)
|
||||||
mujoco_env.MujocoEnv.__init__(self, "inverted_pendulum.xml", 2)
|
mujoco_env.MujocoEnv.__init__(
|
||||||
|
self, "inverted_pendulum.xml", 2, mujoco_bindings="mujoco_py"
|
||||||
|
)
|
||||||
|
|
||||||
def step(self, a):
|
def step(self, a):
|
||||||
reward = 1.0
|
reward = 1.0
|
||||||
|
116
gym/envs/mujoco/inverted_pendulum_v4.py
Normal file
116
gym/envs/mujoco/inverted_pendulum_v4.py
Normal file
@@ -0,0 +1,116 @@
|
|||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from gym import utils
|
||||||
|
from gym.envs.mujoco import mujoco_env
|
||||||
|
|
||||||
|
|
||||||
|
class InvertedPendulumEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
|
"""
|
||||||
|
### Description
|
||||||
|
|
||||||
|
This environment is the cartpole environment based on the work done by
|
||||||
|
Barto, Sutton, and Anderson in ["Neuronlike adaptive elements that can
|
||||||
|
solve difficult learning control problems"](https://ieeexplore.ieee.org/document/6313077),
|
||||||
|
just like in the classic environments but now powered by the Mujoco physics simulator -
|
||||||
|
allowing for more complex experiments (such as varying the effects of gravity).
|
||||||
|
This environment involves a cart that can be moved linearly, with a pole fixed on it
|
||||||
|
at one end and having another end free. The cart can be pushed left or right, and the
|
||||||
|
goal is to balance the pole on the top of the cart by applying forces on the cart.
|
||||||
|
|
||||||
|
### Action Space
|
||||||
|
The agent takes a 1-element vector for actions.
|
||||||
|
|
||||||
|
The action space is a continuous `(action)` in `[-3, 3]`, where `action` represents
|
||||||
|
the numerical force applied to the cart (with magnitude representing the amount of
|
||||||
|
force and sign representing the direction)
|
||||||
|
|
||||||
|
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||||
|
|-----|---------------------------|-------------|-------------|----------------------------------|-------|-----------|
|
||||||
|
| 0 | Force applied on the cart | -3 | 3 | slider | slide | Force (N) |
|
||||||
|
|
||||||
|
### Observation Space
|
||||||
|
|
||||||
|
The state space consists of positional values of different body parts of
|
||||||
|
the pendulum system, followed by the velocities of those individual parts (their derivatives)
|
||||||
|
with all the positions ordered before all the velocities.
|
||||||
|
|
||||||
|
The observation is a `ndarray` with shape `(4,)` where the elements correspond to the following:
|
||||||
|
|
||||||
|
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
||||||
|
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
||||||
|
| 0 | position of the cart along the linear surface | -Inf | Inf | slider | slide | position (m) |
|
||||||
|
| 1 | vertical angle of the pole on the cart | -Inf | Inf | hinge | hinge | angle (rad) |
|
||||||
|
| 2 | linear velocity of the cart | -Inf | Inf | slider | slide | velocity (m/s) |
|
||||||
|
| 3 | angular velocity of the pole on the cart | -Inf | Inf | hinge | hinge | angular velocity (rad/s) |
|
||||||
|
|
||||||
|
|
||||||
|
### Rewards
|
||||||
|
|
||||||
|
The goal is to make the inverted pendulum stand upright (within a certain angle limit)
|
||||||
|
as long as possible - as such a reward of +1 is awarded for each timestep that
|
||||||
|
the pole is upright.
|
||||||
|
|
||||||
|
### Starting State
|
||||||
|
All observations start in state
|
||||||
|
(0.0, 0.0, 0.0, 0.0) with a uniform noise in the range
|
||||||
|
of [-0.01, 0.01] added to the values for stochasticity.
|
||||||
|
|
||||||
|
### Episode Termination
|
||||||
|
The episode terminates when any of the following happens:
|
||||||
|
|
||||||
|
1. The episode duration reaches 1000 timesteps.
|
||||||
|
2. Any of the state space values is no longer finite.
|
||||||
|
3. The absolute value of the vertical angle between the pole and the cart is greater than 0.2 radians.
|
||||||
|
|
||||||
|
### Arguments
|
||||||
|
|
||||||
|
No additional arguments are currently supported (in v2 and lower),
|
||||||
|
but modifications can be made to the XML file in the assets folder
|
||||||
|
(or by changing the path to a modified XML file in another folder).
|
||||||
|
|
||||||
|
```
|
||||||
|
env = gym.make('InvertedPendulum-v2')
|
||||||
|
```
|
||||||
|
There is no v3 for InvertedPendulum, unlike the robot environments where a
|
||||||
|
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||||
|
|
||||||
|
|
||||||
|
### Version History
|
||||||
|
|
||||||
|
* v4: all mujoco environments now use the mujoco bindings in mujoco>=2.1.3
|
||||||
|
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
||||||
|
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||||
|
* v1: max_time_steps raised to 1000 for robot based tasks (including inverted pendulum)
|
||||||
|
* v0: Initial versions release (1.0.0)
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
utils.EzPickle.__init__(self)
|
||||||
|
mujoco_env.MujocoEnv.__init__(self, "inverted_pendulum.xml", 2)
|
||||||
|
|
||||||
|
def step(self, a):
|
||||||
|
reward = 1.0
|
||||||
|
self.do_simulation(a, self.frame_skip)
|
||||||
|
ob = self._get_obs()
|
||||||
|
notdone = np.isfinite(ob).all() and (np.abs(ob[1]) <= 0.2)
|
||||||
|
done = not notdone
|
||||||
|
return ob, reward, done, {}
|
||||||
|
|
||||||
|
def reset_model(self):
|
||||||
|
qpos = self.init_qpos + self.np_random.uniform(
|
||||||
|
size=self.model.nq, low=-0.01, high=0.01
|
||||||
|
)
|
||||||
|
qvel = self.init_qvel + self.np_random.uniform(
|
||||||
|
size=self.model.nv, low=-0.01, high=0.01
|
||||||
|
)
|
||||||
|
self.set_state(qpos, qvel)
|
||||||
|
return self._get_obs()
|
||||||
|
|
||||||
|
def _get_obs(self):
|
||||||
|
return np.concatenate([self.data.qpos, self.data.qvel]).ravel()
|
||||||
|
|
||||||
|
def viewer_setup(self):
|
||||||
|
v = self.viewer
|
||||||
|
v.cam.trackbodyid = 0
|
||||||
|
v.cam.distance = self.model.stat.extent
|
@@ -1,4 +1,3 @@
|
|||||||
import os
|
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
from os import path
|
from os import path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
@@ -6,18 +5,9 @@ from typing import Optional
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
import gym
|
import gym
|
||||||
from gym import error, spaces
|
from gym import error, logger, spaces
|
||||||
|
|
||||||
try:
|
DEFAULT_SIZE = 480
|
||||||
import mujoco_py
|
|
||||||
except ImportError as e:
|
|
||||||
raise error.DependencyNotInstalled(
|
|
||||||
"{}. (HINT: you need to install mujoco_py, and also perform the setup instructions here: https://github.com/openai/mujoco-py/.)".format(
|
|
||||||
e
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
DEFAULT_SIZE = 500
|
|
||||||
|
|
||||||
|
|
||||||
def convert_observation_to_space(observation):
|
def convert_observation_to_space(observation):
|
||||||
@@ -43,28 +33,64 @@ def convert_observation_to_space(observation):
|
|||||||
class MujocoEnv(gym.Env):
|
class MujocoEnv(gym.Env):
|
||||||
"""Superclass for all MuJoCo environments."""
|
"""Superclass for all MuJoCo environments."""
|
||||||
|
|
||||||
def __init__(self, model_path, frame_skip):
|
def __init__(self, model_path, frame_skip, mujoco_bindings="mujoco"):
|
||||||
|
|
||||||
if model_path.startswith("/"):
|
if model_path.startswith("/"):
|
||||||
fullpath = model_path
|
fullpath = model_path
|
||||||
else:
|
else:
|
||||||
fullpath = os.path.join(os.path.dirname(__file__), "assets", model_path)
|
fullpath = path.join(path.dirname(__file__), "assets", model_path)
|
||||||
if not path.exists(fullpath):
|
if not path.exists(fullpath):
|
||||||
raise OSError(f"File {fullpath} does not exist")
|
raise OSError(f"File {fullpath} does not exist")
|
||||||
self.frame_skip = frame_skip
|
|
||||||
self.model = mujoco_py.load_model_from_path(fullpath)
|
if mujoco_bindings == "mujoco_py":
|
||||||
self.sim = mujoco_py.MjSim(self.model)
|
logger.warn(
|
||||||
self.data = self.sim.data
|
"This version of the mujoco environments depends "
|
||||||
self.viewer = None
|
"on the mujoco-py bindings, which are no longer maintained "
|
||||||
|
"and may stop working. Please upgrade to the v4 versions of "
|
||||||
|
"the environments (which depend on the mujoco python bindings instead), unless "
|
||||||
|
"you are trying to precisely replicate previous works)."
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
import mujoco_py
|
||||||
|
|
||||||
|
self._mujoco_bindings = mujoco_py
|
||||||
|
except ImportError as e:
|
||||||
|
raise error.DependencyNotInstalled(
|
||||||
|
"{}. (HINT: you need to install mujoco_py, and also perform the setup instructions here: https://github.com/openai/mujoco-py/.)".format(
|
||||||
|
e
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
self.model = self._mujoco_bindings.load_model_from_path(fullpath)
|
||||||
|
self.sim = self._mujoco_bindings.MjSim(self.model)
|
||||||
|
self.data = self.sim.data
|
||||||
|
|
||||||
|
elif mujoco_bindings == "mujoco":
|
||||||
|
try:
|
||||||
|
import mujoco
|
||||||
|
|
||||||
|
self._mujoco_bindings = mujoco
|
||||||
|
|
||||||
|
except ImportError as e:
|
||||||
|
raise error.DependencyNotInstalled(
|
||||||
|
f"{e}. (HINT: you need to install mujoco)"
|
||||||
|
)
|
||||||
|
self.model = self._mujoco_bindings.MjModel.from_xml_path(fullpath)
|
||||||
|
self.data = self._mujoco_bindings.MjData(self.model)
|
||||||
|
|
||||||
|
self.init_qpos = self.data.qpos.ravel().copy()
|
||||||
|
self.init_qvel = self.data.qvel.ravel().copy()
|
||||||
self._viewers = {}
|
self._viewers = {}
|
||||||
|
|
||||||
|
self.frame_skip = frame_skip
|
||||||
|
|
||||||
|
self.viewer = None
|
||||||
|
|
||||||
self.metadata = {
|
self.metadata = {
|
||||||
"render_modes": ["human", "rgb_array", "depth_array"],
|
"render_modes": ["human", "rgb_array", "depth_array"],
|
||||||
"render_fps": int(np.round(1.0 / self.dt)),
|
"render_fps": int(np.round(1.0 / self.dt)),
|
||||||
}
|
}
|
||||||
|
|
||||||
self.init_qpos = self.sim.data.qpos.ravel().copy()
|
|
||||||
self.init_qvel = self.sim.data.qvel.ravel().copy()
|
|
||||||
|
|
||||||
self._set_action_space()
|
self._set_action_space()
|
||||||
|
|
||||||
action = self.action_space.sample()
|
action = self.action_space.sample()
|
||||||
@@ -111,7 +137,12 @@ class MujocoEnv(gym.Env):
|
|||||||
options: Optional[dict] = None,
|
options: Optional[dict] = None,
|
||||||
):
|
):
|
||||||
super().reset(seed=seed)
|
super().reset(seed=seed)
|
||||||
self.sim.reset()
|
|
||||||
|
if self._mujoco_bindings.__name__ == "mujoco_py":
|
||||||
|
self.sim.reset()
|
||||||
|
else:
|
||||||
|
self._mujoco_bindings.mj_resetData(self.model, self.data)
|
||||||
|
|
||||||
ob = self.reset_model()
|
ob = self.reset_model()
|
||||||
if not return_info:
|
if not return_info:
|
||||||
return ob
|
return ob
|
||||||
@@ -120,12 +151,19 @@ class MujocoEnv(gym.Env):
|
|||||||
|
|
||||||
def set_state(self, qpos, qvel):
|
def set_state(self, qpos, qvel):
|
||||||
assert qpos.shape == (self.model.nq,) and qvel.shape == (self.model.nv,)
|
assert qpos.shape == (self.model.nq,) and qvel.shape == (self.model.nv,)
|
||||||
old_state = self.sim.get_state()
|
if self._mujoco_bindings.__name__ == "mujoco_py":
|
||||||
new_state = mujoco_py.MjSimState(
|
state = self.sim.get_state()
|
||||||
old_state.time, qpos, qvel, old_state.act, old_state.udd_state
|
state = self._mujoco_bindings.MjSimState(
|
||||||
)
|
state.time, qpos, qvel, state.act, state.udd_state
|
||||||
self.sim.set_state(new_state)
|
)
|
||||||
self.sim.forward()
|
self.sim.set_state(state)
|
||||||
|
self.sim.forward()
|
||||||
|
else:
|
||||||
|
self.data.qpos[:] = np.copy(qpos)
|
||||||
|
self.data.qvel[:] = np.copy(qvel)
|
||||||
|
if self.model.na == 0:
|
||||||
|
self.data.act[:] = None
|
||||||
|
self._mujoco_bindings.mj_forward(self.model, self.data)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def dt(self):
|
def dt(self):
|
||||||
@@ -135,9 +173,21 @@ class MujocoEnv(gym.Env):
|
|||||||
if np.array(ctrl).shape != self.action_space.shape:
|
if np.array(ctrl).shape != self.action_space.shape:
|
||||||
raise ValueError("Action dimension mismatch")
|
raise ValueError("Action dimension mismatch")
|
||||||
|
|
||||||
self.sim.data.ctrl[:] = ctrl
|
if self._mujoco_bindings.__name__ == "mujoco_py":
|
||||||
|
self.sim.data.ctrl[:] = ctrl
|
||||||
|
else:
|
||||||
|
self.data.ctrl[:] = ctrl
|
||||||
for _ in range(n_frames):
|
for _ in range(n_frames):
|
||||||
self.sim.step()
|
if self._mujoco_bindings.__name__ == "mujoco_py":
|
||||||
|
self.sim.step()
|
||||||
|
else:
|
||||||
|
self._mujoco_bindings.mj_step(self.model, self.data)
|
||||||
|
|
||||||
|
# As of MuJoCo 2.0, force-related quantities like cacc are not computed
|
||||||
|
# unless there's a force sensor in the model.
|
||||||
|
# See https://github.com/openai/gym/issues/1541
|
||||||
|
if self._mujoco_bindings.__name__ != "mujoco_py":
|
||||||
|
self._mujoco_bindings.mj_rnePostConstraint(self.model, self.data)
|
||||||
|
|
||||||
def render(
|
def render(
|
||||||
self,
|
self,
|
||||||
@@ -158,19 +208,25 @@ class MujocoEnv(gym.Env):
|
|||||||
if no_camera_specified:
|
if no_camera_specified:
|
||||||
camera_name = "track"
|
camera_name = "track"
|
||||||
|
|
||||||
if camera_id is None and camera_name in self.model._camera_name2id:
|
if camera_id is None:
|
||||||
camera_id = self.model.camera_name2id(camera_name)
|
if self._mujoco_bindings.__name__ == "mujoco_py":
|
||||||
|
if camera_name in self.model._camera_name2id:
|
||||||
|
camera_id = self.model.camera_name2id(camera_name)
|
||||||
|
else:
|
||||||
|
camera_id = self._mujoco_bindings.mj_name2id(
|
||||||
|
self.model,
|
||||||
|
self._mujoco_bindings.mjtObj.mjOBJ_CAMERA,
|
||||||
|
camera_name,
|
||||||
|
)
|
||||||
|
|
||||||
self._get_viewer(mode).render(width, height, camera_id=camera_id)
|
self._get_viewer(mode).render(width, height, camera_id=camera_id)
|
||||||
|
|
||||||
if mode == "rgb_array":
|
if mode == "rgb_array":
|
||||||
# window size used for old mujoco-py:
|
|
||||||
data = self._get_viewer(mode).read_pixels(width, height, depth=False)
|
data = self._get_viewer(mode).read_pixels(width, height, depth=False)
|
||||||
# original image is upside-down, so flip it
|
# original image is upside-down, so flip it
|
||||||
return data[::-1, :, :]
|
return data[::-1, :, :]
|
||||||
elif mode == "depth_array":
|
elif mode == "depth_array":
|
||||||
self._get_viewer(mode).render(width, height)
|
self._get_viewer(mode).render(width, height)
|
||||||
# window size used for old mujoco-py:
|
|
||||||
# Extract depth part of the read_pixels() tuple
|
# Extract depth part of the read_pixels() tuple
|
||||||
data = self._get_viewer(mode).read_pixels(width, height, depth=True)[1]
|
data = self._get_viewer(mode).read_pixels(width, height, depth=True)[1]
|
||||||
# original image is upside-down, so flip it
|
# original image is upside-down, so flip it
|
||||||
@@ -180,24 +236,40 @@ class MujocoEnv(gym.Env):
|
|||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
if self.viewer is not None:
|
if self.viewer is not None:
|
||||||
# self.viewer.finish()
|
|
||||||
self.viewer = None
|
self.viewer = None
|
||||||
self._viewers = {}
|
self._viewers = {}
|
||||||
|
|
||||||
def _get_viewer(self, mode):
|
def _get_viewer(self, mode, width=DEFAULT_SIZE, height=DEFAULT_SIZE):
|
||||||
self.viewer = self._viewers.get(mode)
|
self.viewer = self._viewers.get(mode)
|
||||||
if self.viewer is None:
|
if self.viewer is None:
|
||||||
if mode == "human":
|
if mode == "human":
|
||||||
self.viewer = mujoco_py.MjViewer(self.sim)
|
if self._mujoco_bindings.__name__ == "mujoco_py":
|
||||||
|
self.viewer = self._mujoco_bindings.MjViewer(self.sim)
|
||||||
|
else:
|
||||||
|
from gym.envs.mujoco.mujoco_rendering import Viewer
|
||||||
|
|
||||||
|
self.viewer = Viewer(self.model, self.data)
|
||||||
elif mode == "rgb_array" or mode == "depth_array":
|
elif mode == "rgb_array" or mode == "depth_array":
|
||||||
self.viewer = mujoco_py.MjRenderContextOffscreen(self.sim, -1)
|
if self._mujoco_bindings.__name__ == "mujoco_py":
|
||||||
|
self.viewer = self._mujoco_bindings.MjRenderContextOffscreen(
|
||||||
|
self.sim, -1
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
from gym.envs.mujoco.mujoco_rendering import RenderContextOffscreen
|
||||||
|
|
||||||
|
self.viewer = RenderContextOffscreen(
|
||||||
|
width, height, self.model, self.data
|
||||||
|
)
|
||||||
|
|
||||||
self.viewer_setup()
|
self.viewer_setup()
|
||||||
self._viewers[mode] = self.viewer
|
self._viewers[mode] = self.viewer
|
||||||
return self.viewer
|
return self.viewer
|
||||||
|
|
||||||
def get_body_com(self, body_name):
|
def get_body_com(self, body_name):
|
||||||
return self.data.get_body_xpos(body_name)
|
if self._mujoco_bindings.__name__ == "mujoco_py":
|
||||||
|
return self.data.get_body_xpos(body_name)
|
||||||
|
else:
|
||||||
|
return self.data.body(body_name).xpos
|
||||||
|
|
||||||
def state_vector(self):
|
def state_vector(self):
|
||||||
return np.concatenate([self.sim.data.qpos.flat, self.sim.data.qvel.flat])
|
return np.concatenate([self.data.qpos.flat, self.data.qvel.flat])
|
||||||
|
554
gym/envs/mujoco/mujoco_rendering.py
Normal file
554
gym/envs/mujoco/mujoco_rendering.py
Normal file
@@ -0,0 +1,554 @@
|
|||||||
|
import collections
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from threading import Lock
|
||||||
|
|
||||||
|
import glfw
|
||||||
|
import imageio
|
||||||
|
import mujoco
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
def _import_egl(width, height):
|
||||||
|
from mujoco.egl import GLContext
|
||||||
|
|
||||||
|
return GLContext(width, height)
|
||||||
|
|
||||||
|
|
||||||
|
def _import_glfw(width, height):
|
||||||
|
from mujoco.glfw import GLContext
|
||||||
|
|
||||||
|
return GLContext(width, height)
|
||||||
|
|
||||||
|
|
||||||
|
def _import_osmesa(width, height):
|
||||||
|
from mujoco.osmesa import GLContext
|
||||||
|
|
||||||
|
return GLContext(width, height)
|
||||||
|
|
||||||
|
|
||||||
|
_ALL_RENDERERS = collections.OrderedDict(
|
||||||
|
[
|
||||||
|
("glfw", _import_glfw),
|
||||||
|
("egl", _import_egl),
|
||||||
|
("osmesa", _import_osmesa),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class RenderContext:
|
||||||
|
"""Render context superclass for offscreen and window rendering."""
|
||||||
|
|
||||||
|
def __init__(self, model, data, offscreen=True):
|
||||||
|
|
||||||
|
self.model = model
|
||||||
|
self.data = data
|
||||||
|
self.offscreen = offscreen
|
||||||
|
max_geom = 1000
|
||||||
|
|
||||||
|
mujoco.mj_forward(self.model, self.data)
|
||||||
|
|
||||||
|
self.scn = mujoco.MjvScene(self.model, max_geom)
|
||||||
|
self.cam = mujoco.MjvCamera()
|
||||||
|
self.vopt = mujoco.MjvOption()
|
||||||
|
self.pert = mujoco.MjvPerturb()
|
||||||
|
self.con = mujoco.MjrContext(self.model, mujoco.mjtFontScale.mjFONTSCALE_150)
|
||||||
|
|
||||||
|
self._markers = []
|
||||||
|
self._overlays = {}
|
||||||
|
|
||||||
|
self._init_camera()
|
||||||
|
self._set_mujoco_buffers()
|
||||||
|
|
||||||
|
def _set_mujoco_buffers(self):
|
||||||
|
if self.offscreen:
|
||||||
|
mujoco.mjr_setBuffer(mujoco.mjtFramebuffer.mjFB_OFFSCREEN, self.con)
|
||||||
|
if self.con.currentBuffer != mujoco.mjtFramebuffer.mjFB_OFFSCREEN:
|
||||||
|
raise RuntimeError("Offscreen rendering not supported")
|
||||||
|
else:
|
||||||
|
mujoco.mjr_setBuffer(mujoco.mjtFramebuffer.mjFB_WINDOW, self.con)
|
||||||
|
if self.con.currentBuffer != mujoco.mjtFramebuffer.mjFB_WINDOW:
|
||||||
|
raise RuntimeError("Window rendering not supported")
|
||||||
|
|
||||||
|
def update_offscreen_size(self, width, height):
|
||||||
|
if width != self.con.offWidth or height != self.con.offHeight:
|
||||||
|
self.model.vis.global_.offwidth = width
|
||||||
|
self.model.vis.global_.offheight = height
|
||||||
|
self.con.free()
|
||||||
|
self._set_mujoco_buffers()
|
||||||
|
|
||||||
|
def render(self, width, height, camera_id=None, segmentation=False):
|
||||||
|
rect = mujoco.MjrRect(left=0, bottom=0, width=width, height=height)
|
||||||
|
|
||||||
|
# Sometimes buffers are too small.
|
||||||
|
if width > self.con.offWidth or height > self.con.offHeight:
|
||||||
|
new_width = max(width, self.model.vis.global_.offwidth)
|
||||||
|
new_height = max(height, self.model.vis.global_.offheight)
|
||||||
|
self.update_offscreen_size(new_width, new_height)
|
||||||
|
|
||||||
|
if camera_id is not None:
|
||||||
|
if camera_id == -1:
|
||||||
|
self.cam.type = mujoco.mjtCamera.mjCAMERA_FREE
|
||||||
|
else:
|
||||||
|
self.cam.type = mujoco.mjtCamera.mjCAMERA_FIXED
|
||||||
|
self.cam.fixedcamid = camera_id
|
||||||
|
|
||||||
|
mujoco.mjv_updateScene(
|
||||||
|
self.model,
|
||||||
|
self.data,
|
||||||
|
self.vopt,
|
||||||
|
self.pert,
|
||||||
|
self.cam,
|
||||||
|
mujoco.mjtCatBit.mjCAT_ALL,
|
||||||
|
self.scn,
|
||||||
|
)
|
||||||
|
|
||||||
|
if segmentation:
|
||||||
|
self.scn.flags[mujoco.mjtRndFlag.mjRND_SEGMENT] = 1
|
||||||
|
self.scn.flags[mujoco.mjtRndFlag.mjRND_IDCOLOR] = 1
|
||||||
|
|
||||||
|
for marker_params in self._markers:
|
||||||
|
self._add_marker_to_scene(marker_params)
|
||||||
|
|
||||||
|
mujoco.mjr_render(rect, self.scn, self.con)
|
||||||
|
|
||||||
|
for gridpos, (text1, text2) in self._overlays.items():
|
||||||
|
mujoco.mjr_overlay(
|
||||||
|
mujoco.mjtFontScale.mjFONTSCALE_150,
|
||||||
|
gridpos,
|
||||||
|
rect,
|
||||||
|
text1.encode(),
|
||||||
|
text2.encode(),
|
||||||
|
self.con,
|
||||||
|
)
|
||||||
|
|
||||||
|
if segmentation:
|
||||||
|
self.scn.flags[mujoco.mjtRndFlag.mjRND_SEGMENT] = 0
|
||||||
|
self.scn.flags[mujoco.mjtRndFlag.mjRND_IDCOLOR] = 0
|
||||||
|
|
||||||
|
def read_pixels(self, width, height, depth=True, segmentation=False):
|
||||||
|
rect = mujoco.MjrRect(left=0, bottom=0, width=width, height=height)
|
||||||
|
|
||||||
|
rgb_arr = np.zeros(3 * rect.width * rect.height, dtype=np.uint8)
|
||||||
|
depth_arr = np.zeros(rect.width * rect.height, dtype=np.float32)
|
||||||
|
|
||||||
|
mujoco.mjr_readPixels(rgb_arr, depth_arr, rect, self.con)
|
||||||
|
rgb_img = rgb_arr.reshape(rect.height, rect.width, 3)
|
||||||
|
|
||||||
|
ret_img = rgb_img
|
||||||
|
if segmentation:
|
||||||
|
seg_img = (
|
||||||
|
rgb_img[:, :, 0]
|
||||||
|
+ rgb_img[:, :, 1] * (2**8)
|
||||||
|
+ rgb_img[:, :, 2] * (2**16)
|
||||||
|
)
|
||||||
|
seg_img[seg_img >= (self.scn.ngeom + 1)] = 0
|
||||||
|
seg_ids = np.full((self.scn.ngeom + 1, 2), fill_value=-1, dtype=np.int32)
|
||||||
|
|
||||||
|
for i in range(self.scn.ngeom):
|
||||||
|
geom = self.scn.geoms[i]
|
||||||
|
if geom.segid != -1:
|
||||||
|
seg_ids[geom.segid + 1, 0] = geom.objtype
|
||||||
|
seg_ids[geom.segid + 1, 1] = geom.objid
|
||||||
|
ret_img = seg_ids[seg_img]
|
||||||
|
|
||||||
|
if depth:
|
||||||
|
depth_img = depth_arr.reshape(rect.height, rect.width)
|
||||||
|
return (ret_img, depth_img)
|
||||||
|
else:
|
||||||
|
return ret_img
|
||||||
|
|
||||||
|
def _init_camera(self):
|
||||||
|
self.cam.type = mujoco.mjtCamera.mjCAMERA_FREE
|
||||||
|
self.cam.fixedcamid = -1
|
||||||
|
for i in range(3):
|
||||||
|
self.cam.lookat[i] = np.median(self.data.geom_xpos[:, i])
|
||||||
|
self.cam.distance = self.model.stat.extent
|
||||||
|
|
||||||
|
def add_overlay(self, gridpos: int, text1: str, text2: str):
|
||||||
|
"""Overlays text on the scene."""
|
||||||
|
if gridpos not in self._overlays:
|
||||||
|
self._overlays[gridpos] = ["", ""]
|
||||||
|
self._overlays[gridpos][0] += text1 + "\n"
|
||||||
|
self._overlays[gridpos][1] += text2 + "\n"
|
||||||
|
|
||||||
|
def add_marker(self, **marker_params):
|
||||||
|
self._markers.append(marker_params)
|
||||||
|
|
||||||
|
def _add_marker_to_scene(self, marker):
|
||||||
|
if self.scn.ngeom >= self.scn.maxgeom:
|
||||||
|
raise RuntimeError("Ran out of geoms. maxgeom: %d" % self.scn.maxgeom)
|
||||||
|
|
||||||
|
g = self.scn.geoms[self.scn.ngeom]
|
||||||
|
# default values.
|
||||||
|
g.dataid = -1
|
||||||
|
g.objtype = mujoco.mjtObj.mjOBJ_UNKNOWN
|
||||||
|
g.objid = -1
|
||||||
|
g.category = mujoco.mjtCatBit.mjCAT_DECOR
|
||||||
|
g.texid = -1
|
||||||
|
g.texuniform = 0
|
||||||
|
g.texrepeat[0] = 1
|
||||||
|
g.texrepeat[1] = 1
|
||||||
|
g.emission = 0
|
||||||
|
g.specular = 0.5
|
||||||
|
g.shininess = 0.5
|
||||||
|
g.reflectance = 0
|
||||||
|
g.type = mujoco.mjtGeom.mjGEOM_BOX
|
||||||
|
g.size[:] = np.ones(3) * 0.1
|
||||||
|
g.mat[:] = np.eye(3)
|
||||||
|
g.rgba[:] = np.ones(4)
|
||||||
|
|
||||||
|
for key, value in marker.items():
|
||||||
|
if isinstance(value, (int, float, mujoco._enums.mjtGeom)):
|
||||||
|
setattr(g, key, value)
|
||||||
|
elif isinstance(value, (tuple, list, np.ndarray)):
|
||||||
|
attr = getattr(g, key)
|
||||||
|
attr[:] = np.asarray(value).reshape(attr.shape)
|
||||||
|
elif isinstance(value, str):
|
||||||
|
assert key == "label", "Only label is a string in mjtGeom."
|
||||||
|
if value is None:
|
||||||
|
g.label[0] = 0
|
||||||
|
else:
|
||||||
|
g.label = value
|
||||||
|
elif hasattr(g, key):
|
||||||
|
raise ValueError(
|
||||||
|
"mjtGeom has attr {} but type {} is invalid".format(
|
||||||
|
key, type(value)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
raise ValueError("mjtGeom doesn't have field %s" % key)
|
||||||
|
|
||||||
|
self.scn.ngeom += 1
|
||||||
|
|
||||||
|
|
||||||
|
class RenderContextOffscreen(RenderContext):
    """Offscreen rendering class with opengl context."""

    def __init__(self, width, height, model, data):
        # Select and activate an OpenGL context before the base class
        # allocates the MuJoCo rendering resources that need it.
        self._get_opengl_backend(width, height)
        self.opengl_context.make_current()
        super().__init__(model, data, offscreen=True)

    def _get_opengl_backend(self, width, height):
        """Instantiate an OpenGL backend, honouring the MUJOCO_GL env var."""
        backend = os.environ.get("MUJOCO_GL")
        if backend is None:
            # No explicit choice: probe the known backends in registry order
            # and keep the first one that can be constructed.
            for candidate, make_context in _ALL_RENDERERS.items():
                try:
                    self.opengl_context = make_context(width, height)
                except:  # noqa:E722
                    continue
                backend = candidate
                break
            if backend is None:
                raise RuntimeError(
                    "No OpenGL backend could be imported. Attempting to create a "
                    "rendering context will result in a RuntimeError."
                )
        else:
            try:
                self.opengl_context = _ALL_RENDERERS[backend](width, height)
            except KeyError:
                raise RuntimeError(
                    "Environment variable {} must be one of {!r}: got {!r}.".format(
                        "MUJOCO_GL", _ALL_RENDERERS.keys(), backend
                    )
                )
class Viewer(RenderContext):
    """Class for window rendering in all MuJoCo environments."""

    def __init__(self, model, data):
        """Create a glfw window, install input callbacks and set up rendering."""
        # GUI state toggled by keyboard/mouse callbacks.
        self._gui_lock = Lock()  # guards camera/scene mutation vs. rendering
        self._button_left_pressed = False
        self._button_right_pressed = False
        self._last_mouse_x = 0
        self._last_mouse_y = 0
        self._paused = False
        self._transparent = False
        self._contacts = False
        self._render_every_frame = True
        self._image_idx = 0  # running index for screenshot filenames
        self._image_path = "/tmp/frame_%07d.png"
        self._time_per_render = 1 / 60.0  # EMA of wall-clock time per frame
        self._run_speed = 1.0  # simulation speed multiplier ([S]/[F] keys)
        self._loop_count = 0
        self._advance_by_one_step = False
        self._hide_menu = False

        # glfw init
        glfw.init()
        # Window is half the primary monitor's resolution.
        width, height = glfw.get_video_mode(glfw.get_primary_monitor()).size
        self.window = glfw.create_window(width // 2, height // 2, "mujoco", None, None)
        glfw.make_context_current(self.window)
        glfw.swap_interval(1)  # vsync

        # Framebuffer may be larger than the window on HiDPI displays; _scale
        # converts window (cursor) coordinates to framebuffer pixels.
        framebuffer_width, framebuffer_height = glfw.get_framebuffer_size(self.window)
        window_width, _ = glfw.get_window_size(self.window)
        self._scale = framebuffer_width * 1.0 / window_width

        # set callbacks
        glfw.set_cursor_pos_callback(self.window, self._cursor_pos_callback)
        glfw.set_mouse_button_callback(self.window, self._mouse_button_callback)
        glfw.set_scroll_callback(self.window, self._scroll_callback)
        glfw.set_key_callback(self.window, self._key_callback)

        # get viewport
        self.viewport = mujoco.MjrRect(0, 0, framebuffer_width, framebuffer_height)

        super().__init__(model, data, offscreen=False)

    def _key_callback(self, key, action):
        """Handle keyboard shortcuts; acts only on key release.

        NOTE(review): glfw key callbacks are invoked with
        (window, key, scancode, action, mods); this signature takes only
        (key, action) after ``self`` — verify against the glfw binding in use.
        """
        if action != glfw.RELEASE:
            return
        # Switch cameras
        elif key == glfw.KEY_TAB:
            self.cam.fixedcamid += 1
            self.cam.type = mujoco.mjtCamera.mjCAMERA_FIXED
            # Past the last fixed camera: wrap around to the free camera.
            if self.cam.fixedcamid >= self.model.ncam:
                self.cam.fixedcamid = -1
                self.cam.type = mujoco.mjtCamera.mjCAMERA_FREE
        # Pause simulation
        elif key == glfw.KEY_SPACE and self._paused is not None:
            self._paused = not self._paused
        # Advances simulation by one step.
        elif key == glfw.KEY_RIGHT and self._paused is not None:
            self._advance_by_one_step = True
            self._paused = True
        # Slows down simulation
        elif key == glfw.KEY_S:
            self._run_speed /= 2.0
        # Speeds up simulation
        elif key == glfw.KEY_F:
            self._run_speed *= 2.0
        # Turn off / turn on rendering every frame.
        elif key == glfw.KEY_D:
            self._render_every_frame = not self._render_every_frame
        # Capture screenshot
        elif key == glfw.KEY_T:
            # Allocate (height, width, 3) uint8 buffer for the framebuffer read.
            img = np.zeros(
                (
                    glfw.get_framebuffer_size(self.window)[1],
                    glfw.get_framebuffer_size(self.window)[0],
                    3,
                ),
                dtype=np.uint8,
            )
            mujoco.mjr_readPixels(img, None, self.viewport, self.con)
            # OpenGL rows are bottom-up; flip vertically before saving.
            imageio.imwrite(self._image_path % self._image_idx, np.flipud(img))
            self._image_idx += 1
        # Display contact forces
        elif key == glfw.KEY_C:
            self._contacts = not self._contacts
            self.vopt.flags[mujoco.mjtVisFlag.mjVIS_CONTACTPOINT] = self._contacts
            self.vopt.flags[mujoco.mjtVisFlag.mjVIS_CONTACTFORCE] = self._contacts
        # Display coordinate frames
        elif key == glfw.KEY_E:
            self.vopt.frame = 1 - self.vopt.frame
        # Hide overlay menu
        elif key == glfw.KEY_H:
            self._hide_menu = not self._hide_menu
        # Make transparent
        elif key == glfw.KEY_R:
            self._transparent = not self._transparent
            # Scale geom alpha in place; toggling twice restores (up to float
            # rounding) the original values.
            if self._transparent:
                self.model.geom_rgba[:, 3] /= 5.0
            else:
                self.model.geom_rgba[:, 3] *= 5.0
        # Geom group visibility
        elif key in (glfw.KEY_0, glfw.KEY_1, glfw.KEY_2, glfw.KEY_3, glfw.KEY_4):
            self.vopt.geomgroup[key - glfw.KEY_0] ^= 1
        # Quit
        if key == glfw.KEY_ESCAPE:
            print("Pressed ESC")
            print("Quitting.")
            glfw.terminate()
            sys.exit(0)

    def _cursor_pos_callback(self, window, xpos, ypos):
        """Rotate/move/zoom the camera while a mouse button is held."""
        if not (self._button_left_pressed or self._button_right_pressed):
            return

        mod_shift = (
            glfw.get_key(window, glfw.KEY_LEFT_SHIFT) == glfw.PRESS
            or glfw.get_key(window, glfw.KEY_RIGHT_SHIFT) == glfw.PRESS
        )
        # Right button moves the camera, left button rotates; shift switches
        # between the horizontal and vertical variant of each action.
        if self._button_right_pressed:
            action = (
                mujoco.mjtMouse.mjMOUSE_MOVE_H
                if mod_shift
                else mujoco.mjtMouse.mjMOUSE_MOVE_V
            )
        elif self._button_left_pressed:
            action = (
                mujoco.mjtMouse.mjMOUSE_ROTATE_H
                if mod_shift
                else mujoco.mjtMouse.mjMOUSE_ROTATE_V
            )
        else:
            action = mujoco.mjtMouse.mjMOUSE_ZOOM

        # Cursor deltas in framebuffer pixels (see _scale in __init__).
        dx = int(self._scale * xpos) - self._last_mouse_x
        dy = int(self._scale * ypos) - self._last_mouse_y
        width, height = glfw.get_framebuffer_size(window)

        with self._gui_lock:
            # Displacements are normalized by the framebuffer height only.
            mujoco.mjv_moveCamera(
                self.model, action, dx / height, dy / height, self.scn, self.cam
            )

        self._last_mouse_x = int(self._scale * xpos)
        self._last_mouse_y = int(self._scale * ypos)

    def _mouse_button_callback(self, window, button, act, mods):
        """Track button state and the press position for later drag deltas."""
        self._button_left_pressed = (
            glfw.get_mouse_button(window, glfw.MOUSE_BUTTON_LEFT) == glfw.PRESS
        )
        self._button_right_pressed = (
            glfw.get_mouse_button(window, glfw.MOUSE_BUTTON_RIGHT) == glfw.PRESS
        )

        x, y = glfw.get_cursor_pos(window)
        self._last_mouse_x = int(self._scale * x)
        self._last_mouse_y = int(self._scale * y)

    def _scroll_callback(self, window, x_offset, y_offset):
        """Zoom the camera with the scroll wheel (vertical offset only)."""
        with self._gui_lock:
            mujoco.mjv_moveCamera(
                self.model,
                mujoco.mjtMouse.mjMOUSE_ZOOM,
                0,
                -0.05 * y_offset,
                self.scn,
                self.cam,
            )

    def _create_overlay(self):
        """Populate the on-screen help/status overlay entries for this frame."""
        topleft = mujoco.mjtGridPos.mjGRID_TOPLEFT
        bottomleft = mujoco.mjtGridPos.mjGRID_BOTTOMLEFT

        if self._render_every_frame:
            self.add_overlay(topleft, "", "")
        else:
            self.add_overlay(
                topleft,
                "Run speed = %.3f x real time" % self._run_speed,
                "[S]lower, [F]aster",
            )
        self.add_overlay(
            topleft, "Ren[d]er every frame", "On" if self._render_every_frame else "Off"
        )
        self.add_overlay(
            topleft,
            "Switch camera (#cams = %d)" % (self.model.ncam + 1),
            "[Tab] (camera ID = %d)" % self.cam.fixedcamid,
        )
        self.add_overlay(topleft, "[C]ontact forces", "On" if self._contacts else "Off")
        self.add_overlay(topleft, "T[r]ansparent", "On" if self._transparent else "Off")
        if self._paused is not None:
            if not self._paused:
                self.add_overlay(topleft, "Stop", "[Space]")
            else:
                self.add_overlay(topleft, "Start", "[Space]")
                self.add_overlay(
                    topleft, "Advance simulation by one step", "[right arrow]"
                )
        self.add_overlay(
            topleft, "Referenc[e] frames", "On" if self.vopt.frame == 1 else "Off"
        )
        self.add_overlay(topleft, "[H]ide Menu", "")
        if self._image_idx > 0:
            fname = self._image_path % (self._image_idx - 1)
            self.add_overlay(topleft, "Cap[t]ure frame", "Saved as %s" % fname)
        else:
            self.add_overlay(topleft, "Cap[t]ure frame", "")
        self.add_overlay(topleft, "Toggle geomgroup visibility", "0-4")

        self.add_overlay(bottomleft, "FPS", "%d%s" % (1 / self._time_per_render, ""))
        self.add_overlay(
            bottomleft, "Solver iterations", str(self.data.solver_iter + 1)
        )
        self.add_overlay(
            bottomleft, "Step", str(round(self.data.time / self.model.opt.timestep))
        )
        self.add_overlay(bottomleft, "timestep", "%.5f" % self.model.opt.timestep)

    def render(self):
        """Render one wall-clock-paced batch of frames to the window.

        Repeats the inner ``update`` enough times to keep simulated time in
        step with real time at the current run speed; exits the process when
        the window is closed.
        """
        # mjv_updateScene, mjr_render, mjr_overlay
        def update():
            # fill overlay items
            self._create_overlay()

            render_start = time.time()
            if self.window is None:
                return
            elif glfw.window_should_close(self.window):
                glfw.terminate()
                sys.exit(0)
            self.viewport.width, self.viewport.height = glfw.get_framebuffer_size(
                self.window
            )
            with self._gui_lock:
                # update scene
                mujoco.mjv_updateScene(
                    self.model,
                    self.data,
                    self.vopt,
                    mujoco.MjvPerturb(),
                    self.cam,
                    mujoco.mjtCatBit.mjCAT_ALL.value,
                    self.scn,
                )
                # marker items
                for marker in self._markers:
                    self._add_marker_to_scene(marker)
                # render
                mujoco.mjr_render(self.viewport, self.scn, self.con)
                # overlay items
                if not self._hide_menu:
                    for gridpos, [t1, t2] in self._overlays.items():
                        mujoco.mjr_overlay(
                            mujoco.mjtFontScale.mjFONTSCALE_150,
                            gridpos,
                            self.viewport,
                            t1,
                            t2,
                            self.con,
                        )
                glfw.swap_buffers(self.window)
            glfw.poll_events()
            # Exponential moving average of the per-frame render time, used
            # both for the FPS overlay and for real-time pacing below.
            self._time_per_render = 0.9 * self._time_per_render + 0.1 * (
                time.time() - render_start
            )

        # clear overlay
        self._overlays.clear()

        if self._paused:
            # Keep redrawing (so the UI stays responsive) until unpaused or
            # single-stepped with the right-arrow key.
            while self._paused:
                update()
                if self._advance_by_one_step:
                    self._advance_by_one_step = False
                    break
        else:
            # Number of frames needed to keep sim time matched to real time.
            self._loop_count += self.model.opt.timestep / (
                self._time_per_render * self._run_speed
            )
            if self._render_every_frame:
                self._loop_count = 1
            while self._loop_count > 0:
                update()
                self._loop_count -= 1

        # clear markers
        self._markers[:] = []

    def close(self):
        """Destroy the glfw context and terminate the process."""
        glfw.terminate()
        sys.exit(0)
@@ -5,131 +5,11 @@ from gym.envs.mujoco import mujoco_env
|
|||||||
|
|
||||||
|
|
||||||
class PusherEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
class PusherEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
"""
|
|
||||||
### Description
|
|
||||||
"Pusher" is a multi-jointed robot arm which is very similar to that of a human.
|
|
||||||
The goal is to move a target cylinder (called *object*) to a goal position using the robot's end effector (called *fingertip*).
|
|
||||||
The robot consists of shoulder, elbow, forearm, and wrist joints.
|
|
||||||
|
|
||||||
### Action Space
|
|
||||||
The action space is a `Box(-2, 2, (7,), float32)`. An action `(a, b)` represents the torques applied at the hinge joints.
|
|
||||||
|
|
||||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
|
||||||
|-----|--------------------------------------------------------------------|-------------|-------------|----------------------------------|-------|--------------|
|
|
||||||
| 0 | Rotation of the panning the shoulder | -2 | 2 | r_shoulder_pan_joint | hinge | torque (N m) |
|
|
||||||
| 1 | Rotation of the shoulder lifting joint | -2 | 2 | r_shoulder_lift_joint | hinge | torque (N m) |
|
|
||||||
| 2 | Rotation of the shoulder rolling joint | -2 | 2 | r_upper_arm_roll_joint | hinge | torque (N m) |
|
|
||||||
| 3 | Rotation of hinge joint that flexed the elbow | -2 | 2 | r_elbow_flex_joint | hinge | torque (N m) |
|
|
||||||
| 4 | Rotation of hinge that rolls the forearm | -2 | 2 | r_forearm_roll_joint | hinge | torque (N m) |
|
|
||||||
| 5 | Rotation of flexing the wrist | -2 | 2 | r_wrist_flex_joint | hinge | torque (N m) |
|
|
||||||
| 6 | Rotation of rolling the wrist | -2 | 2 | r_wrist_roll_joint | hinge | torque (N m) |
|
|
||||||
|
|
||||||
### Observation Space
|
|
||||||
|
|
||||||
Observations consist of
|
|
||||||
|
|
||||||
- Angle of rotational joints on the pusher
|
|
||||||
- Angular velocities of rotational joints on the pusher
|
|
||||||
- The coordinates of the fingertip of the pusher
|
|
||||||
- The coordinates of the object to be moved
|
|
||||||
- The coordinates of the goal position
|
|
||||||
|
|
||||||
The observation is a `ndarray` with shape `(23,)` where the elements correspond to the table below.
|
|
||||||
An analogy can be drawn to a human arm in order to help understand the state space, with the words flex and roll meaning the
|
|
||||||
same as human joints.
|
|
||||||
|
|
||||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
|
||||||
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
|
||||||
| 0 | Rotation of the panning the shoulder | -Inf | Inf | r_shoulder_pan_joint | hinge | angle (rad) |
|
|
||||||
| 1 | Rotation of the shoulder lifting joint | -Inf | Inf | r_shoulder_lift_joint | hinge | angle (rad) |
|
|
||||||
| 2 | Rotation of the shoulder rolling joint | -Inf | Inf | r_upper_arm_roll_joint | hinge | angle (rad) |
|
|
||||||
| 3 | Rotation of hinge joint that flexed the elbow | -Inf | Inf | r_elbow_flex_joint | hinge | angle (rad) |
|
|
||||||
| 4 | Rotation of hinge that rolls the forearm | -Inf | Inf | r_forearm_roll_joint | hinge | angle (rad) |
|
|
||||||
| 5 | Rotation of flexing the wrist | -Inf | Inf | r_wrist_flex_joint | hinge | angle (rad) |
|
|
||||||
| 6 | Rotation of rolling the wrist | -Inf | Inf | r_wrist_roll_joint | hinge | angle (rad) |
|
|
||||||
| 7 | Rotational velocity of the panning the shoulder | -Inf | Inf | r_shoulder_pan_joint | hinge | angular velocity (rad/s) |
|
|
||||||
| 8 | Rotational velocity of the shoulder lifting joint | -Inf | Inf | r_shoulder_lift_joint | hinge | angular velocity (rad/s) |
|
|
||||||
| 9 | Rotational velocity of the shoulder rolling joint | -Inf | Inf | r_upper_arm_roll_joint | hinge | angular velocity (rad/s) |
|
|
||||||
| 10 | Rotational velocity of hinge joint that flexed the elbow | -Inf | Inf | r_elbow_flex_joint | hinge | angular velocity (rad/s) |
|
|
||||||
| 11 | Rotational velocity of hinge that rolls the forearm | -Inf | Inf | r_forearm_roll_joint | hinge | angular velocity (rad/s) |
|
|
||||||
| 12 | Rotational velocity of flexing the wrist | -Inf | Inf | r_wrist_flex_joint | hinge | angular velocity (rad/s) |
|
|
||||||
| 13 | Rotational velocity of rolling the wrist | -Inf | Inf | r_wrist_roll_joint | hinge | angular velocity (rad/s) |
|
|
||||||
| 14 | x-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) |
|
|
||||||
| 15 | y-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) |
|
|
||||||
| 16 | z-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) |
|
|
||||||
| 17 | x-coordinate of the object to be moved | -Inf | Inf | object (obj_slidex) | slide | position (m) |
|
|
||||||
| 18 | y-coordinate of the object to be moved | -Inf | Inf | object (obj_slidey) | slide | position (m) |
|
|
||||||
| 19 | z-coordinate of the object to be moved | -Inf | Inf | object | cylinder | position (m) |
|
|
||||||
| 20 | x-coordinate of the goal position of the object | -Inf | Inf | goal (goal_slidex) | slide | position (m) |
|
|
||||||
| 21 | y-coordinate of the goal position of the object | -Inf | Inf | goal (goal_slidey) | slide | position (m) |
|
|
||||||
| 22 | z-coordinate of the goal position of the object | -Inf | Inf | goal | sphere | position (m) |
|
|
||||||
|
|
||||||
|
|
||||||
### Rewards
|
|
||||||
The reward consists of two parts:
|
|
||||||
- *reward_near *: This reward is a measure of how far the *fingertip*
|
|
||||||
of the pusher (the unattached end) is from the object, with a more negative
|
|
||||||
value assigned for when the pusher's *fingertip* is further away from the
|
|
||||||
target. It is calculated as the negative vector norm of (position of
|
|
||||||
the fingertip - position of target), or *-norm("fingertip" - "target")*.
|
|
||||||
- *reward_dist *: This reward is a measure of how far the object is from
|
|
||||||
the target goal position, with a more negative value assigned for object is
|
|
||||||
further away from the target. It is calculated as the negative vector norm of
|
|
||||||
(position of the object - position of goal), or *-norm("object" - "target")*.
|
|
||||||
- *reward_control*: A negative reward for penalising the pusher if
|
|
||||||
it takes actions that are too large. It is measured as the negative squared
|
|
||||||
Euclidean norm of the action, i.e. as *- sum(action<sup>2</sup>)*.
|
|
||||||
|
|
||||||
The total reward returned is ***reward*** *=* *reward_dist + 0.1 * reward_ctrl + 0.5 * reward_near*
|
|
||||||
|
|
||||||
Unlike other environments, Pusher does not allow you to specify weights for the individual reward terms.
|
|
||||||
However, `info` does contain the keys *reward_dist* and *reward_ctrl*. Thus, if you'd like to weight the terms,
|
|
||||||
you should create a wrapper that computes the weighted reward from `info`.
|
|
||||||
|
|
||||||
|
|
||||||
### Starting State
|
|
||||||
All pusher (not including object and goal) states start in
|
|
||||||
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0). A uniform noise in the range
|
|
||||||
[-0.005, 0.005] is added to the velocity attributes only. The velocities of
|
|
||||||
the object and goal are permanently set to 0. The object's x-position is selected uniformly
|
|
||||||
between [-0.3, 0] while the y-position is selected uniformly between [-0.2, 0.2], and this
|
|
||||||
process is repeated until the vector norm between the object's (x,y) position and origin is not greater
|
|
||||||
than 0.17. The goal always has the same position of (0.45, -0.05, -0.323).
|
|
||||||
|
|
||||||
The default framerate is 5 with each frame lasting for 0.01, giving rise to a *dt = 5 * 0.01 = 0.05*
|
|
||||||
|
|
||||||
### Episode Termination
|
|
||||||
|
|
||||||
The episode terminates when any of the following happens:
|
|
||||||
|
|
||||||
1. The episode duration reaches a 100 timesteps.
|
|
||||||
2. Any of the state space values is no longer finite.
|
|
||||||
|
|
||||||
### Arguments
|
|
||||||
|
|
||||||
No additional arguments are currently supported (in v2 and lower),
|
|
||||||
but modifications can be made to the XML file in the assets folder
|
|
||||||
(or by changing the path to a modified XML file in another folder)..
|
|
||||||
|
|
||||||
```
|
|
||||||
env = gym.make('Pusher-v2')
|
|
||||||
```
|
|
||||||
|
|
||||||
There is no v3 for Pusher, unlike the robot environments where a v3 and
|
|
||||||
beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
|
||||||
|
|
||||||
|
|
||||||
### Version History
|
|
||||||
|
|
||||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
|
||||||
* v1: max_time_steps raised to 1000 for robot based tasks (not including reacher, which has a max_time_steps of 50). Added reward_threshold to environments.
|
|
||||||
* v0: Initial versions release (1.0.0)
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
utils.EzPickle.__init__(self)
|
utils.EzPickle.__init__(self)
|
||||||
mujoco_env.MujocoEnv.__init__(self, "pusher.xml", 5)
|
mujoco_env.MujocoEnv.__init__(
|
||||||
|
self, "pusher.xml", 5, mujoco_bindings="mujoco_py"
|
||||||
|
)
|
||||||
|
|
||||||
def step(self, a):
|
def step(self, a):
|
||||||
vec_1 = self.get_body_com("object") - self.get_body_com("tips_arm")
|
vec_1 = self.get_body_com("object") - self.get_body_com("tips_arm")
|
||||||
|
185
gym/envs/mujoco/pusher_v4.py
Normal file
185
gym/envs/mujoco/pusher_v4.py
Normal file
@@ -0,0 +1,185 @@
|
|||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from gym import utils
|
||||||
|
from gym.envs.mujoco import mujoco_env
|
||||||
|
|
||||||
|
|
||||||
|
class PusherEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
|
"""
|
||||||
|
### Description
|
||||||
|
"Pusher" is a multi-jointed robot arm which is very similar to that of a human.
|
||||||
|
The goal is to move a target cylinder (called *object*) to a goal position using the robot's end effector (called *fingertip*).
|
||||||
|
The robot consists of shoulder, elbow, forearm, and wrist joints.
|
||||||
|
|
||||||
|
### Action Space
|
||||||
|
The action space is a `Box(-2, 2, (7,), float32)`. An action `(a, b)` represents the torques applied at the hinge joints.
|
||||||
|
|
||||||
|
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||||
|
|-----|--------------------------------------------------------------------|-------------|-------------|----------------------------------|-------|--------------|
|
||||||
|
| 0 | Rotation of the panning the shoulder | -2 | 2 | r_shoulder_pan_joint | hinge | torque (N m) |
|
||||||
|
| 1 | Rotation of the shoulder lifting joint | -2 | 2 | r_shoulder_lift_joint | hinge | torque (N m) |
|
||||||
|
| 2 | Rotation of the shoulder rolling joint | -2 | 2 | r_upper_arm_roll_joint | hinge | torque (N m) |
|
||||||
|
| 3 | Rotation of hinge joint that flexed the elbow | -2 | 2 | r_elbow_flex_joint | hinge | torque (N m) |
|
||||||
|
| 4 | Rotation of hinge that rolls the forearm | -2 | 2 | r_forearm_roll_joint | hinge | torque (N m) |
|
||||||
|
| 5 | Rotation of flexing the wrist | -2 | 2 | r_wrist_flex_joint | hinge | torque (N m) |
|
||||||
|
| 6 | Rotation of rolling the wrist | -2 | 2 | r_wrist_roll_joint | hinge | torque (N m) |
|
||||||
|
|
||||||
|
### Observation Space
|
||||||
|
|
||||||
|
Observations consist of
|
||||||
|
|
||||||
|
- Angle of rotational joints on the pusher
|
||||||
|
- Anglular velocities of rotational joints on the pusher
|
||||||
|
- The coordinates of the fingertip of the pusher
|
||||||
|
- The coordinates of the object to be moved
|
||||||
|
- The coordinates of the goal position
|
||||||
|
|
||||||
|
The observation is a `ndarray` with shape `(23,)` where the elements correspond to the table below.
|
||||||
|
An analogy can be drawn to a human arm in order to help understand the state space, with the words flex and roll meaning the
|
||||||
|
same as human joints.
|
||||||
|
|
||||||
|
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
||||||
|
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
||||||
|
| 0 | Rotation of the panning the shoulder | -Inf | Inf | r_shoulder_pan_joint | hinge | angle (rad) |
|
||||||
|
| 1 | Rotation of the shoulder lifting joint | -Inf | Inf | r_shoulder_lift_joint | hinge | angle (rad) |
|
||||||
|
| 2 | Rotation of the shoulder rolling joint | -Inf | Inf | r_upper_arm_roll_joint | hinge | angle (rad) |
|
||||||
|
| 3 | Rotation of hinge joint that flexed the elbow | -Inf | Inf | r_elbow_flex_joint | hinge | angle (rad) |
|
||||||
|
| 4 | Rotation of hinge that rolls the forearm | -Inf | Inf | r_forearm_roll_joint | hinge | angle (rad) |
|
||||||
|
| 5 | Rotation of flexing the wrist | -Inf | Inf | r_wrist_flex_joint | hinge | angle (rad) |
|
||||||
|
| 6 | Rotation of rolling the wrist | -Inf | Inf | r_wrist_roll_joint | hinge | angle (rad) |
|
||||||
|
| 7 | Rotational velocity of the panning the shoulder | -Inf | Inf | r_shoulder_pan_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 8 | Rotational velocity of the shoulder lifting joint | -Inf | Inf | r_shoulder_lift_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 9 | Rotational velocity of the shoulder rolling joint | -Inf | Inf | r_upper_arm_roll_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 10 | Rotational velocity of hinge joint that flexed the elbow | -Inf | Inf | r_elbow_flex_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 11 | Rotational velocity of hinge that rolls the forearm | -Inf | Inf | r_forearm_roll_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 12 | Rotational velocity of flexing the wrist | -Inf | Inf | r_wrist_flex_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 13 | Rotational velocity of rolling the wrist | -Inf | Inf | r_wrist_roll_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 14 | x-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) |
|
||||||
|
| 15 | y-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) |
|
||||||
|
| 16 | z-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) |
|
||||||
|
| 17 | x-coordinate of the object to be moved | -Inf | Inf | object (obj_slidex) | slide | position (m) |
|
||||||
|
| 18 | y-coordinate of the object to be moved | -Inf | Inf | object (obj_slidey) | slide | position (m) |
|
||||||
|
| 19 | z-coordinate of the object to be moved | -Inf | Inf | object | cylinder | position (m) |
|
||||||
|
| 20 | x-coordinate of the goal position of the object | -Inf | Inf | goal (goal_slidex) | slide | position (m) |
|
||||||
|
| 21 | y-coordinate of the goal position of the object | -Inf | Inf | goal (goal_slidey) | slide | position (m) |
|
||||||
|
| 22 | z-coordinate of the goal position of the object | -Inf | Inf | goal | sphere | position (m) |
|
||||||
|
|
||||||
|
|
||||||
|
### Rewards
|
||||||
|
The reward consists of two parts:
|
||||||
|
- *reward_near *: This reward is a measure of how far the *fingertip*
|
||||||
|
of the pusher (the unattached end) is from the object, with a more negative
|
||||||
|
value assigned for when the pusher's *fingertip* is further away from the
|
||||||
|
target. It is calculated as the negative vector norm of (position of
|
||||||
|
the fingertip - position of target), or *-norm("fingertip" - "target")*.
|
||||||
|
- *reward_dist *: This reward is a measure of how far the object is from
|
||||||
|
the target goal position, with a more negative value assigned for object is
|
||||||
|
further away from the target. It is calculated as the negative vector norm of
|
||||||
|
(position of the object - position of goal), or *-norm("object" - "target")*.
|
||||||
|
- *reward_control*: A negative reward for penalising the pusher if
|
||||||
|
it takes actions that are too large. It is measured as the negative squared
|
||||||
|
Euclidean norm of the action, i.e. as *- sum(action<sup>2</sup>)*.
|
||||||
|
|
||||||
|
The total reward returned is ***reward*** *=* *reward_dist + 0.1 * reward_ctrl + 0.5 * reward_near*
|
||||||
|
|
||||||
|
Unlike other environments, Pusher does not allow you to specify weights for the individual reward terms.
|
||||||
|
However, `info` does contain the keys *reward_dist* and *reward_ctrl*. Thus, if you'd like to weight the terms,
|
||||||
|
you should create a wrapper that computes the weighted reward from `info`.
|
||||||
|
|
||||||
|
|
||||||
|
### Starting State
|
||||||
|
All pusher (not including object and goal) states start in
|
||||||
|
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0). A uniform noise in the range
|
||||||
|
[-0.005, 0.005] is added to the velocity attributes only. The velocities of
|
||||||
|
the object and goal are permanently set to 0. The object's x-position is selected uniformly
|
||||||
|
between [-0.3, 0] while the y-position is selected uniformly between [-0.2, 0.2], and this
|
||||||
|
process is repeated until the vector norm between the object's (x,y) position and origin is not greater
|
||||||
|
than 0.17. The goal always have the same position of (0.45, -0.05, -0.323).
|
||||||
|
|
||||||
|
The default framerate is 5 with each frame lasting for 0.01, giving rise to a *dt = 5 * 0.01 = 0.05*
|
||||||
|
|
||||||
|
### Episode Termination
|
||||||
|
|
||||||
|
The episode terminates when any of the following happens:
|
||||||
|
|
||||||
|
1. The episode duration reaches a 100 timesteps.
|
||||||
|
2. Any of the state space values is no longer finite.
|
||||||
|
|
||||||
|
### Arguments
|
||||||
|
|
||||||
|
No additional arguments are currently supported (in v2 and lower),
|
||||||
|
but modifications can be made to the XML file in the assets folder
|
||||||
|
(or by changing the path to a modified XML file in another folder)..
|
||||||
|
|
||||||
|
```
|
||||||
|
env = gym.make('Pusher-v2')
|
||||||
|
```
|
||||||
|
|
||||||
|
There is no v3 for Pusher, unlike the robot environments where a v3 and
|
||||||
|
beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||||
|
|
||||||
|
|
||||||
|
### Version History
|
||||||
|
|
||||||
|
* v4: all mujoco environments now use the mujoco binidings in mujoco>=2.1.3
|
||||||
|
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||||
|
* v1: max_time_steps raised to 1000 for robot based tasks (not including reacher, which has a max_time_steps of 50). Added reward_threshold to environments.
|
||||||
|
* v0: Initial versions release (1.0.0)
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
utils.EzPickle.__init__(self)
|
||||||
|
mujoco_env.MujocoEnv.__init__(self, "pusher.xml", 5)
|
||||||
|
|
||||||
|
def step(self, a):
|
||||||
|
vec_1 = self.get_body_com("object") - self.get_body_com("tips_arm")
|
||||||
|
vec_2 = self.get_body_com("object") - self.get_body_com("goal")
|
||||||
|
|
||||||
|
reward_near = -np.linalg.norm(vec_1)
|
||||||
|
reward_dist = -np.linalg.norm(vec_2)
|
||||||
|
reward_ctrl = -np.square(a).sum()
|
||||||
|
reward = reward_dist + 0.1 * reward_ctrl + 0.5 * reward_near
|
||||||
|
|
||||||
|
self.do_simulation(a, self.frame_skip)
|
||||||
|
ob = self._get_obs()
|
||||||
|
done = False
|
||||||
|
return ob, reward, done, dict(reward_dist=reward_dist, reward_ctrl=reward_ctrl)
|
||||||
|
|
||||||
|
    def viewer_setup(self):
        """Configure the render camera: free camera (no body tracking), distance 4."""
        # trackbodyid=-1 detaches the camera from any particular body.
        self.viewer.cam.trackbodyid = -1
        self.viewer.cam.distance = 4.0
|
||||||
|
|
||||||
|
def reset_model(self):
|
||||||
|
qpos = self.init_qpos
|
||||||
|
|
||||||
|
self.goal_pos = np.asarray([0, 0])
|
||||||
|
while True:
|
||||||
|
self.cylinder_pos = np.concatenate(
|
||||||
|
[
|
||||||
|
self.np_random.uniform(low=-0.3, high=0, size=1),
|
||||||
|
self.np_random.uniform(low=-0.2, high=0.2, size=1),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
if np.linalg.norm(self.cylinder_pos - self.goal_pos) > 0.17:
|
||||||
|
break
|
||||||
|
|
||||||
|
qpos[-4:-2] = self.cylinder_pos
|
||||||
|
qpos[-2:] = self.goal_pos
|
||||||
|
qvel = self.init_qvel + self.np_random.uniform(
|
||||||
|
low=-0.005, high=0.005, size=self.model.nv
|
||||||
|
)
|
||||||
|
qvel[-4:] = 0
|
||||||
|
self.set_state(qpos, qvel)
|
||||||
|
return self._get_obs()
|
||||||
|
|
||||||
|
def _get_obs(self):
|
||||||
|
return np.concatenate(
|
||||||
|
[
|
||||||
|
self.data.qpos.flat[:7],
|
||||||
|
self.data.qvel.flat[:7],
|
||||||
|
self.get_body_com("tips_arm"),
|
||||||
|
self.get_body_com("object"),
|
||||||
|
self.get_body_com("goal"),
|
||||||
|
]
|
||||||
|
)
|
@@ -5,121 +5,11 @@ from gym.envs.mujoco import mujoco_env
|
|||||||
|
|
||||||
|
|
||||||
class ReacherEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
class ReacherEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
"""
|
|
||||||
### Description
|
|
||||||
"Reacher" is a two-jointed robot arm. The goal is to move the robot's end effector (called *fingertip*) close to a
|
|
||||||
target that is spawned at a random position.
|
|
||||||
|
|
||||||
### Action Space
|
|
||||||
The action space is a `Box(-1, 1, (2,), float32)`. An action `(a, b)` represents the torques applied at the hinge joints.
|
|
||||||
|
|
||||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
|
||||||
|-----|---------------------------------------------------------------------------------|-------------|-------------|--------------------------|-------|------|
|
|
||||||
| 0 | Torque applied at the first hinge (connecting the link to the point of fixture) | -1 | 1 | joint0 | hinge | torque (N m) |
|
|
||||||
| 1 | Torque applied at the second hinge (connecting the two links) | -1 | 1 | joint1 | hinge | torque (N m) |
|
|
||||||
|
|
||||||
### Observation Space
|
|
||||||
|
|
||||||
Observations consist of
|
|
||||||
|
|
||||||
- The cosine of the angles of the two arms
|
|
||||||
- The sine of the angles of the two arms
|
|
||||||
- The coordinates of the target
|
|
||||||
- The angular velocities of the arms
|
|
||||||
- The vector between the target and the reacher's fingertip (3 dimensional with the last element being 0)
|
|
||||||
|
|
||||||
The observation is a `ndarray` with shape `(11,)` where the elements correspond to the following:
|
|
||||||
|
|
||||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
|
||||||
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
|
||||||
| 0 | cosine of the angle of the first arm | -Inf | Inf | cos(joint0) | hinge | unitless |
|
|
||||||
| 1 | cosine of the angle of the second arm | -Inf | Inf | cos(joint1) | hinge | unitless |
|
|
||||||
| 2 | sine of the angle of the first arm | -Inf | Inf | cos(joint0) | hinge | unitless |
|
|
||||||
| 3 | sine of the angle of the second arm | -Inf | Inf | cos(joint1) | hinge | unitless |
|
|
||||||
| 4 | x-coorddinate of the target | -Inf | Inf | target_x | slide | position (m) |
|
|
||||||
| 5 | y-coorddinate of the target | -Inf | Inf | target_y | slide | position (m) |
|
|
||||||
| 6 | angular velocity of the first arm | -Inf | Inf | joint0 | hinge | angular velocity (rad/s) |
|
|
||||||
| 7 | angular velocity of the second arm | -Inf | Inf | joint1 | hinge | angular velocity (rad/s) |
|
|
||||||
| 8 | x-value of position_fingertip - position_target | -Inf | Inf | NA | slide | position (m) |
|
|
||||||
| 9 | y-value of position_fingertip - position_target | -Inf | Inf | NA | slide | position (m) |
|
|
||||||
| 10 | z-value of position_fingertip - position_target (0 since reacher is 2d and z is same for both) | -Inf | Inf | NA | slide | position (m) |
|
|
||||||
|
|
||||||
|
|
||||||
Most Gym environments just return the positions and velocity of the
|
|
||||||
joints in the `.xml` file as the state of the environment. However, in
|
|
||||||
reacher the state is created by combining only certain elements of the
|
|
||||||
position and velocity, and performing some function transformations on them.
|
|
||||||
If one is to read the `.xml` for reacher then they will find 4 joints:
|
|
||||||
|
|
||||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|
|
||||||
|-----|-----------------------------|----------|----------|----------------------------------|-------|--------------------|
|
|
||||||
| 0 | angle of the first arm | -Inf | Inf | joint0 | hinge | angle (rad) |
|
|
||||||
| 1 | angle of the second arm | -Inf | Inf | joint1 | hinge | angle (rad) |
|
|
||||||
| 2 | x-coordinate of the target | -Inf | Inf | target_x | slide | position (m) |
|
|
||||||
| 3 | y-coordinate of the target | -Inf | Inf | target_y | slide | position (m) |
|
|
||||||
|
|
||||||
|
|
||||||
### Rewards
|
|
||||||
The reward consists of two parts:
|
|
||||||
- *reward_distance*: This reward is a measure of how far the *fingertip*
|
|
||||||
of the reacher (the unattached end) is from the target, with a more negative
|
|
||||||
value assigned for when the reacher's *fingertip* is further away from the
|
|
||||||
target. It is calculated as the negative vector norm of (position of
|
|
||||||
the fingertip - position of target), or *-norm("fingertip" - "target")*.
|
|
||||||
- *reward_control*: A negative reward for penalising the walker if
|
|
||||||
it takes actions that are too large. It is measured as the negative squared
|
|
||||||
Euclidean norm of the action, i.e. as *- sum(action<sup>2</sup>)*.
|
|
||||||
|
|
||||||
The total reward returned is ***reward*** *=* *reward_distance + reward_control*
|
|
||||||
|
|
||||||
Unlike other environments, Reacher does not allow you to specify weights for the individual reward terms.
|
|
||||||
However, `info` does contain the keys *reward_dist* and *reward_ctrl*. Thus, if you'd like to weight the terms,
|
|
||||||
you should create a wrapper that computes the weighted reward from `info`.
|
|
||||||
|
|
||||||
|
|
||||||
### Starting State
|
|
||||||
All observations start in state
|
|
||||||
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
|
|
||||||
with a noise added for stochasticity. A uniform noise in the range
|
|
||||||
[-0.1, 0.1] is added to the positional attributes, while the target position
|
|
||||||
is selected uniformly at random in a disk of radius 0.2 around the origin.
|
|
||||||
Independent, uniform noise in the
|
|
||||||
range of [-0.005, 0.005] is added to the velocities, and the last
|
|
||||||
element ("fingertip" - "target") is calculated at the end once everything
|
|
||||||
is set. The default setting has a framerate of 2 and a *dt = 2 * 0.01 = 0.02*
|
|
||||||
|
|
||||||
### Episode Termination
|
|
||||||
|
|
||||||
The episode terminates when any of the following happens:
|
|
||||||
|
|
||||||
1. The episode duration reaches a 50 timesteps (with a new random target popping up if the reacher's fingertip reaches it before 50 timesteps)
|
|
||||||
2. Any of the state space values is no longer finite.
|
|
||||||
|
|
||||||
### Arguments
|
|
||||||
|
|
||||||
No additional arguments are currently supported (in v2 and lower),
|
|
||||||
but modifications can be made to the XML file in the assets folder
|
|
||||||
(or by changing the path to a modified XML file in another folder)..
|
|
||||||
|
|
||||||
```
|
|
||||||
env = gym.make('Reacher-v2')
|
|
||||||
```
|
|
||||||
|
|
||||||
There is no v3 for Reacher, unlike the robot environments where a v3 and
|
|
||||||
beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
|
||||||
|
|
||||||
|
|
||||||
### Version History
|
|
||||||
|
|
||||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
|
||||||
* v1: max_time_steps raised to 1000 for robot based tasks (not including reacher, which has a max_time_steps of 50). Added reward_threshold to environments.
|
|
||||||
* v0: Initial versions release (1.0.0)
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
utils.EzPickle.__init__(self)
|
utils.EzPickle.__init__(self)
|
||||||
mujoco_env.MujocoEnv.__init__(self, "reacher.xml", 2)
|
mujoco_env.MujocoEnv.__init__(
|
||||||
|
self, "reacher.xml", 2, mujoco_bindings="mujoco_py"
|
||||||
|
)
|
||||||
|
|
||||||
def step(self, a):
|
def step(self, a):
|
||||||
vec = self.get_body_com("fingertip") - self.get_body_com("target")
|
vec = self.get_body_com("fingertip") - self.get_body_com("target")
|
||||||
|
165
gym/envs/mujoco/reacher_v4.py
Normal file
165
gym/envs/mujoco/reacher_v4.py
Normal file
@@ -0,0 +1,165 @@
|
|||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from gym import utils
|
||||||
|
from gym.envs.mujoco import mujoco_env
|
||||||
|
|
||||||
|
|
||||||
|
class ReacherEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
|
"""
|
||||||
|
### Description
|
||||||
|
"Reacher" is a two-jointed robot arm. The goal is to move the robot's end effector (called *fingertip*) close to a
|
||||||
|
target that is spawned at a random position.
|
||||||
|
|
||||||
|
### Action Space
|
||||||
|
The action space is a `Box(-1, 1, (2,), float32)`. An action `(a, b)` represents the torques applied at the hinge joints.
|
||||||
|
|
||||||
|
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||||
|
|-----|---------------------------------------------------------------------------------|-------------|-------------|--------------------------|-------|------|
|
||||||
|
| 0 | Torque applied at the first hinge (connecting the link to the point of fixture) | -1 | 1 | joint0 | hinge | torque (N m) |
|
||||||
|
| 1 | Torque applied at the second hinge (connecting the two links) | -1 | 1 | joint1 | hinge | torque (N m) |
|
||||||
|
|
||||||
|
### Observation Space
|
||||||
|
|
||||||
|
Observations consist of
|
||||||
|
|
||||||
|
- The cosine of the angles of the two arms
|
||||||
|
- The sine of the angles of the two arms
|
||||||
|
- The coordinates of the target
|
||||||
|
- The angular velocities of the arms
|
||||||
|
- The vector between the target and the reacher's fingertip (3 dimensional with the last element being 0)
|
||||||
|
|
||||||
|
The observation is a `ndarray` with shape `(11,)` where the elements correspond to the following:
|
||||||
|
|
||||||
|
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
||||||
|
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
||||||
|
| 0 | cosine of the angle of the first arm | -Inf | Inf | cos(joint0) | hinge | unitless |
|
||||||
|
| 1 | cosine of the angle of the second arm | -Inf | Inf | cos(joint1) | hinge | unitless |
|
||||||
|
| 2 | sine of the angle of the first arm | -Inf | Inf | cos(joint0) | hinge | unitless |
|
||||||
|
| 3 | sine of the angle of the second arm | -Inf | Inf | cos(joint1) | hinge | unitless |
|
||||||
|
    | 4   | x-coordinate of the target | -Inf | Inf | target_x | slide | position (m) |
|
||||||
|
    | 5   | y-coordinate of the target | -Inf | Inf | target_y | slide | position (m) |
|
||||||
|
| 6 | angular velocity of the first arm | -Inf | Inf | joint0 | hinge | angular velocity (rad/s) |
|
||||||
|
| 7 | angular velocity of the second arm | -Inf | Inf | joint1 | hinge | angular velocity (rad/s) |
|
||||||
|
| 8 | x-value of position_fingertip - position_target | -Inf | Inf | NA | slide | position (m) |
|
||||||
|
| 9 | y-value of position_fingertip - position_target | -Inf | Inf | NA | slide | position (m) |
|
||||||
|
| 10 | z-value of position_fingertip - position_target (0 since reacher is 2d and z is same for both) | -Inf | Inf | NA | slide | position (m) |
|
||||||
|
|
||||||
|
|
||||||
|
Most Gym environments just return the positions and velocity of the
|
||||||
|
joints in the `.xml` file as the state of the environment. However, in
|
||||||
|
reacher the state is created by combining only certain elements of the
|
||||||
|
position and velocity, and performing some function transformations on them.
|
||||||
|
If one is to read the `.xml` for reacher then they will find 4 joints:
|
||||||
|
|
||||||
|
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|
||||||
|
|-----|-----------------------------|----------|----------|----------------------------------|-------|--------------------|
|
||||||
|
| 0 | angle of the first arm | -Inf | Inf | joint0 | hinge | angle (rad) |
|
||||||
|
| 1 | angle of the second arm | -Inf | Inf | joint1 | hinge | angle (rad) |
|
||||||
|
| 2 | x-coordinate of the target | -Inf | Inf | target_x | slide | position (m) |
|
||||||
|
| 3 | y-coordinate of the target | -Inf | Inf | target_y | slide | position (m) |
|
||||||
|
|
||||||
|
|
||||||
|
### Rewards
|
||||||
|
The reward consists of two parts:
|
||||||
|
- *reward_distance*: This reward is a measure of how far the *fingertip*
|
||||||
|
of the reacher (the unattached end) is from the target, with a more negative
|
||||||
|
value assigned for when the reacher's *fingertip* is further away from the
|
||||||
|
target. It is calculated as the negative vector norm of (position of
|
||||||
|
the fingertip - position of target), or *-norm("fingertip" - "target")*.
|
||||||
|
- *reward_control*: A negative reward for penalising the walker if
|
||||||
|
it takes actions that are too large. It is measured as the negative squared
|
||||||
|
Euclidean norm of the action, i.e. as *- sum(action<sup>2</sup>)*.
|
||||||
|
|
||||||
|
The total reward returned is ***reward*** *=* *reward_distance + reward_control*
|
||||||
|
|
||||||
|
Unlike other environments, Reacher does not allow you to specify weights for the individual reward terms.
|
||||||
|
However, `info` does contain the keys *reward_dist* and *reward_ctrl*. Thus, if you'd like to weight the terms,
|
||||||
|
you should create a wrapper that computes the weighted reward from `info`.
|
||||||
|
|
||||||
|
|
||||||
|
### Starting State
|
||||||
|
All observations start in state
|
||||||
|
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
|
||||||
|
with a noise added for stochasticity. A uniform noise in the range
|
||||||
|
[-0.1, 0.1] is added to the positional attributes, while the target position
|
||||||
|
is selected uniformly at random in a disk of radius 0.2 around the origin.
|
||||||
|
Independent, uniform noise in the
|
||||||
|
range of [-0.005, 0.005] is added to the velocities, and the last
|
||||||
|
element ("fingertip" - "target") is calculated at the end once everything
|
||||||
|
is set. The default setting has a framerate of 2 and a *dt = 2 * 0.01 = 0.02*
|
||||||
|
|
||||||
|
### Episode Termination
|
||||||
|
|
||||||
|
The episode terminates when any of the following happens:
|
||||||
|
|
||||||
|
1. The episode duration reaches 50 timesteps (with a new random target popping up if the reacher's fingertip reaches it before 50 timesteps)
|
||||||
|
2. Any of the state space values is no longer finite.
|
||||||
|
|
||||||
|
### Arguments
|
||||||
|
|
||||||
|
No additional arguments are currently supported (in v2 and lower),
|
||||||
|
but modifications can be made to the XML file in the assets folder
|
||||||
|
(or by changing the path to a modified XML file in another folder)..
|
||||||
|
|
||||||
|
```
|
||||||
|
env = gym.make('Reacher-v2')
|
||||||
|
```
|
||||||
|
|
||||||
|
There is no v3 for Reacher, unlike the robot environments where a v3 and
|
||||||
|
beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||||
|
|
||||||
|
|
||||||
|
### Version History
|
||||||
|
|
||||||
|
* v4: all mujoco environments now use the mujoco bindings in mujoco>=2.1.3
|
||||||
|
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||||
|
* v1: max_time_steps raised to 1000 for robot based tasks (not including reacher, which has a max_time_steps of 50). Added reward_threshold to environments.
|
||||||
|
* v0: Initial versions release (1.0.0)
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
    def __init__(self):
        """Create the Reacher environment from ``reacher.xml`` with a frame skip of 2."""
        utils.EzPickle.__init__(self)
        # frame_skip=2: each env step advances the MuJoCo simulation by 2 frames.
        mujoco_env.MujocoEnv.__init__(self, "reacher.xml", 2)
|
||||||
|
|
||||||
|
def step(self, a):
|
||||||
|
vec = self.get_body_com("fingertip") - self.get_body_com("target")
|
||||||
|
reward_dist = -np.linalg.norm(vec)
|
||||||
|
reward_ctrl = -np.square(a).sum()
|
||||||
|
reward = reward_dist + reward_ctrl
|
||||||
|
self.do_simulation(a, self.frame_skip)
|
||||||
|
ob = self._get_obs()
|
||||||
|
done = False
|
||||||
|
return ob, reward, done, dict(reward_dist=reward_dist, reward_ctrl=reward_ctrl)
|
||||||
|
|
||||||
|
    def viewer_setup(self):
        """Make the render camera track body id 0."""
        self.viewer.cam.trackbodyid = 0
|
||||||
|
|
||||||
|
    def reset_model(self):
        """Randomise the arm pose and target location; return the first observation.

        NOTE: the sequence of ``np_random`` calls below is part of the seeded
        reproducibility of the environment — do not reorder the sampling.
        """
        # Arm joints: initial pose plus uniform noise in [-0.1, 0.1].
        qpos = (
            self.np_random.uniform(low=-0.1, high=0.1, size=self.model.nq)
            + self.init_qpos
        )
        # Rejection-sample the target uniformly from the disk of radius 0.2.
        while True:
            self.goal = self.np_random.uniform(low=-0.2, high=0.2, size=2)
            if np.linalg.norm(self.goal) < 0.2:
                break
        # Last two qpos entries are the target's (x, y) slide joints.
        qpos[-2:] = self.goal
        qvel = self.init_qvel + self.np_random.uniform(
            low=-0.005, high=0.005, size=self.model.nv
        )
        # The target does not move.
        qvel[-2:] = 0
        self.set_state(qpos, qvel)
        return self._get_obs()
|
||||||
|
|
||||||
|
def _get_obs(self):
|
||||||
|
theta = self.data.qpos.flat[:2]
|
||||||
|
return np.concatenate(
|
||||||
|
[
|
||||||
|
np.cos(theta),
|
||||||
|
np.sin(theta),
|
||||||
|
self.data.qpos.flat[2:],
|
||||||
|
self.data.qvel.flat[:2],
|
||||||
|
self.get_body_com("fingertip") - self.get_body_com("target"),
|
||||||
|
]
|
||||||
|
)
|
@@ -6,7 +6,9 @@ from gym.envs.mujoco import mujoco_env
|
|||||||
|
|
||||||
class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
mujoco_env.MujocoEnv.__init__(self, "swimmer.xml", 4)
|
mujoco_env.MujocoEnv.__init__(
|
||||||
|
self, "swimmer.xml", 4, mujoco_bindings="mujoco_py"
|
||||||
|
)
|
||||||
utils.EzPickle.__init__(self)
|
utils.EzPickle.__init__(self)
|
||||||
|
|
||||||
def step(self, a):
|
def step(self, a):
|
||||||
|
@@ -9,119 +9,6 @@ DEFAULT_CAMERA_CONFIG = {}
|
|||||||
|
|
||||||
|
|
||||||
class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
"""
|
|
||||||
### Description
|
|
||||||
|
|
||||||
This environment corresponds to the Swimmer environment described in Rémi Coulom's PhD thesis
|
|
||||||
["Reinforcement Learning Using Neural Networks, with Applications to Motor Control"](https://tel.archives-ouvertes.fr/tel-00003985/document).
|
|
||||||
The environment aims to increase the number of independent state and control
|
|
||||||
variables as compared to the classic control environments. The swimmers
|
|
||||||
consist of three or more segments ('***links***') and one less articulation
|
|
||||||
joints ('***rotors***') - one rotor joint connecting exactly two links to
|
|
||||||
form a linear chain. The swimmer is suspended in a two dimensional pool and
|
|
||||||
always starts in the same position (subject to some deviation drawn from an
|
|
||||||
uniform distribution), and the goal is to move as fast as possible towards
|
|
||||||
the right by applying torque on the rotors and using the fluids friction.
|
|
||||||
|
|
||||||
### Notes
|
|
||||||
|
|
||||||
The problem parameters are:
|
|
||||||
Problem parameters:
|
|
||||||
* *n*: number of body parts
|
|
||||||
* *m<sub>i</sub>*: mass of part *i* (*i* ∈ {1...n})
|
|
||||||
* *l<sub>i</sub>*: length of part *i* (*i* ∈ {1...n})
|
|
||||||
* *k*: viscous-friction coefficient
|
|
||||||
|
|
||||||
While the default environment has *n* = 3, *l<sub>i</sub>* = 0.1,
|
|
||||||
and *k* = 0.1. It is possible to pass a custom MuJoCo XML file during construction to increase the
|
|
||||||
number of links, or to tweak any of the parameters.
|
|
||||||
|
|
||||||
### Action Space
|
|
||||||
The action space is a `Box(-1, 1, (2,), float32)`. An action represents the torques applied between *links*
|
|
||||||
|
|
||||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
|
||||||
|-----|------------------------------------|-------------|-------------|----------------------------------|-------|--------------|
|
|
||||||
| 0 | Torque applied on the first rotor | -1 | 1 | rot2 | hinge | torque (N m) |
|
|
||||||
| 1 | Torque applied on the second rotor | -1 | 1 | rot3 | hinge | torque (N m) |
|
|
||||||
|
|
||||||
### Observation Space
|
|
||||||
|
|
||||||
By default, observations consists of:
|
|
||||||
* θ<sub>i</sub>: angle of part *i* with respect to the *x* axis
|
|
||||||
* θ<sub>i</sub>': its derivative with respect to time (angular velocity)
|
|
||||||
|
|
||||||
In the default case, observations do not include the x- and y-coordinates of the front tip. These may
|
|
||||||
be included by passing `exclude_current_positions_from_observation=False` during construction.
|
|
||||||
Then, the observation space will have 10 dimensions where the first two dimensions
|
|
||||||
represent the x- and y-coordinates of the front tip.
|
|
||||||
Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates
|
|
||||||
will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively.
|
|
||||||
|
|
||||||
By default, the observation is a `ndarray` with shape `(8,)` where the elements correspond to the following:
|
|
||||||
|
|
||||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
|
||||||
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
|
||||||
| 0 | angle of the front tip | -Inf | Inf | rot | hinge | angle (rad) |
|
|
||||||
| 1 | angle of the second rotor | -Inf | Inf | rot2 | hinge | angle (rad) |
|
|
||||||
| 2 | angle of the second rotor | -Inf | Inf | rot3 | hinge | angle (rad) |
|
|
||||||
| 3 | velocity of the tip along the x-axis | -Inf | Inf | slider1 | slide | velocity (m/s) |
|
|
||||||
| 4 | velocity of the tip along the y-axis | -Inf | Inf | slider2 | slide | velocity (m/s) |
|
|
||||||
| 5 | angular velocity of front tip | -Inf | Inf | rot | hinge | angular velocity (rad/s) |
|
|
||||||
| 6 | angular velocity of second rotor | -Inf | Inf | rot2 | hinge | angular velocity (rad/s) |
|
|
||||||
| 7 | angular velocity of third rotor | -Inf | Inf | rot3 | hinge | angular velocity (rad/s) |
|
|
||||||
|
|
||||||
### Rewards
|
|
||||||
The reward consists of two parts:
|
|
||||||
- *forward_reward*: A reward of moving forward which is measured
|
|
||||||
as *`forward_reward_weight` * (x-coordinate before action - x-coordinate after action)/dt*. *dt* is
|
|
||||||
the time between actions and is dependent on the frame_skip parameter
|
|
||||||
(default is 4), where the frametime is 0.01 - making the
|
|
||||||
default *dt = 4 * 0.01 = 0.04*. This reward would be positive if the swimmer
|
|
||||||
swims right as desired.
|
|
||||||
- *ctrl_cost*: A cost for penalising the swimmer if it takes
|
|
||||||
actions that are too large. It is measured as *`ctrl_cost_weight` *
|
|
||||||
sum(action<sup>2</sup>)* where *`ctrl_cost_weight`* is a parameter set for the
|
|
||||||
control and has a default value of 1e-4
|
|
||||||
|
|
||||||
The total reward returned is ***reward*** *=* *forward_reward - ctrl_cost* and `info` will also contain the individual reward terms
|
|
||||||
|
|
||||||
### Starting State
|
|
||||||
All observations start in state (0,0,0,0,0,0,0,0) with a Uniform noise in the range of [-`reset_noise_scale`, `reset_noise_scale`] is added to the initial state for stochasticity.
|
|
||||||
|
|
||||||
### Episode Termination
|
|
||||||
The episode terminates when the episode length is greater than 1000.
|
|
||||||
|
|
||||||
### Arguments
|
|
||||||
|
|
||||||
No additional arguments are currently supported in v2 and lower.
|
|
||||||
|
|
||||||
```
|
|
||||||
gym.make('Swimmer-v2')
|
|
||||||
```
|
|
||||||
|
|
||||||
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
|
||||||
|
|
||||||
```
|
|
||||||
env = gym.make('Swimmer-v3', ctrl_cost_weight=0.1, ....)
|
|
||||||
```
|
|
||||||
|
|
||||||
| Parameter | Type | Default |Description |
|
|
||||||
|-------------------------|------------|--------------|-------------------------------|
|
|
||||||
| `xml_file` | **str** | `"swimmer.xml"`| Path to a MuJoCo model |
|
|
||||||
| `forward_reward_weight` | **float** | `1.0` | Weight for *forward_reward* term (see section on reward) |
|
|
||||||
| `ctrl_cost_weight` | **float** | `1e-4` | Weight for *ctrl_cost* term (see section on reward) |
|
|
||||||
| `reset_noise_scale` | **float** | `0.1` | Scale of random perturbations of initial position and velocity (see section on Starting State) |
|
|
||||||
| `exclude_current_positions_from_observation`| **bool** | `True`| Whether or not to omit the x- and y-coordinates from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies |
|
|
||||||
|
|
||||||
|
|
||||||
### Version History
|
|
||||||
|
|
||||||
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
|
||||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
|
||||||
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
|
||||||
* v0: Initial versions release (1.0.0)
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
xml_file="swimmer.xml",
|
xml_file="swimmer.xml",
|
||||||
@@ -141,7 +28,7 @@ class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
|||||||
exclude_current_positions_from_observation
|
exclude_current_positions_from_observation
|
||||||
)
|
)
|
||||||
|
|
||||||
mujoco_env.MujocoEnv.__init__(self, xml_file, 4)
|
mujoco_env.MujocoEnv.__init__(self, xml_file, 4, mujoco_bindings="mujoco_py")
|
||||||
|
|
||||||
def control_cost(self, action):
|
def control_cost(self, action):
|
||||||
control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
|
control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
|
||||||
|
217
gym/envs/mujoco/swimmer_v4.py
Normal file
217
gym/envs/mujoco/swimmer_v4.py
Normal file
@@ -0,0 +1,217 @@
|
|||||||
|
__credits__ = ["Rushiv Arora"]
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from gym import utils
|
||||||
|
from gym.envs.mujoco import mujoco_env
|
||||||
|
|
||||||
|
DEFAULT_CAMERA_CONFIG = {}
|
||||||
|
|
||||||
|
|
||||||
|
class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
|
"""
|
||||||
|
### Description
|
||||||
|
|
||||||
|
This environment corresponds to the Swimmer environment described in Rémi Coulom's PhD thesis
|
||||||
|
["Reinforcement Learning Using Neural Networks, with Applications to Motor Control"](https://tel.archives-ouvertes.fr/tel-00003985/document).
|
||||||
|
The environment aims to increase the number of independent state and control
|
||||||
|
variables as compared to the classic control environments. The swimmers
|
||||||
|
consist of three or more segments ('***links***') and one less articulation
|
||||||
|
joints ('***rotors***') - one rotor joint connecting exactly two links to
|
||||||
|
form a linear chain. The swimmer is suspended in a two dimensional pool and
|
||||||
|
always starts in the same position (subject to some deviation drawn from an
|
||||||
|
uniform distribution), and the goal is to move as fast as possible towards
|
||||||
|
the right by applying torque on the rotors and using the fluids friction.
|
||||||
|
|
||||||
|
### Notes
|
||||||
|
|
||||||
|
The problem parameters are:
|
||||||
|
Problem parameters:
|
||||||
|
* *n*: number of body parts
|
||||||
|
* *m<sub>i</sub>*: mass of part *i* (*i* ∈ {1...n})
|
||||||
|
* *l<sub>i</sub>*: length of part *i* (*i* ∈ {1...n})
|
||||||
|
* *k*: viscous-friction coefficient
|
||||||
|
|
||||||
|
While the default environment has *n* = 3, *l<sub>i</sub>* = 0.1,
|
||||||
|
and *k* = 0.1. It is possible to tweak the MuJoCo XML files to increase the
|
||||||
|
number of links, or to tweak any of the parameters.
|
||||||
|
|
||||||
|
### Action Space
|
||||||
|
    The agent takes a 2-element vector for actions.
|
||||||
|
The action space is a continuous `(action, action)` in `[-1, 1]`, where
|
||||||
|
`action` represents the numerical torques applied between *links*
|
||||||
|
|
||||||
|
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||||
|
|-----|------------------------------------|-------------|-------------|----------------------------------|-------|--------------|
|
||||||
|
| 0 | Torque applied on the first rotor | -1 | 1 | rot2 | hinge | torque (N m) |
|
||||||
|
| 1 | Torque applied on the second rotor | -1 | 1 | rot3 | hinge | torque (N m) |
|
||||||
|
|
||||||
|
### Observation Space
|
||||||
|
|
||||||
|
The state space consists of:
|
||||||
|
* A<sub>0</sub>: position of first point
|
||||||
|
* θ<sub>i</sub>: angle of part *i* with respect to the *x* axis
|
||||||
|
* A<sub>0</sub>, θ<sub>i</sub>: their derivatives with respect to time (velocity and angular velocity)
|
||||||
|
|
||||||
|
The observation is a `ndarray` with shape `(8,)` where the elements correspond to the following:
|
||||||
|
|
||||||
|
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
||||||
|
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
||||||
|
| 0 | x-coordinate of the front tip | -Inf | Inf | slider1 | slide | position (m) |
|
||||||
|
| 1 | y-coordinate of the front tip | -Inf | Inf | slider2 | slide | position (m) |
|
||||||
|
| 2 | angle of the front tip | -Inf | Inf | rot | hinge | angle (rad) |
|
||||||
|
| 3 | angle of the second rotor | -Inf | Inf | rot2 | hinge | angle (rad) |
|
||||||
|
| 4 | angle of the second rotor | -Inf | Inf | rot3 | hinge | angle (rad) |
|
||||||
|
| 5 | velocity of the tip along the x-axis | -Inf | Inf | slider1 | slide | velocity (m/s) |
|
||||||
|
| 6 | velocity of the tip along the y-axis | -Inf | Inf | slider2 | slide | velocity (m/s) |
|
||||||
|
| 7 | angular velocity of front tip | -Inf | Inf | rot | hinge | angular velocity (rad/s) |
|
||||||
|
| 8 | angular velocity of second rotor | -Inf | Inf | rot2 | hinge | angular velocity (rad/s) |
|
||||||
|
| 9 | angular velocity of third rotor | -Inf | Inf | rot3 | hinge | angular velocity (rad/s) |
|
||||||
|
|
||||||
|
**Note:**
|
||||||
|
In practice (and Gym implementation), the first two positional elements are
|
||||||
|
omitted from the state space since the reward function is calculated based
|
||||||
|
on those values. Therefore, observation space has shape `(8,)` and looks like:
|
||||||
|
|
||||||
|
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|
||||||
|
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
|
||||||
|
| 0 | angle of the front tip | -Inf | Inf | rot | hinge | angle (rad) |
|
||||||
|
| 1 | angle of the second rotor | -Inf | Inf | rot2 | hinge | angle (rad) |
|
||||||
|
| 2 | angle of the second rotor | -Inf | Inf | rot3 | hinge | angle (rad) |
|
||||||
|
| 3 | velocity of the tip along the x-axis | -Inf | Inf | slider1 | slide | velocity (m/s) |
|
||||||
|
| 4 | velocity of the tip along the y-axis | -Inf | Inf | slider2 | slide | velocity (m/s) |
|
||||||
|
| 5 | angular velocity of front tip | -Inf | Inf | rot | hinge | angular velocity (rad/s) |
|
||||||
|
| 6 | angular velocity of second rotor | -Inf | Inf | rot2 | hinge | angular velocity (rad/s) |
|
||||||
|
| 7 | angular velocity of third rotor | -Inf | Inf | rot3 | hinge | angular velocity (rad/s) |
|
||||||
|
|
||||||
|
### Rewards
|
||||||
|
The reward consists of two parts:
|
||||||
|
- *reward_front*: A reward of moving forward which is measured
|
||||||
|
as *(x-coordinate before action - x-coordinate after action)/dt*. *dt* is
|
||||||
|
    the time between actions and is dependent on the frame_skip parameter
|
||||||
|
(default is 4), where the *dt* for one frame is 0.01 - making the
|
||||||
|
default *dt = 4 * 0.01 = 0.04*. This reward would be positive if the swimmer
|
||||||
|
swims right as desired.
|
||||||
|
- *reward_control*: A negative reward for penalising the swimmer if it takes
|
||||||
|
actions that are too large. It is measured as *-coefficient x
|
||||||
|
sum(action<sup>2</sup>)* where *coefficient* is a parameter set for the
|
||||||
|
control and has a default value of 0.0001
|
||||||
|
|
||||||
|
The total reward returned is ***reward*** *=* *reward_front + reward_control*
|
||||||
|
|
||||||
|
### Starting State
|
||||||
|
All observations start in state (0,0,0,0,0,0,0,0) with a Uniform noise in the range of [-0.1, 0.1] is added to the initial state for stochasticity.
|
||||||
|
|
||||||
|
### Episode Termination
|
||||||
|
The episode terminates when the episode length is greater than 1000.
|
||||||
|
|
||||||
|
### Arguments
|
||||||
|
|
||||||
|
No additional arguments are currently supported (in v2 and lower), but
|
||||||
|
modifications can be made to the XML file in the assets folder
|
||||||
|
(or by changing the path to a modified XML file in another folder).
|
||||||
|
|
||||||
|
```
|
||||||
|
gym.make('Swimmer-v2')
|
||||||
|
```
|
||||||
|
|
||||||
|
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||||
|
|
||||||
|
```
|
||||||
|
env = gym.make('Swimmer-v3', ctrl_cost_weight=0.1, ....)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Version History
|
||||||
|
|
||||||
|
    * v4: all mujoco environments now use the mujoco bindings in mujoco>=2.1.3
|
||||||
|
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
||||||
|
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||||
|
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
||||||
|
* v0: Initial versions release (1.0.0)
|
||||||
|
"""
|
||||||
|
|
||||||
|
    def __init__(
        self,
        xml_file="swimmer.xml",
        forward_reward_weight=1.0,
        ctrl_cost_weight=1e-4,
        reset_noise_scale=0.1,
        exclude_current_positions_from_observation=True,
    ):
        """Create a Swimmer environment.

        Args:
            xml_file: Path to the MuJoCo model XML loaded for the simulation.
            forward_reward_weight: Weight of the forward (x-velocity) reward term.
            ctrl_cost_weight: Weight of the quadratic control-cost penalty.
            reset_noise_scale: Half-width of the uniform noise added to the
                initial qpos/qvel on reset.
            exclude_current_positions_from_observation: If True (default), the
                x/y coordinates are dropped from the observation vector.
        """
        # NOTE: EzPickle captures the constructor arguments via locals(); this
        # call must stay first, before any local variable is introduced.
        utils.EzPickle.__init__(**locals())

        self._forward_reward_weight = forward_reward_weight
        self._ctrl_cost_weight = ctrl_cost_weight

        self._reset_noise_scale = reset_noise_scale

        self._exclude_current_positions_from_observation = (
            exclude_current_positions_from_observation
        )

        # frame_skip=4: each env step advances the simulation by 4 frames.
        mujoco_env.MujocoEnv.__init__(self, xml_file, 4)
|
||||||
|
|
||||||
|
def control_cost(self, action):
|
||||||
|
control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
|
||||||
|
return control_cost
|
||||||
|
|
||||||
|
def step(self, action):
|
||||||
|
xy_position_before = self.data.qpos[0:2].copy()
|
||||||
|
self.do_simulation(action, self.frame_skip)
|
||||||
|
xy_position_after = self.data.qpos[0:2].copy()
|
||||||
|
|
||||||
|
xy_velocity = (xy_position_after - xy_position_before) / self.dt
|
||||||
|
x_velocity, y_velocity = xy_velocity
|
||||||
|
|
||||||
|
forward_reward = self._forward_reward_weight * x_velocity
|
||||||
|
|
||||||
|
ctrl_cost = self.control_cost(action)
|
||||||
|
|
||||||
|
observation = self._get_obs()
|
||||||
|
reward = forward_reward - ctrl_cost
|
||||||
|
done = False
|
||||||
|
info = {
|
||||||
|
"reward_fwd": forward_reward,
|
||||||
|
"reward_ctrl": -ctrl_cost,
|
||||||
|
"x_position": xy_position_after[0],
|
||||||
|
"y_position": xy_position_after[1],
|
||||||
|
"distance_from_origin": np.linalg.norm(xy_position_after, ord=2),
|
||||||
|
"x_velocity": x_velocity,
|
||||||
|
"y_velocity": y_velocity,
|
||||||
|
"forward_reward": forward_reward,
|
||||||
|
}
|
||||||
|
|
||||||
|
return observation, reward, done, info
|
||||||
|
|
||||||
|
def _get_obs(self):
|
||||||
|
position = self.data.qpos.flat.copy()
|
||||||
|
velocity = self.data.qvel.flat.copy()
|
||||||
|
|
||||||
|
if self._exclude_current_positions_from_observation:
|
||||||
|
position = position[2:]
|
||||||
|
|
||||||
|
observation = np.concatenate([position, velocity]).ravel()
|
||||||
|
return observation
|
||||||
|
|
||||||
|
def reset_model(self):
|
||||||
|
noise_low = -self._reset_noise_scale
|
||||||
|
noise_high = self._reset_noise_scale
|
||||||
|
|
||||||
|
qpos = self.init_qpos + self.np_random.uniform(
|
||||||
|
low=noise_low, high=noise_high, size=self.model.nq
|
||||||
|
)
|
||||||
|
qvel = self.init_qvel + self.np_random.uniform(
|
||||||
|
low=noise_low, high=noise_high, size=self.model.nv
|
||||||
|
)
|
||||||
|
|
||||||
|
self.set_state(qpos, qvel)
|
||||||
|
|
||||||
|
observation = self._get_obs()
|
||||||
|
return observation
|
||||||
|
|
||||||
|
def viewer_setup(self):
|
||||||
|
for key, value in DEFAULT_CAMERA_CONFIG.items():
|
||||||
|
if isinstance(value, np.ndarray):
|
||||||
|
getattr(self.viewer.cam, key)[:] = value
|
||||||
|
else:
|
||||||
|
setattr(self.viewer.cam, key, value)
|
@@ -6,7 +6,9 @@ from gym.envs.mujoco import mujoco_env
|
|||||||
|
|
||||||
class Walker2dEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
class Walker2dEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
mujoco_env.MujocoEnv.__init__(self, "walker2d.xml", 4)
|
mujoco_env.MujocoEnv.__init__(
|
||||||
|
self, "walker2d.xml", 4, mujoco_bindings="mujoco_py"
|
||||||
|
)
|
||||||
utils.EzPickle.__init__(self)
|
utils.EzPickle.__init__(self)
|
||||||
|
|
||||||
def step(self, a):
|
def step(self, a):
|
||||||
|
@@ -12,137 +12,6 @@ DEFAULT_CAMERA_CONFIG = {
|
|||||||
|
|
||||||
|
|
||||||
class Walker2dEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
class Walker2dEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
"""
|
|
||||||
### Description
|
|
||||||
|
|
||||||
This environment builds on the hopper environment based on the work done by Erez, Tassa, and Todorov
|
|
||||||
in ["Infinite Horizon Model Predictive Control for Nonlinear Periodic Tasks"](http://www.roboticsproceedings.org/rss07/p10.pdf)
|
|
||||||
by adding another set of legs making it possible for the robot to walk forward instead of
|
|
||||||
hop. Like other Mujoco environments, this environment aims to increase the number of independent state
|
|
||||||
and control variables as compared to the classic control environments. The walker is a
|
|
||||||
two-dimensional two-legged figure that consist of four main body parts - a single torso at the top
|
|
||||||
(with the two legs splitting after the torso), two thighs in the middle below the torso, two legs
|
|
||||||
in the bottom below the thighs, and two feet attached to the legs on which the entire body rests.
|
|
||||||
The goal is to make coordinate both sets of feet, legs, and thighs to move in the forward (right)
|
|
||||||
direction by applying torques on the six hinges connecting the six body parts.
|
|
||||||
|
|
||||||
### Action Space
|
|
||||||
The action space is a `Box(-1, 1, (6,), float32)`. An action represents the torques applied at the hinge joints.
|
|
||||||
|
|
||||||
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
|
||||||
|-----|----------------------------------------|-------------|-------------|----------------------------------|-------|--------------|
|
|
||||||
| 0 | Torque applied on the thigh rotor | -1 | 1 | thigh_joint | hinge | torque (N m) |
|
|
||||||
| 1 | Torque applied on the leg rotor | -1 | 1 | leg_joint | hinge | torque (N m) |
|
|
||||||
| 2 | Torque applied on the foot rotor | -1 | 1 | foot_joint | hinge | torque (N m) |
|
|
||||||
| 3 | Torque applied on the left thigh rotor | -1 | 1 | thigh_left_joint | hinge | torque (N m) |
|
|
||||||
| 4 | Torque applied on the left leg rotor | -1 | 1 | leg_left_joint | hinge | torque (N m) |
|
|
||||||
| 5 | Torque applied on the left foot rotor | -1 | 1 | foot_left_joint | hinge | torque (N m) |
|
|
||||||
|
|
||||||
### Observation Space
|
|
||||||
|
|
||||||
Observations consist of positional values of different body parts of the walker,
|
|
||||||
followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities.
|
|
||||||
|
|
||||||
By default, observations do not include the x-coordinate of the top. It may
|
|
||||||
be included by passing `exclude_current_positions_from_observation=False` during construction.
|
|
||||||
In that case, the observation space will have 18 dimensions where the first dimension
|
|
||||||
represent the x-coordinates of the top of the walker.
|
|
||||||
Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x-coordinate
|
|
||||||
of the top will be returned in `info` with key `"x_position"`.
|
|
||||||
|
|
||||||
By default, observation is a `ndarray` with shape `(17,)` where the elements correspond to the following:
|
|
||||||
|
|
||||||
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|
|
||||||
|-----|--------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
|
|
||||||
| 0 | z-coordinate of the top (height of hopper) | -Inf | Inf | rootz (torso) | slide | position (m) |
|
|
||||||
| 1 | angle of the top | -Inf | Inf | rooty (torso) | hinge | angle (rad) |
|
|
||||||
| 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) |
|
|
||||||
| 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) |
|
|
||||||
| 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) |
|
|
||||||
| 5 | angle of the left thigh joint | -Inf | Inf | thigh_left_joint | hinge | angle (rad) |
|
|
||||||
| 6 | angle of the left leg joint | -Inf | Inf | leg_left_joint | hinge | angle (rad) |
|
|
||||||
| 7 | angle of the left foot joint | -Inf | Inf | foot_left_joint | hinge | angle (rad) |
|
|
||||||
| 8 | velocity of the x-coordinate of the top | -Inf | Inf | rootx | slide | velocity (m/s) |
|
|
||||||
| 9 | velocity of the z-coordinate (height) of the top | -Inf | Inf | rootz | slide | velocity (m/s) |
|
|
||||||
| 10 | angular velocity of the angle of the top | -Inf | Inf | rooty | hinge | angular velocity (rad/s) |
|
|
||||||
| 11 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) |
|
|
||||||
| 12 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) |
|
|
||||||
| 13 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) |
|
|
||||||
| 14 | angular velocity of the thigh hinge | -Inf | Inf | thigh_left_joint | hinge | angular velocity (rad/s) |
|
|
||||||
| 15 | angular velocity of the leg hinge | -Inf | Inf | leg_left_joint | hinge | angular velocity (rad/s) |
|
|
||||||
| 16 | angular velocity of the foot hinge | -Inf | Inf | foot_left_joint | hinge | angular velocity (rad/s) |
|
|
||||||
|
|
||||||
### Rewards
|
|
||||||
The reward consists of three parts:
|
|
||||||
- *healthy_reward*: Every timestep that the walker is alive, it receives a fixed reward of value `healthy_reward`,
|
|
||||||
- *forward_reward*: A reward of walking forward which is measured as
|
|
||||||
*`forward_reward_weight` * (x-coordinate before action - x-coordinate after action)/dt*.
|
|
||||||
*dt* is the time between actions and is dependent on the frame_skip parameter
|
|
||||||
(default is 4), where the frametime is 0.002 - making the default
|
|
||||||
*dt = 4 * 0.002 = 0.008*. This reward would be positive if the walker walks forward (right) as desired.
|
|
||||||
- *ctrl_cost*: A cost for penalising the walker if it
|
|
||||||
takes actions that are too large. It is measured as
|
|
||||||
*`ctrl_cost_weight` * sum(action<sup>2</sup>)* where *`ctrl_cost_weight`* is
|
|
||||||
a parameter set for the control and has a default value of 0.001
|
|
||||||
|
|
||||||
The total reward returned is ***reward*** *=* *healthy_reward bonus + forward_reward - ctrl_cost* and `info` will also contain the individual reward terms
|
|
||||||
|
|
||||||
### Starting State
|
|
||||||
All observations start in state
|
|
||||||
(0.0, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
|
|
||||||
with a uniform noise in the range of [-`reset_noise_scale`, `reset_noise_scale`] added to the values for stochasticity.
|
|
||||||
|
|
||||||
### Episode Termination
|
|
||||||
The walker is said to be unhealthy if any of the following happens:
|
|
||||||
|
|
||||||
1. Any of the state space values is no longer finite
|
|
||||||
2. The height of the walker is ***not*** in the closed interval specified by `healthy_z_range`
|
|
||||||
3. The absolute value of the angle (`observation[1]` if `exclude_current_positions_from_observation=False`, else `observation[2]`) is ***not*** in the closed interval specified by `healthy_angle_range`
|
|
||||||
|
|
||||||
If `terminate_when_unhealthy=True` is passed during construction (which is the default),
|
|
||||||
the episode terminates when any of the following happens:
|
|
||||||
|
|
||||||
1. The episode duration reaches a 1000 timesteps
|
|
||||||
2. The walker is unhealthy
|
|
||||||
|
|
||||||
If `terminate_when_unhealthy=False` is passed, the episode is terminated only when 1000 timesteps are exceeded.
|
|
||||||
|
|
||||||
### Arguments
|
|
||||||
|
|
||||||
No additional arguments are currently supported in v2 and lower.
|
|
||||||
|
|
||||||
```
|
|
||||||
env = gym.make('Walker2d-v2')
|
|
||||||
```
|
|
||||||
|
|
||||||
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
|
||||||
|
|
||||||
```
|
|
||||||
env = gym.make('Walker2d-v3', ctrl_cost_weight=0.1, ....)
|
|
||||||
```
|
|
||||||
|
|
||||||
| Parameter | Type | Default |Description |
|
|
||||||
|-------------------------|------------|--------------|-------------------------------|
|
|
||||||
| `xml_file` | **str** | `"walker2d.xml"` | Path to a MuJoCo model |
|
|
||||||
| `forward_reward_weight` | **float** | `1.0` | Weight for *forward_reward* term (see section on reward) |
|
|
||||||
| `ctrl_cost_weight` | **float** | `1e-3` | Weight for *ctr_cost* term (see section on reward) |
|
|
||||||
| `healthy_reward` | **float** | `1.0` | Constant reward given if the ant is "healthy" after timestep |
|
|
||||||
| `terminate_when_unhealthy` | **bool**| `True` | If true, issue a done signal if the z-coordinate of the walker is no longer healthy |
|
|
||||||
| `healthy_z_range` | **tuple** | `(0.8, 2)` | The z-coordinate of the top of the walker must be in this range to be considered healthy |
|
|
||||||
| `healthy_angle_range` | **tuple** | `(-1, 1)` | The angle must be in this range to be considered healthy|
|
|
||||||
| `reset_noise_scale` | **float** | `5e-3` | Scale of random perturbations of initial position and velocity (see section on Starting State) |
|
|
||||||
| `exclude_current_positions_from_observation`| **bool** | `True`| Whether or not to omit the x-coordinate from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies |
|
|
||||||
|
|
||||||
|
|
||||||
### Version History
|
|
||||||
|
|
||||||
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
|
||||||
* v2: All continuous control environments now use mujoco_py >= 1.50
|
|
||||||
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
|
||||||
* v0: Initial versions release (1.0.0)
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
xml_file="walker2d.xml",
|
xml_file="walker2d.xml",
|
||||||
@@ -172,7 +41,7 @@ class Walker2dEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
|||||||
exclude_current_positions_from_observation
|
exclude_current_positions_from_observation
|
||||||
)
|
)
|
||||||
|
|
||||||
mujoco_env.MujocoEnv.__init__(self, xml_file, 4)
|
mujoco_env.MujocoEnv.__init__(self, xml_file, 4, mujoco_bindings="mujoco_py")
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def healthy_reward(self):
|
def healthy_reward(self):
|
||||||
|
276
gym/envs/mujoco/walker2d_v4.py
Normal file
276
gym/envs/mujoco/walker2d_v4.py
Normal file
@@ -0,0 +1,276 @@
|
|||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from gym import utils
|
||||||
|
from gym.envs.mujoco import mujoco_env
|
||||||
|
|
||||||
|
DEFAULT_CAMERA_CONFIG = {
|
||||||
|
"trackbodyid": 2,
|
||||||
|
"distance": 4.0,
|
||||||
|
"lookat": np.array((0.0, 0.0, 1.15)),
|
||||||
|
"elevation": -20.0,
|
||||||
|
}
|
||||||
|
|
||||||
|
DEFAULT_CAMERA_CONFIG = {
|
||||||
|
"trackbodyid": 2,
|
||||||
|
"distance": 4.0,
|
||||||
|
"lookat": np.array((0.0, 0.0, 1.15)),
|
||||||
|
"elevation": -20.0,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class Walker2dEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||||
|
"""
|
||||||
|
### Description
|
||||||
|
|
||||||
|
This environment builds on the hopper environment based on the work done by Erez, Tassa, and Todorov
|
||||||
|
in ["Infinite Horizon Model Predictive Control for Nonlinear Periodic Tasks"](http://www.roboticsproceedings.org/rss07/p10.pdf)
|
||||||
|
by adding another set of legs making it possible for the robot to walk forward instead of
|
||||||
|
hop. Like other Mujoco environments, this environment aims to increase the number of independent state
|
||||||
|
and control variables as compared to the classic control environments. The walker is a
|
||||||
|
two-dimensional two-legged figure that consist of four main body parts - a single torso at the top
|
||||||
|
(with the two legs splitting after the torso), two thighs in the middle below the torso, two legs
|
||||||
|
in the bottom below the thighs, and two feet attached to the legs on which the entire body rests.
|
||||||
|
The goal is to make coordinate both sets of feet, legs, and thighs to move in the forward (right)
|
||||||
|
direction by applying torques on the six hinges connecting the six body parts.
|
||||||
|
|
||||||
|
### Action Space
|
||||||
|
The agent take a 6-element vector for actions.
|
||||||
|
The action space is a continuous `(action, action, action, action, action, action)`
|
||||||
|
all in `[-1, 1]`, where `action` represents the numerical torques applied at the hinge joints.
|
||||||
|
|
||||||
|
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|
||||||
|
|-----|----------------------------------------|-------------|-------------|----------------------------------|-------|--------------|
|
||||||
|
| 0 | Torque applied on the thigh rotor | -1 | 1 | thigh_joint | hinge | torque (N m) |
|
||||||
|
| 1 | Torque applied on the leg rotor | -1 | 1 | leg_joint | hinge | torque (N m) |
|
||||||
|
| 2 | Torque applied on the foot rotor | -1 | 1 | foot_joint | hinge | torque (N m) |
|
||||||
|
| 3 | Torque applied on the left thigh rotor | -1 | 1 | thigh_left_joint | hinge | torque (N m) |
|
||||||
|
| 4 | Torque applied on the left leg rotor | -1 | 1 | leg_left_joint | hinge | torque (N m) |
|
||||||
|
| 5 | Torque applied on the left foot rotor | -1 | 1 | foot_left_joint | hinge | torque (N m) |
|
||||||
|
|
||||||
|
### Observation Space
|
||||||
|
|
||||||
|
The state space consists of positional values of different body parts of the walker,
|
||||||
|
followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities.
|
||||||
|
|
||||||
|
The observation is a `ndarray` with shape `(17,)` where the elements correspond to the following:
|
||||||
|
|
||||||
|
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|
||||||
|
|-----|--------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
|
||||||
|
| 0 | x-coordinate of the top | -Inf | Inf | rootx (torso) | slide | position (m) |
|
||||||
|
| 1 | z-coordinate of the top (height of hopper) | -Inf | Inf | rootz (torso) | slide | position (m) |
|
||||||
|
| 2 | angle of the top | -Inf | Inf | rooty (torso) | hinge | angle (rad) |
|
||||||
|
| 3 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) |
|
||||||
|
| 4 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) |
|
||||||
|
| 5 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) |
|
||||||
|
| 6 | angle of the left thigh joint | -Inf | Inf | thigh_left_joint | hinge | angle (rad) |
|
||||||
|
| 7 | angle of the left leg joint | -Inf | Inf | leg_left_joint | hinge | angle (rad) |
|
||||||
|
| 8 | angle of the left foot joint | -Inf | Inf | foot_left_joint | hinge | angle (rad) |
|
||||||
|
| 9 | velocity of the x-coordinate of the top | -Inf | Inf | rootx | slide | velocity (m/s) |
|
||||||
|
| 10 | velocity of the z-coordinate (height) of the top | -Inf | Inf | rootz | slide | velocity (m/s) |
|
||||||
|
| 11 | angular velocity of the angle of the top | -Inf | Inf | rooty | hinge | angular velocity (rad/s) |
|
||||||
|
| 12 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 13 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 14 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 15 | angular velocity of the thigh hinge | -Inf | Inf | thigh_left_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 16 | angular velocity of the leg hinge | -Inf | Inf | leg_left_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 17 | angular velocity of the foot hinge | -Inf | Inf | foot_left_joint | hinge | angular velocity (rad/s) |
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
**Note:**
|
||||||
|
In practice (and Gym implementation), the first positional element is omitted from the
|
||||||
|
state space since the reward function is calculated based on that value. This value is
|
||||||
|
hidden from the algorithm, which in turn has to develop an abstract understanding of it
|
||||||
|
from the observed rewards. Therefore, observation space has shape `(17,)`
|
||||||
|
instead of `(18,)` and looks like:
|
||||||
|
|
||||||
|
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|
||||||
|
|-----|--------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
|
||||||
|
| 0 | z-coordinate of the top (height of hopper) | -Inf | Inf | rootz (torso) | slide | position (m) |
|
||||||
|
| 1 | angle of the top | -Inf | Inf | rooty (torso) | hinge | angle (rad) |
|
||||||
|
| 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) |
|
||||||
|
| 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) |
|
||||||
|
| 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) |
|
||||||
|
| 5 | angle of the left thigh joint | -Inf | Inf | thigh_left_joint | hinge | angle (rad) |
|
||||||
|
| 6 | angle of the left leg joint | -Inf | Inf | leg_left_joint | hinge | angle (rad) |
|
||||||
|
| 7 | angle of the left foot joint | -Inf | Inf | foot_left_joint | hinge | angle (rad) |
|
||||||
|
| 8 | velocity of the x-coordinate of the top | -Inf | Inf | rootx | slide | velocity (m/s) |
|
||||||
|
| 9 | velocity of the z-coordinate (height) of the top | -Inf | Inf | rootz | slide | velocity (m/s) |
|
||||||
|
| 10 | angular velocity of the angle of the top | -Inf | Inf | rooty | hinge | angular velocity (rad/s) |
|
||||||
|
| 11 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 12 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 13 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 14 | angular velocity of the thigh hinge | -Inf | Inf | thigh_left_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 15 | angular velocity of the leg hinge | -Inf | Inf | leg_left_joint | hinge | angular velocity (rad/s) |
|
||||||
|
| 16 | angular velocity of the foot hinge | -Inf | Inf | foot_left_joint | hinge | angular velocity (rad/s) |
|
||||||
|
|
||||||
|
### Rewards
|
||||||
|
The reward consists of three parts:
|
||||||
|
- *alive bonus*: Every timestep that the walker is alive, it gets a reward of 1,
|
||||||
|
- *reward_forward*: A reward of walking forward which is measured as
|
||||||
|
*(x-coordinate before action - x-coordinate after action)/dt*.
|
||||||
|
*dt* is the time between actions and is dependent on the frame_skip parameter
|
||||||
|
(default is 4), where the *dt* for one frame is 0.002 - making the default
|
||||||
|
*dt = 4 * 0.002 = 0.008*. This reward would be positive if the walker walks forward (right) as desired.
|
||||||
|
- *reward_control*: A negative reward for penalising the walker if it
|
||||||
|
takes actions that are too large. It is measured as
|
||||||
|
*-coefficient **x** sum(action<sup>2</sup>)* where *coefficient* is
|
||||||
|
a parameter set for the control and has a default value of 0.001
|
||||||
|
|
||||||
|
The total reward returned is ***reward*** *=* *alive bonus + reward_forward + reward_control*
|
||||||
|
|
||||||
|
### Starting State
|
||||||
|
All observations start in state
|
||||||
|
(0.0, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
|
||||||
|
with a uniform noise in the range of [-0.005, 0.005] added to the values for stochasticity.
|
||||||
|
|
||||||
|
### Episode Termination
|
||||||
|
The episode terminates when any of the following happens:
|
||||||
|
|
||||||
|
1. The episode duration reaches a 1000 timesteps
|
||||||
|
2. Any of the state space values is no longer finite
|
||||||
|
3. The height of the walker (index 1) is ***not*** in the range `[0.8, 2]`
|
||||||
|
4. The absolute value of the angle (index 2) is ***not*** in the range `[-1, 1]`
|
||||||
|
|
||||||
|
### Arguments
|
||||||
|
|
||||||
|
No additional arguments are currently supported (in v2 and lower),
|
||||||
|
but modifications can be made to the XML file in the assets folder
|
||||||
|
(or by changing the path to a modified XML file in another folder).
|
||||||
|
|
||||||
|
```
|
||||||
|
env = gym.make('Walker2d-v2')
|
||||||
|
```
|
||||||
|
|
||||||
|
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
|
||||||
|
|
||||||
|
```
|
||||||
|
env = gym.make('Walker2d-v3', ctrl_cost_weight=0.1, ....)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Version History
|
||||||
|
|
||||||
|
* v4: all MuJoCo environments now use the MuJoCo bindings in mujoco>=2.1.3
|
||||||
|
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
|
||||||
|
* v2: All continuous control environments now use mujoco_py >= 1.50
|
||||||
|
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
|
||||||
|
* v0: Initial versions release (1.0.0)
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
    def __init__(
        self,
        xml_file="walker2d.xml",
        forward_reward_weight=1.0,
        ctrl_cost_weight=1e-3,
        healthy_reward=1.0,
        terminate_when_unhealthy=True,
        healthy_z_range=(0.8, 2.0),
        healthy_angle_range=(-1.0, 1.0),
        reset_noise_scale=5e-3,
        exclude_current_positions_from_observation=True,
    ):
        """Create a Walker2d environment.

        Args:
            xml_file: Path to the MuJoCo model XML loaded for the simulation.
            forward_reward_weight: Weight of the forward (x-velocity) reward term.
            ctrl_cost_weight: Weight of the quadratic control-cost penalty.
            healthy_reward: Constant alive bonus granted per step while healthy.
            terminate_when_unhealthy: If True, end the episode when the walker
                leaves the healthy z/angle ranges.
            healthy_z_range: (min, max) torso height considered healthy.
            healthy_angle_range: (min, max) torso angle considered healthy.
            reset_noise_scale: Half-width of the uniform noise added to the
                initial qpos/qvel on reset.
            exclude_current_positions_from_observation: If True (default), the
                x coordinate is dropped from the observation vector.
        """
        # NOTE: EzPickle captures the constructor arguments via locals(); this
        # call must stay first, before any local variable is introduced.
        utils.EzPickle.__init__(**locals())

        self._forward_reward_weight = forward_reward_weight
        self._ctrl_cost_weight = ctrl_cost_weight

        self._healthy_reward = healthy_reward
        self._terminate_when_unhealthy = terminate_when_unhealthy

        self._healthy_z_range = healthy_z_range
        self._healthy_angle_range = healthy_angle_range

        self._reset_noise_scale = reset_noise_scale

        self._exclude_current_positions_from_observation = (
            exclude_current_positions_from_observation
        )

        # frame_skip=4: each env step advances the simulation by 4 frames.
        mujoco_env.MujocoEnv.__init__(self, xml_file, 4)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def healthy_reward(self):
|
||||||
|
return (
|
||||||
|
float(self.is_healthy or self._terminate_when_unhealthy)
|
||||||
|
* self._healthy_reward
|
||||||
|
)
|
||||||
|
|
||||||
|
def control_cost(self, action):
|
||||||
|
control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
|
||||||
|
return control_cost
|
||||||
|
|
||||||
|
@property
|
||||||
|
def is_healthy(self):
|
||||||
|
z, angle = self.data.qpos[1:3]
|
||||||
|
|
||||||
|
min_z, max_z = self._healthy_z_range
|
||||||
|
min_angle, max_angle = self._healthy_angle_range
|
||||||
|
|
||||||
|
healthy_z = min_z < z < max_z
|
||||||
|
healthy_angle = min_angle < angle < max_angle
|
||||||
|
is_healthy = healthy_z and healthy_angle
|
||||||
|
|
||||||
|
return is_healthy
|
||||||
|
|
||||||
|
@property
|
||||||
|
def done(self):
|
||||||
|
done = not self.is_healthy if self._terminate_when_unhealthy else False
|
||||||
|
return done
|
||||||
|
|
||||||
|
def _get_obs(self):
|
||||||
|
position = self.data.qpos.flat.copy()
|
||||||
|
velocity = np.clip(self.data.qvel.flat.copy(), -10, 10)
|
||||||
|
|
||||||
|
if self._exclude_current_positions_from_observation:
|
||||||
|
position = position[1:]
|
||||||
|
|
||||||
|
observation = np.concatenate((position, velocity)).ravel()
|
||||||
|
return observation
|
||||||
|
|
||||||
|
def step(self, action):
|
||||||
|
x_position_before = self.data.qpos[0]
|
||||||
|
self.do_simulation(action, self.frame_skip)
|
||||||
|
x_position_after = self.data.qpos[0]
|
||||||
|
x_velocity = (x_position_after - x_position_before) / self.dt
|
||||||
|
|
||||||
|
ctrl_cost = self.control_cost(action)
|
||||||
|
|
||||||
|
forward_reward = self._forward_reward_weight * x_velocity
|
||||||
|
healthy_reward = self.healthy_reward
|
||||||
|
|
||||||
|
rewards = forward_reward + healthy_reward
|
||||||
|
costs = ctrl_cost
|
||||||
|
|
||||||
|
observation = self._get_obs()
|
||||||
|
reward = rewards - costs
|
||||||
|
done = self.done
|
||||||
|
info = {
|
||||||
|
"x_position": x_position_after,
|
||||||
|
"x_velocity": x_velocity,
|
||||||
|
}
|
||||||
|
|
||||||
|
return observation, reward, done, info
|
||||||
|
|
||||||
|
def reset_model(self):
|
||||||
|
noise_low = -self._reset_noise_scale
|
||||||
|
noise_high = self._reset_noise_scale
|
||||||
|
|
||||||
|
qpos = self.init_qpos + self.np_random.uniform(
|
||||||
|
low=noise_low, high=noise_high, size=self.model.nq
|
||||||
|
)
|
||||||
|
qvel = self.init_qvel + self.np_random.uniform(
|
||||||
|
low=noise_low, high=noise_high, size=self.model.nv
|
||||||
|
)
|
||||||
|
|
||||||
|
self.set_state(qpos, qvel)
|
||||||
|
|
||||||
|
observation = self._get_obs()
|
||||||
|
return observation
|
||||||
|
|
||||||
|
def viewer_setup(self):
|
||||||
|
for key, value in DEFAULT_CAMERA_CONFIG.items():
|
||||||
|
if isinstance(value, np.ndarray):
|
||||||
|
getattr(self.viewer.cam, key)[:] = value
|
||||||
|
else:
|
||||||
|
setattr(self.viewer.cam, key, value)
|
@@ -6,15 +6,14 @@ RUN apt-get -y update && apt-get install -y unzip libglu1-mesa-dev libgl1-mesa-d
|
|||||||
# Download mujoco
|
# Download mujoco
|
||||||
RUN mkdir /root/.mujoco && \
|
RUN mkdir /root/.mujoco && \
|
||||||
cd /root/.mujoco && \
|
cd /root/.mujoco && \
|
||||||
curl -O https://www.roboti.us/download/mjpro150_linux.zip && \
|
wget https://github.com/deepmind/mujoco/releases/download/2.1.0/mujoco210-linux-x86_64.tar.gz &&\
|
||||||
unzip mjpro150_linux.zip && \
|
tar -xf mujoco210-linux-x86_64.tar.gz
|
||||||
echo DUMMY_KEY > /root/.mujoco/mjkey.txt
|
|
||||||
|
|
||||||
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/root/.mujoco/mjpro150/bin
|
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/root/.mujoco/mujoco210/bin
|
||||||
|
|
||||||
COPY . /usr/local/gym/
|
COPY . /usr/local/gym/
|
||||||
WORKDIR /usr/local/gym/
|
WORKDIR /usr/local/gym/
|
||||||
|
|
||||||
RUN pip install .[nomujoco] && pip install -r test_requirements.txt
|
RUN pip install .[noatari] && pip install -r test_requirements.txt
|
||||||
|
|
||||||
ENTRYPOINT ["/usr/local/gym/bin/docker_entrypoint"]
|
ENTRYPOINT ["/usr/local/gym/bin/docker_entrypoint"]
|
||||||
|
@@ -1,7 +1,6 @@
|
|||||||
ale-py~=0.7
|
ale-py~=0.7
|
||||||
opencv-python>=3.0
|
opencv-python>=3.0
|
||||||
box2d-py==2.3.5
|
box2d-py==2.3.5
|
||||||
mujoco_py>=1.50, <2.0
|
|
||||||
scipy>=1.4.1
|
scipy>=1.4.1
|
||||||
numpy>=1.18.0
|
numpy>=1.18.0
|
||||||
pyglet>=1.4.0
|
pyglet>=1.4.0
|
||||||
|
10
setup.py
10
setup.py
@@ -15,19 +15,25 @@ extras = {
|
|||||||
"accept-rom-license": ["autorom[accept-rom-license]~=0.4.2"],
|
"accept-rom-license": ["autorom[accept-rom-license]~=0.4.2"],
|
||||||
"box2d": ["box2d-py==2.3.5", "pygame==2.1.0"],
|
"box2d": ["box2d-py==2.3.5", "pygame==2.1.0"],
|
||||||
"classic_control": ["pygame==2.1.0"],
|
"classic_control": ["pygame==2.1.0"],
|
||||||
"mujoco": ["mujoco_py>=1.50, <2.0"],
|
"mujoco_py": ["mujoco_py<2.2,>=2.1"],
|
||||||
|
"mujoco": ["mujoco==2.1.5", "imageio>=2.14.1"],
|
||||||
"toy_text": ["pygame==2.1.0", "scipy>=1.4.1"],
|
"toy_text": ["pygame==2.1.0", "scipy>=1.4.1"],
|
||||||
"other": ["lz4>=3.1.0", "opencv-python>=3.0", "matplotlib>=3.0"],
|
"other": ["lz4>=3.1.0", "opencv-python>=3.0", "matplotlib>=3.0"],
|
||||||
}
|
}
|
||||||
|
|
||||||
# Meta dependency groups.
|
# Meta dependency groups.
|
||||||
nomujoco_blacklist = {"mujoco", "accept-rom-license", "atari"}
|
nomujoco_blacklist = {"mujoco_py", "mujoco", "accept-rom-license", "atari"}
|
||||||
nomujoco_groups = set(extras.keys()) - nomujoco_blacklist
|
nomujoco_groups = set(extras.keys()) - nomujoco_blacklist
|
||||||
|
|
||||||
extras["nomujoco"] = list(
|
extras["nomujoco"] = list(
|
||||||
itertools.chain.from_iterable(map(lambda group: extras[group], nomujoco_groups))
|
itertools.chain.from_iterable(map(lambda group: extras[group], nomujoco_groups))
|
||||||
)
|
)
|
||||||
|
|
||||||
|
noatari_blacklist = {"accept-rom-license", "atari"}
|
||||||
|
noatari_groups = set(extras.keys()) - noatari_blacklist
|
||||||
|
extras["noatari"] = list(
|
||||||
|
itertools.chain.from_iterable(map(lambda group: extras[group], noatari_groups))
|
||||||
|
)
|
||||||
|
|
||||||
all_blacklist = {"accept-rom-license"}
|
all_blacklist = {"accept-rom-license"}
|
||||||
all_groups = set(extras.keys()) - all_blacklist
|
all_groups = set(extras.keys()) - all_blacklist
|
||||||
|
@@ -1,19 +1,14 @@
|
|||||||
import os
|
|
||||||
|
|
||||||
from gym import envs, logger
|
from gym import envs, logger
|
||||||
|
|
||||||
SKIP_MUJOCO_WARNING_MESSAGE = (
|
SKIP_MUJOCO_V3_WARNING_MESSAGE = (
|
||||||
"Cannot run mujoco test (either license key not found or mujoco not"
|
"Cannot run mujoco test because mujoco-py is not installed"
|
||||||
"installed properly)."
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
skip_mujoco_v3 = False
|
||||||
skip_mujoco = not (os.environ.get("MUJOCO_KEY"))
|
try:
|
||||||
if not skip_mujoco:
|
import mujoco_py # noqa:F401
|
||||||
try:
|
except ImportError:
|
||||||
import mujoco_py # noqa:F401
|
skip_mujoco_v3 = True
|
||||||
except ImportError:
|
|
||||||
skip_mujoco = True
|
|
||||||
|
|
||||||
|
|
||||||
def should_skip_env_spec_for_tests(spec):
|
def should_skip_env_spec_for_tests(spec):
|
||||||
@@ -21,7 +16,7 @@ def should_skip_env_spec_for_tests(spec):
|
|||||||
# troublesome to run frequently
|
# troublesome to run frequently
|
||||||
ep = spec.entry_point
|
ep = spec.entry_point
|
||||||
# Skip mujoco tests for pull request CI
|
# Skip mujoco tests for pull request CI
|
||||||
if skip_mujoco and ep.startswith("gym.envs.mujoco"):
|
if skip_mujoco_v3 and ep.startswith("gym.envs.mujoco"):
|
||||||
return True
|
return True
|
||||||
try:
|
try:
|
||||||
import gym.envs.atari # noqa:F401
|
import gym.envs.atari # noqa:F401
|
||||||
@@ -48,8 +43,19 @@ def should_skip_env_spec_for_tests(spec):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def skip_mujoco_py_env_for_test(spec):
|
||||||
|
ep = spec.entry_point
|
||||||
|
version = spec.version
|
||||||
|
if ep.startswith("gym.envs.mujoco") and version < 4:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
spec_list = [
|
spec_list = [
|
||||||
spec
|
spec
|
||||||
for spec in sorted(envs.registry.values(), key=lambda x: x.id)
|
for spec in sorted(envs.registry.values(), key=lambda x: x.id)
|
||||||
if spec.entry_point is not None and not should_skip_env_spec_for_tests(spec)
|
if spec.entry_point is not None and not should_skip_env_spec_for_tests(spec)
|
||||||
]
|
]
|
||||||
|
spec_list_no_mujoco_py = [
|
||||||
|
spec for spec in spec_list if not skip_mujoco_py_env_for_test(spec)
|
||||||
|
]
|
||||||
|
@@ -8,17 +8,19 @@ from gym.envs.registration import EnvSpec
|
|||||||
from gym.spaces.box import Box
|
from gym.spaces.box import Box
|
||||||
from gym.spaces.discrete import Discrete
|
from gym.spaces.discrete import Discrete
|
||||||
from gym.spaces.space import Space
|
from gym.spaces.space import Space
|
||||||
from tests.envs.spec_list import SKIP_MUJOCO_WARNING_MESSAGE, skip_mujoco, spec_list
|
from tests.envs.spec_list import (
|
||||||
|
SKIP_MUJOCO_V3_WARNING_MESSAGE,
|
||||||
|
skip_mujoco_v3,
|
||||||
|
spec_list,
|
||||||
|
)
|
||||||
|
|
||||||
ENVIRONMENT_IDS = ("HalfCheetah-v2",)
|
ENVIRONMENT_IDS = ("HalfCheetah-v2",)
|
||||||
|
|
||||||
|
|
||||||
def make_envs_by_action_space_type(spec_list: List[EnvSpec], action_space: Space):
|
def make_envs_by_action_space_type(spec_list: List[EnvSpec], action_space: Space):
|
||||||
"""Make environments of specific action_space type.
|
"""Make environments of specific action_space type.
|
||||||
|
|
||||||
This function returns a filtered list of environment from the
|
This function returns a filtered list of environment from the
|
||||||
spec_list that matches the action_space type.
|
spec_list that matches the action_space type.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
spec_list (list): list of registered environments' specification
|
spec_list (list): list of registered environments' specification
|
||||||
action_space (gym.spaces.Space): action_space type
|
action_space (gym.spaces.Space): action_space type
|
||||||
@@ -31,7 +33,7 @@ def make_envs_by_action_space_type(spec_list: List[EnvSpec], action_space: Space
|
|||||||
return filtered_envs
|
return filtered_envs
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(skip_mujoco, reason=SKIP_MUJOCO_WARNING_MESSAGE)
|
@pytest.mark.skipif(skip_mujoco_v3, reason=SKIP_MUJOCO_V3_WARNING_MESSAGE)
|
||||||
@pytest.mark.parametrize("environment_id", ENVIRONMENT_IDS)
|
@pytest.mark.parametrize("environment_id", ENVIRONMENT_IDS)
|
||||||
def test_serialize_deserialize(environment_id):
|
def test_serialize_deserialize(environment_id):
|
||||||
env = envs.make(environment_id)
|
env = envs.make(environment_id)
|
||||||
@@ -47,10 +49,8 @@ def test_serialize_deserialize(environment_id):
|
|||||||
@pytest.mark.parametrize("env", make_envs_by_action_space_type(spec_list, Discrete))
|
@pytest.mark.parametrize("env", make_envs_by_action_space_type(spec_list, Discrete))
|
||||||
def test_discrete_actions_out_of_bound(env):
|
def test_discrete_actions_out_of_bound(env):
|
||||||
"""Test out of bound actions in Discrete action_space.
|
"""Test out of bound actions in Discrete action_space.
|
||||||
|
|
||||||
In discrete action_space environments, `out-of-bound`
|
In discrete action_space environments, `out-of-bound`
|
||||||
actions are not allowed and should raise an exception.
|
actions are not allowed and should raise an exception.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
env (gym.Env): the gym environment
|
env (gym.Env): the gym environment
|
||||||
"""
|
"""
|
||||||
@@ -69,11 +69,9 @@ def test_discrete_actions_out_of_bound(env):
|
|||||||
)
|
)
|
||||||
def test_box_actions_out_of_bound(env, seed):
|
def test_box_actions_out_of_bound(env, seed):
|
||||||
"""Test out of bound actions in Box action_space.
|
"""Test out of bound actions in Box action_space.
|
||||||
|
|
||||||
Environments with Box actions spaces perform clipping inside `step`.
|
Environments with Box actions spaces perform clipping inside `step`.
|
||||||
The expected behaviour is that an action `out-of-bound` has the same effect
|
The expected behaviour is that an action `out-of-bound` has the same effect
|
||||||
of an action with value exactly at the upper (or lower) bound.
|
of an action with value exactly at the upper (or lower) bound.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
env (gym.Env): the gym environment
|
env (gym.Env): the gym environment
|
||||||
seed (int): seed value for determinism
|
seed (int): seed value for determinism
|
||||||
|
@@ -4,7 +4,7 @@ import pytest
|
|||||||
from gym import envs
|
from gym import envs
|
||||||
from gym.spaces import Box
|
from gym.spaces import Box
|
||||||
from gym.utils.env_checker import check_env
|
from gym.utils.env_checker import check_env
|
||||||
from tests.envs.spec_list import spec_list
|
from tests.envs.spec_list import spec_list, spec_list_no_mujoco_py
|
||||||
|
|
||||||
|
|
||||||
# This runs a smoketest on each official registered env. We may want
|
# This runs a smoketest on each official registered env. We may want
|
||||||
@@ -13,7 +13,9 @@ from tests.envs.spec_list import spec_list
|
|||||||
@pytest.mark.filterwarnings(
|
@pytest.mark.filterwarnings(
|
||||||
"ignore:.*We recommend you to use a symmetric and normalized Box action space.*"
|
"ignore:.*We recommend you to use a symmetric and normalized Box action space.*"
|
||||||
)
|
)
|
||||||
@pytest.mark.parametrize("spec", spec_list, ids=[spec.id for spec in spec_list])
|
@pytest.mark.parametrize(
|
||||||
|
"spec", spec_list_no_mujoco_py, ids=[spec.id for spec in spec_list_no_mujoco_py]
|
||||||
|
)
|
||||||
def test_env(spec):
|
def test_env(spec):
|
||||||
# Capture warnings
|
# Capture warnings
|
||||||
with pytest.warns(None) as warnings:
|
with pytest.warns(None) as warnings:
|
||||||
@@ -47,13 +49,15 @@ def test_env(spec):
|
|||||||
assert (
|
assert (
|
||||||
observation.dtype == ob_space.dtype
|
observation.dtype == ob_space.dtype
|
||||||
), f"Step observation dtype: {ob.dtype}, expected: {ob_space.dtype}"
|
), f"Step observation dtype: {ob.dtype}, expected: {ob_space.dtype}"
|
||||||
|
|
||||||
for mode in env.metadata.get("render_modes", []):
|
for mode in env.metadata.get("render_modes", []):
|
||||||
env.render(mode=mode)
|
if not (mode == "human" and spec.entry_point.startswith("gym.envs.mujoco")):
|
||||||
|
env.render(mode=mode)
|
||||||
|
|
||||||
# Make sure we can render the environment after close.
|
# Make sure we can render the environment after close.
|
||||||
for mode in env.metadata.get("render_modes", []):
|
for mode in env.metadata.get("render_modes", []):
|
||||||
env.render(mode=mode)
|
if not (mode == "human" and spec.entry_point.startswith("gym.envs.mujoco")):
|
||||||
|
|
||||||
|
env.render(mode=mode)
|
||||||
|
|
||||||
env.close()
|
env.close()
|
||||||
|
|
||||||
|
@@ -3,7 +3,7 @@ import unittest
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from gym import envs
|
from gym import envs
|
||||||
from tests.envs.spec_list import SKIP_MUJOCO_WARNING_MESSAGE, skip_mujoco
|
from tests.envs.spec_list import SKIP_MUJOCO_V3_WARNING_MESSAGE, skip_mujoco_v3
|
||||||
|
|
||||||
|
|
||||||
def verify_environments_match(
|
def verify_environments_match(
|
||||||
@@ -31,7 +31,7 @@ def verify_environments_match(
|
|||||||
np.testing.assert_allclose(old_info[key], new_info[key], atol=eps)
|
np.testing.assert_allclose(old_info[key], new_info[key], atol=eps)
|
||||||
|
|
||||||
|
|
||||||
@unittest.skipIf(skip_mujoco, SKIP_MUJOCO_WARNING_MESSAGE)
|
@unittest.skipIf(skip_mujoco_v3, SKIP_MUJOCO_V3_WARNING_MESSAGE)
|
||||||
class Mujocov2Tov3ConversionTest(unittest.TestCase):
|
class Mujocov2Tov3ConversionTest(unittest.TestCase):
|
||||||
def test_environments_match(self):
|
def test_environments_match(self):
|
||||||
test_cases = (
|
test_cases = (
|
||||||
|
Reference in New Issue
Block a user