Add new MuJoCo bindings (#2762)

This commit is contained in:
Rodrigo de Lazcano
2022-05-24 08:47:51 -04:00
committed by GitHub
parent 8f9b62f6a6
commit 3e006f3ea5
43 changed files with 3329 additions and 1523 deletions

View File

@@ -11,7 +11,6 @@ jobs:
- uses: actions/checkout@v2
- run: |
docker build -f py.Dockerfile \
--build-arg MUJOCO_KEY=$MUJOCO_KEY \
--build-arg PYTHON_VERSION=${{ matrix.python-version }} \
--tag gym-docker .
- name: Run tests

View File

@@ -9,7 +9,7 @@ repos:
hooks:
- id: codespell
args:
- --ignore-words-list=nd,reacher,thist,ths
- --ignore-words-list=nd,reacher,thist,ths,ure,referenc
- repo: https://gitlab.com/PyCQA/flake8
rev: 4.0.1
hooks:

View File

@@ -46,6 +46,11 @@ env.close()
Gym keeps strict versioning for reproducibility reasons. All environments end in a suffix like "\_v0". When changes are made to environments that might impact learning results, the number is increased by one to prevent potential confusion.
## MuJoCo Environments
The latest "\_v4" and future versions of the MuJoCo environments will no longer depend on `mujoco-py`. Instead `mujoco` will be the required dependency for future gym MuJoCo environment versions. Old gym MuJoCo environment versions that depend on `mujoco-py` will still be kept but unmaintained.
To install the dependencies for the latest gym MuJoCo environments use `pip install gym[mujoco]`. Dependencies for old MuJoCo environments can still be installed by `pip install gym[mujoco_py]`.
## Citation
A whitepaper from when Gym just came out is available https://arxiv.org/pdf/1606.01540, and can be cited with the following bibtex entry:

View File

@@ -162,6 +162,13 @@ register(
reward_threshold=-3.75,
)
register(
id="Reacher-v4",
entry_point="gym.envs.mujoco.reacher_v4:ReacherEnv",
max_episode_steps=50,
reward_threshold=-3.75,
)
register(
id="Pusher-v2",
entry_point="gym.envs.mujoco:PusherEnv",
@@ -169,6 +176,13 @@ register(
reward_threshold=0.0,
)
register(
id="Pusher-v4",
entry_point="gym.envs.mujoco.pusher_v4:PusherEnv",
max_episode_steps=100,
reward_threshold=0.0,
)
register(
id="InvertedPendulum-v2",
entry_point="gym.envs.mujoco:InvertedPendulumEnv",
@@ -176,6 +190,13 @@ register(
reward_threshold=950.0,
)
register(
id="InvertedPendulum-v4",
entry_point="gym.envs.mujoco.inverted_pendulum_v4:InvertedPendulumEnv",
max_episode_steps=1000,
reward_threshold=950.0,
)
register(
id="InvertedDoublePendulum-v2",
entry_point="gym.envs.mujoco:InvertedDoublePendulumEnv",
@@ -183,6 +204,13 @@ register(
reward_threshold=9100.0,
)
register(
id="InvertedDoublePendulum-v4",
entry_point="gym.envs.mujoco.inverted_double_pendulum_v4:InvertedDoublePendulumEnv",
max_episode_steps=1000,
reward_threshold=9100.0,
)
register(
id="HalfCheetah-v2",
entry_point="gym.envs.mujoco:HalfCheetahEnv",
@@ -197,6 +225,13 @@ register(
reward_threshold=4800.0,
)
register(
id="HalfCheetah-v4",
entry_point="gym.envs.mujoco.half_cheetah_v4:HalfCheetahEnv",
max_episode_steps=1000,
reward_threshold=4800.0,
)
register(
id="Hopper-v2",
entry_point="gym.envs.mujoco:HopperEnv",
@@ -211,6 +246,13 @@ register(
reward_threshold=3800.0,
)
register(
id="Hopper-v4",
entry_point="gym.envs.mujoco.hopper_v4:HopperEnv",
max_episode_steps=1000,
reward_threshold=3800.0,
)
register(
id="Swimmer-v2",
entry_point="gym.envs.mujoco:SwimmerEnv",
@@ -225,6 +267,13 @@ register(
reward_threshold=360.0,
)
register(
id="Swimmer-v4",
entry_point="gym.envs.mujoco.swimmer_v4:SwimmerEnv",
max_episode_steps=1000,
reward_threshold=360.0,
)
register(
id="Walker2d-v2",
max_episode_steps=1000,
@@ -237,6 +286,12 @@ register(
entry_point="gym.envs.mujoco.walker2d_v3:Walker2dEnv",
)
register(
id="Walker2d-v4",
max_episode_steps=1000,
entry_point="gym.envs.mujoco.walker2d_v4:Walker2dEnv",
)
register(
id="Ant-v2",
entry_point="gym.envs.mujoco:AntEnv",
@@ -251,6 +306,13 @@ register(
reward_threshold=6000.0,
)
register(
id="Ant-v4",
entry_point="gym.envs.mujoco.ant_v4:AntEnv",
max_episode_steps=1000,
reward_threshold=6000.0,
)
register(
id="Humanoid-v2",
entry_point="gym.envs.mujoco:HumanoidEnv",
@@ -263,8 +325,20 @@ register(
max_episode_steps=1000,
)
register(
id="Humanoid-v4",
entry_point="gym.envs.mujoco.humanoid_v4:HumanoidEnv",
max_episode_steps=1000,
)
register(
id="HumanoidStandup-v2",
entry_point="gym.envs.mujoco:HumanoidStandupEnv",
max_episode_steps=1000,
)
register(
id="HumanoidStandup-v4",
entry_point="gym.envs.mujoco.humanoidstandup_v4:HumanoidStandupEnv",
max_episode_steps=1000,
)

View File

@@ -9,6 +9,7 @@ from gym.envs.mujoco.humanoid import HumanoidEnv
from gym.envs.mujoco.humanoidstandup import HumanoidStandupEnv
from gym.envs.mujoco.inverted_double_pendulum import InvertedDoublePendulumEnv
from gym.envs.mujoco.inverted_pendulum import InvertedPendulumEnv
from gym.envs.mujoco.mujoco_rendering import RenderContextOffscreen, Viewer
from gym.envs.mujoco.pusher import PusherEnv
from gym.envs.mujoco.reacher import ReacherEnv
from gym.envs.mujoco.swimmer import SwimmerEnv

View File

@@ -6,7 +6,7 @@ from gym.envs.mujoco import mujoco_env
class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
def __init__(self):
mujoco_env.MujocoEnv.__init__(self, "ant.xml", 5)
mujoco_env.MujocoEnv.__init__(self, "ant.xml", 5, mujoco_bindings="mujoco_py")
utils.EzPickle.__init__(self)
def step(self, a):

View File

@@ -9,167 +9,6 @@ DEFAULT_CAMERA_CONFIG = {
class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
"""
### Description
This environment is based on the environment introduced by Schulman,
Moritz, Levine, Jordan and Abbeel in ["High-Dimensional Continuous Control
Using Generalized Advantage Estimation"](https://arxiv.org/abs/1506.02438).
The ant is a 3D robot consisting of one torso (free rotational body) with
four legs attached to it with each leg having two links. The goal is to
coordinate the four legs to move in the forward (right) direction by applying
torques on the eight hinges connecting the two links of each leg and the torso
(nine parts and eight hinges).
### Action Space
The action space is a `Box(-1, 1, (8,), float32)`. An action represents the torques applied at the hinge joints.
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|-----|----------------------|---------------|----------------|---------------------------------------|-------|------|
| 0 | Torque applied on the rotor between the torso and front left hip | -1 | 1 | hip_1 (front_left_leg) | hinge | torque (N m) |
| 1 | Torque applied on the rotor between the front left two links | -1 | 1 | angle_1 (front_left_leg) | hinge | torque (N m) |
| 2 | Torque applied on the rotor between the torso and front right hip | -1 | 1 | hip_2 (front_right_leg) | hinge | torque (N m) |
| 3 | Torque applied on the rotor between the front right two links | -1 | 1 | angle_2 (front_right_leg) | hinge | torque (N m) |
| 4 | Torque applied on the rotor between the torso and back left hip | -1 | 1 | hip_3 (back_leg) | hinge | torque (N m) |
| 5 | Torque applied on the rotor between the back left two links | -1 | 1 | angle_3 (back_leg) | hinge | torque (N m) |
| 6 | Torque applied on the rotor between the torso and back right hip | -1 | 1 | hip_4 (right_back_leg) | hinge | torque (N m) |
| 7 | Torque applied on the rotor between the back right two links | -1 | 1 | angle_4 (right_back_leg) | hinge | torque (N m) |
### Observation Space
Observations consist of positional values of different body parts of the ant,
followed by the velocities of those individual parts (their derivatives) with all
the positions ordered before all the velocities.
By default, observations do not include the x- and y-coordinates of the ant's torso. These may
be included by passing `exclude_current_positions_from_observation=False` during construction.
In that case, the observation space will have 113 dimensions where the first two dimensions
represent the x- and y- coordinates of the ant's torso.
Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates
of the torso will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively.
However, by default, an observation is a `ndarray` with shape `(111,)`
where the elements correspond to the following:
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|-----|-------------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
| 0 | z-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) |
| 1 | x-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
| 2 | y-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
| 3 | z-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
| 4 | w-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
| 5 | angle between torso and first link on front left | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) |
| 6 | angle between the two links on the front left | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) |
| 7 | angle between torso and first link on front right | -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) |
| 8 | angle between the two links on the front right | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) |
| 9 | angle between torso and first link on back left | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) |
| 10 | angle between the two links on the back left | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) |
| 11 | angle between torso and first link on back right | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) |
| 12 | angle between the two links on the back right | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) |
| 13 | x-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) |
| 14 | y-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) |
| 15 | z-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) |
| 16 | x-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) |
| 17 | y-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) |
| 18 | z-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) |
| 19 | angular velocity of angle between torso and front left link | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) |
| 20 | angular velocity of the angle between front left links | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) |
| 21 | angular velocity of angle between torso and front right link| -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) |
| 22 | angular velocity of the angle between front right links | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) |
| 23 | angular velocity of angle between torso and back left link | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) |
| 24 | angular velocity of the angle between back left links | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) |
| 25 | angular velocity of angle between torso and back right link | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) |
| 26 |angular velocity of the angle between back right links | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) |
The remaining 14*6 = 84 elements of the observation are contact forces
(external forces - force x, y, z and torque x, y, z) applied to the
center of mass of each of the links. The 14 links are: the ground link,
the torso link, and 3 links for each leg (1 + 1 + 12) with the 6 external forces.
The (x,y,z) coordinates are translational DOFs while the orientations are rotational
DOFs expressed as quaternions. One can read more about free joints on the [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html).
**Note:** There have been reported issues that using a Mujoco-Py version > 2.0 results
in the contact forces always being 0. As such we recommend to use a Mujoco-Py version < 2.0
when using the Ant environment if you would like to report results with contact forces (if
contact forces are not used in your experiments, you can use version > 2.0).
### Rewards
The reward consists of three parts:
- *healthy_reward*: Every timestep that the ant is healthy (see definition in section "Episode Termination"), it gets a reward of fixed value `healthy_reward`
- *forward_reward*: A reward of moving forward which is measured as
*(x-coordinate before action - x-coordinate after action)/dt*. *dt* is the time
between actions and is dependent on the `frame_skip` parameter (default is 5),
where the frametime is 0.01 - making the default *dt = 5 * 0.01 = 0.05*.
This reward would be positive if the ant moves forward (in positive x direction).
- *ctrl_cost*: A negative reward for penalising the ant if it takes actions
that are too large. It is measured as *`ctrl_cost_weight` * sum(action<sup>2</sup>)*
where *`ctrl_cost_weight`* is a parameter set for the control and has a default value of 0.5.
- *contact_cost*: A negative reward for penalising the ant if the external contact
force is too large. It is calculated *`contact_cost_weight` * sum(clip(external contact
force to `contact_force_range`)<sup>2</sup>)*.
The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost - contact_cost* and `info` will also contain the individual reward terms.
### Starting State
All observations start in state
(0.0, 0.0, 0.75, 1.0, 0.0 ... 0.0) with a uniform noise in the range
of [-`reset_noise_scale`, `reset_noise_scale`] added to the positional values and standard normal noise
with mean 0 and standard deviation `reset_noise_scale` added to the velocity values for
stochasticity. Note that the initial z coordinate is intentionally selected
to be slightly high, thereby indicating a standing up ant. The initial orientation
is designed to make it face forward as well.
### Episode Termination
The ant is said to be unhealthy if any of the following happens:
1. Any of the state space values is no longer finite
2. The z-coordinate of the torso is **not** in the closed interval given by `healthy_z_range` (defaults to [0.2, 1.0])
If `terminate_when_unhealthy=True` is passed during construction (which is the default),
the episode terminates when any of the following happens:
1. The episode duration reaches a 1000 timesteps
2. The ant is unhealthy
If `terminate_when_unhealthy=False` is passed, the episode is terminated only when 1000 timesteps are exceeded.
### Arguments
No additional arguments are currently supported in v2 and lower.
```
env = gym.make('Ant-v2')
```
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
```
env = gym.make('Ant-v3', ctrl_cost_weight=0.1, ...)
```
| Parameter | Type | Default |Description |
|-------------------------|------------|--------------|-------------------------------|
| `xml_file` | **str** | `"ant.xml"` | Path to a MuJoCo model |
| `ctrl_cost_weight` | **float** | `0.5` | Weight for *ctrl_cost* term (see section on reward) |
| `contact_cost_weight` | **float** | `5e-4` | Weight for *contact_cost* term (see section on reward) |
| `healthy_reward` | **float** | `1` | Constant reward given if the ant is "healthy" after timestep |
| `terminate_when_unhealthy` | **bool**| `True` | If true, issue a done signal if the z-coordinate of the torso is no longer in the `healthy_z_range` |
| `healthy_z_range` | **tuple** | `(0.2, 1)` | The ant is considered healthy if the z-coordinate of the torso is in this range |
| `contact_force_range` | **tuple** | `(-1, 1)` | Contact forces are clipped to this range in the computation of *contact_cost* |
| `reset_noise_scale` | **float** | `0.1` | Scale of random perturbations of initial position and velocity (see section on Starting State) |
| `exclude_current_positions_from_observation`| **bool** | `True`| Whether or not to omit the x- and y-coordinates from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies |
### Version History
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
* v2: All continuous control environments now use mujoco_py >= 1.50
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
* v0: Initial versions release (1.0.0)
"""
def __init__(
self,
xml_file="ant.xml",
@@ -199,7 +38,7 @@ class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
exclude_current_positions_from_observation
)
mujoco_env.MujocoEnv.__init__(self, xml_file, 5)
mujoco_env.MujocoEnv.__init__(self, xml_file, 5, mujoco_bindings="mujoco_py")
@property
def healthy_reward(self):

308
gym/envs/mujoco/ant_v4.py Normal file
View File

@@ -0,0 +1,308 @@
import numpy as np
from gym import utils
from gym.envs.mujoco import mujoco_env
# Default tracking-camera settings applied by `viewer_setup`.
DEFAULT_CAMERA_CONFIG = {"distance": 4.0}
class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
    """
    ### Description
    This environment is based on the environment introduced by Schulman,
    Moritz, Levine, Jordan and Abbeel in ["High-Dimensional Continuous Control
    Using Generalized Advantage Estimation"](https://arxiv.org/abs/1506.02438).
    The ant is a 3D robot consisting of one torso (free rotational body) with
    four legs attached to it with each leg having two links. The goal is to
    coordinate the four legs to move in the forward (right) direction by applying
    torques on the eight hinges connecting the two links of each leg and the torso
    (nine parts and eight hinges).

    ### Action Space
    The agent takes an 8-element vector for actions.
    The action space is a continuous `(action, action, action, action, action, action,
    action, action)` all in `[-1, 1]`, where `action` represents the numerical torques
    applied at the hinge joints.

    | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
    |-----|----------------------|---------------|----------------|---------------------------------------|-------|------|
    | 0 | Torque applied on the rotor between the torso and front left hip | -1 | 1 | hip_1 (front_left_leg) | hinge | torque (N m) |
    | 1 | Torque applied on the rotor between the front left two links | -1 | 1 | angle_1 (front_left_leg) | hinge | torque (N m) |
    | 2 | Torque applied on the rotor between the torso and front right hip | -1 | 1 | hip_2 (front_right_leg) | hinge | torque (N m) |
    | 3 | Torque applied on the rotor between the front right two links | -1 | 1 | angle_2 (front_right_leg) | hinge | torque (N m) |
    | 4 | Torque applied on the rotor between the torso and back left hip | -1 | 1 | hip_3 (back_leg) | hinge | torque (N m) |
    | 5 | Torque applied on the rotor between the back left two links | -1 | 1 | angle_3 (back_leg) | hinge | torque (N m) |
    | 6 | Torque applied on the rotor between the torso and back right hip | -1 | 1 | hip_4 (right_back_leg) | hinge | torque (N m) |
    | 7 | Torque applied on the rotor between the back right two links | -1 | 1 | angle_4 (right_back_leg) | hinge | torque (N m) |

    ### Observation Space
    The state space consists of positional values of different body parts of the ant,
    followed by the velocities of those individual parts (their derivatives) with all
    the positions ordered before all the velocities.
    The observation is a `ndarray` with shape `(111,)` where the elements correspond to the following:

    | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
    |-----|-------------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
    | 0 | x-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) |
    | 1 | y-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) |
    | 2 | z-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) |
    | 3 | x-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
    | 4 | y-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
    | 5 | z-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
    | 6 | w-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
    | 7 | angle between torso and first link on front left | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) |
    | 8 | angle between the two links on the front left | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) |
    | 9 | angle between torso and first link on front right | -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) |
    | 10 | angle between the two links on the front right | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) |
    | 11 | angle between torso and first link on back left | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) |
    | 12 | angle between the two links on the back left | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) |
    | 13 | angle between torso and first link on back right | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) |
    | 14 | angle between the two links on the back right | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) |
    | 15 | x-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) |
    | 16 | y-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) |
    | 17 | z-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) |
    | 18 | x-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) |
    | 19 | y-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) |
    | 20 | z-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) |
    | 21 | angular velocity of angle between torso and front left link | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) |
    | 22 | angular velocity of the angle between front left links | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) |
    | 23 | angular velocity of angle between torso and front right link| -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) |
    | 24 | angular velocity of the angle between front right links | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) |
    | 25 | angular velocity of angle between torso and back left link | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) |
    | 26 | angular velocity of the angle between back left links | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) |
    | 27 | angular velocity of angle between torso and back right link | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) |
    | 28 | angular velocity of the angle between back right links | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) |

    The remaining 14*6 = 84 elements in the state are contact forces
    (external forces - force x, y, z and torque x, y, z) applied to the
    center of mass of each of the links. The 14 links are: the ground link,
    the torso link, and 3 links for each leg (1 + 1 + 12) with the 6 external forces.

    The (x,y,z) coordinates are translational DOFs while the orientations are rotational
    DOFs expressed as quaternions. One can read more about free joints on the
    [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html).

    **Note:** There are 29 elements in the table above - giving rise to `(113,)` elements
    in the state space. In practice (and Gym implementation), the first two positional
    elements are omitted from the state space since the reward function is calculated based
    on the x-coordinate value. This value is hidden from the algorithm, which in turn has to
    develop an abstract understanding of it from the observed rewards. Therefore, observation
    space has shape `(111,)` instead of `(113,)` and the table should not have the first two rows.

    **Note:** The Ant-v4 environment no longer has the following contact forces issue.
    For Ant versions prior to v4, there have been reported issues that using a Mujoco-Py version > 2.0
    results in the contact forces always being 0. As such we recommend to use a Mujoco-Py version < 2.0
    when using the Ant environment if you would like to report results with contact forces (if
    contact forces are not used in your experiments, you can use version > 2.0).

    **Note:** Ant-v4 has the option of including contact forces in the observation space. To add
    contact forces set the argument `use_contact_forces` to True. The default value is False. Also
    note that training including contact forces can perform worse than not using them as shown in
    (https://github.com/openai/gym/pull/2762).

    ### Rewards
    The reward consists of three parts:
    - *healthy_reward*: Every timestep that the ant is healthy, it gets a reward of 1.
    - *forward_reward*: A reward of moving forward which is measured as
    *(x-coordinate before action - x-coordinate after action)/dt*. *dt* is the time
    between actions and is dependent on the frame_skip parameter (default is 5),
    where the *dt* for one frame is 0.01 - making the default *dt = 5 * 0.01 = 0.05*.
    This reward would be positive if the ant moves forward (right) as desired.
    - *ctrl_cost*: A negative reward for penalising the ant if it takes actions
    that are too large. It is measured as *coefficient **x** sum(action<sup>2</sup>)*
    where *coefficient* is a parameter set for the control and has a default value of 0.5.
    - *contact_cost*: A negative reward for penalising the ant if the external contact
    force is too large. It is calculated *0.5 * 0.001 * sum(clip(external contact
    force to [-1,1])<sup>2</sup>)*.

    The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost - contact_cost*

    ### Starting State
    All observations start in state
    (0.0, 0.0, 0.75, 1.0, 0.0 ... 0.0) with a uniform noise in the range
    of [-0.1, 0.1] added to the positional values and standard normal noise
    with 0 mean and 0.1 standard deviation added to the velocity values for
    stochasticity. Note that the initial z coordinate is intentionally selected
    to be slightly high, thereby indicating a standing up ant. The initial orientation
    is designed to make it face forward as well.

    ### Episode Termination
    The episode terminates when any of the following happens:

    1. The episode duration reaches a 1000 timesteps
    2. Any of the state space values is no longer finite
    3. The z-coordinate of the torso is **not** in the range `healthy_z_range` (default `[0.2, 1.0]`)

    ### Arguments
    No additional arguments are currently supported (in v2 and lower), but modifications
    can be made to the XML file in the assets folder (or by changing the path to a modified
    XML file in another folder).

    ```
    env = gym.make('Ant-v2')
    ```

    v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.

    ```
    env = gym.make('Ant-v3', ctrl_cost_weight=0.1, ...)
    ```

    ### Version History
    * v4: all mujoco environments now use the mujoco bindings in mujoco>=2.1.3
    * v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
    * v2: All continuous control environments now use mujoco_py >= 1.50
    * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
    * v0: Initial versions release (1.0.0)
    """

    def __init__(
        self,
        xml_file="ant.xml",
        ctrl_cost_weight=0.5,
        use_contact_forces=False,
        contact_cost_weight=5e-4,
        healthy_reward=1.0,
        terminate_when_unhealthy=True,
        healthy_z_range=(0.2, 1.0),
        contact_force_range=(-1.0, 1.0),
        reset_noise_scale=0.1,
        exclude_current_positions_from_observation=True,
    ):
        # EzPickle captures all constructor kwargs so the env can be pickled.
        utils.EzPickle.__init__(**locals())

        self._ctrl_cost_weight = ctrl_cost_weight
        self._contact_cost_weight = contact_cost_weight

        self._healthy_reward = healthy_reward
        self._terminate_when_unhealthy = terminate_when_unhealthy
        self._healthy_z_range = healthy_z_range

        self._contact_force_range = contact_force_range

        self._reset_noise_scale = reset_noise_scale

        self._use_contact_forces = use_contact_forces

        self._exclude_current_positions_from_observation = (
            exclude_current_positions_from_observation
        )

        # frame_skip=5 with a 0.01s frametime gives dt = 0.05s per step.
        mujoco_env.MujocoEnv.__init__(self, xml_file, 5)

    @property
    def healthy_reward(self):
        """Fixed bonus per step while healthy (always granted when
        `terminate_when_unhealthy` is False)."""
        return (
            float(self.is_healthy or self._terminate_when_unhealthy)
            * self._healthy_reward
        )

    def control_cost(self, action):
        """Quadratic penalty on action magnitude: w * sum(action**2)."""
        control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
        return control_cost

    @property
    def contact_forces(self):
        """External contact forces (cfrc_ext) clipped to `contact_force_range`."""
        raw_contact_forces = self.data.cfrc_ext
        min_value, max_value = self._contact_force_range
        contact_forces = np.clip(raw_contact_forces, min_value, max_value)
        return contact_forces

    @property
    def contact_cost(self):
        """Quadratic penalty on the clipped external contact forces."""
        contact_cost = self._contact_cost_weight * np.sum(
            np.square(self.contact_forces)
        )
        return contact_cost

    @property
    def is_healthy(self):
        """Healthy iff all state values are finite and the torso z-coordinate
        lies within `healthy_z_range`."""
        state = self.state_vector()
        min_z, max_z = self._healthy_z_range
        is_healthy = np.isfinite(state).all() and min_z <= state[2] <= max_z
        return is_healthy

    @property
    def done(self):
        done = not self.is_healthy if self._terminate_when_unhealthy else False
        return done

    def step(self, action):
        """Advance the simulation by `frame_skip` frames under `action`.

        Returns (observation, reward, done, info); info carries the
        individual reward terms and torso kinematics.
        """
        xy_position_before = self.get_body_com("torso")[:2].copy()
        self.do_simulation(action, self.frame_skip)
        xy_position_after = self.get_body_com("torso")[:2].copy()

        xy_velocity = (xy_position_after - xy_position_before) / self.dt
        x_velocity, y_velocity = xy_velocity

        forward_reward = x_velocity
        healthy_reward = self.healthy_reward

        rewards = forward_reward + healthy_reward

        costs = ctrl_cost = self.control_cost(action)

        done = self.done
        observation = self._get_obs()
        info = {
            "reward_forward": forward_reward,
            "reward_ctrl": -ctrl_cost,
            "reward_survive": healthy_reward,
            "x_position": xy_position_after[0],
            "y_position": xy_position_after[1],
            "distance_from_origin": np.linalg.norm(xy_position_after, ord=2),
            "x_velocity": x_velocity,
            "y_velocity": y_velocity,
            "forward_reward": forward_reward,
        }
        if self._use_contact_forces:
            contact_cost = self.contact_cost
            costs += contact_cost
            # Bug fix: the contact cost was previously stored under
            # "reward_ctrl", silently overwriting the control-cost entry.
            info["reward_contact"] = -contact_cost

        reward = rewards - costs

        return observation, reward, done, info

    def _get_obs(self):
        position = self.data.qpos.flat.copy()
        velocity = self.data.qvel.flat.copy()

        # Drop the torso x/y positions so policies stay position-agnostic.
        if self._exclude_current_positions_from_observation:
            position = position[2:]

        if self._use_contact_forces:
            contact_force = self.contact_forces.flat.copy()
            return np.concatenate((position, velocity, contact_force))
        else:
            return np.concatenate((position, velocity))

    def reset_model(self):
        """Reset to the initial pose plus uniform position noise and Gaussian
        velocity noise, both scaled by `reset_noise_scale`."""
        noise_low = -self._reset_noise_scale
        noise_high = self._reset_noise_scale

        qpos = self.init_qpos + self.np_random.uniform(
            low=noise_low, high=noise_high, size=self.model.nq
        )
        qvel = (
            self.init_qvel
            + self._reset_noise_scale * self.np_random.standard_normal(self.model.nv)
        )
        self.set_state(qpos, qvel)

        observation = self._get_obs()

        return observation

    def viewer_setup(self):
        """Apply DEFAULT_CAMERA_CONFIG to the viewer camera; array-valued
        settings are copied element-wise."""
        for key, value in DEFAULT_CAMERA_CONFIG.items():
            if isinstance(value, np.ndarray):
                getattr(self.viewer.cam, key)[:] = value
            else:
                setattr(self.viewer.cam, key, value)

View File

@@ -22,6 +22,6 @@
</body>
</worldbody>
<actuator>
<motor gear="100" joint="slider" name="slide"/>
<motor ctrllimited="true" ctrlrange="-3 3" gear="100" joint="slider" name="slide"/>
</actuator>
</mujoco>

View File

@@ -6,7 +6,9 @@ from gym.envs.mujoco import mujoco_env
class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle):
def __init__(self):
mujoco_env.MujocoEnv.__init__(self, "half_cheetah.xml", 5)
mujoco_env.MujocoEnv.__init__(
self, "half_cheetah.xml", 5, mujoco_bindings="mujoco_py"
)
utils.EzPickle.__init__(self)
def step(self, action):

View File

@@ -10,125 +10,6 @@ DEFAULT_CAMERA_CONFIG = {
class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle):
"""
### Description
This environment is based on the work by P. Wawrzyński in
["A Cat-Like Robot Real-Time Learning to Run"](http://staff.elka.pw.edu.pl/~pwawrzyn/pub-s/0812_LSCLRR.pdf).
The HalfCheetah is a 2-dimensional robot consisting of 9 links and 8
joints connecting them (including two paws). The goal is to apply a torque
on the joints to make the cheetah run forward (right) as fast as possible,
with a positive reward allocated based on the distance moved forward and a
negative reward allocated for moving backward. The torso and head of the
cheetah are fixed, and the torque can only be applied on the other 6 joints
over the front and back thighs (connecting to the torso), shins
(connecting to the thighs) and feet (connecting to the shins).
### Action Space
The action space is a `Box(-1, 1, (6,), float32)`. An action represents the torques applied between *links*.
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|-------|--------------------------------------|---------------|----------------|---------------------------------------|-------|------|
| 0 | Torque applied on the back thigh rotor | -1 | 1 | bthigh | hinge | torque (N m) |
| 1 | Torque applied on the back shin rotor | -1 | 1 | bshin | hinge | torque (N m) |
| 2 | Torque applied on the back foot rotor | -1 | 1 | bfoot | hinge | torque (N m) |
| 3 | Torque applied on the front thigh rotor| -1 | 1 | fthigh | hinge | torque (N m) |
| 4 | Torque applied on the front shin rotor | -1 | 1 | fshin | hinge | torque (N m) |
| 5 | Torque applied on the front foot rotor | -1 | 1 | ffoot | hinge | torque (N m) |
### Observation Space
Observations consist of positional values of different body parts of the
cheetah, followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities.
By default, observations do not include the x-coordinate of the cheetah's center of mass. It may
be included by passing `exclude_current_positions_from_observation=False` during construction.
In that case, the observation space will have 18 dimensions where the first dimension
represents the x-coordinate of the cheetah's center of mass.
Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x-coordinate
will be returned in `info` with key `"x_position"`.
However, by default, the observation is a `ndarray` with shape `(17,)` where the elements correspond to the following:
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
| 0 | z-coordinate of the front tip | -Inf | Inf | rootz | slide | position (m) |
| 1 | angle of the front tip | -Inf | Inf | rooty | hinge | angle (rad) |
| 2 | angle of the second rotor | -Inf | Inf | bthigh | hinge | angle (rad) |
| 3 | angle of the second rotor | -Inf | Inf | bshin | hinge | angle (rad) |
| 4 | velocity of the tip along the x-axis | -Inf | Inf | bfoot | hinge | angle (rad) |
| 5 | velocity of the tip along the y-axis | -Inf | Inf | fthigh | hinge | angle (rad) |
| 6 | angular velocity of front tip | -Inf | Inf | fshin | hinge | angle (rad) |
| 7 | angular velocity of second rotor | -Inf | Inf | ffoot | hinge | angle (rad) |
| 8 | x-coordinate of the front tip | -Inf | Inf | rootx | slide | velocity (m/s) |
| 9 | y-coordinate of the front tip | -Inf | Inf | rootz | slide | velocity (m/s) |
| 10 | angle of the front tip | -Inf | Inf | rooty | hinge | angular velocity (rad/s) |
| 11 | angle of the second rotor | -Inf | Inf | bthigh | hinge | angular velocity (rad/s) |
| 12 | angle of the second rotor | -Inf | Inf | bshin | hinge | angular velocity (rad/s) |
| 13 | velocity of the tip along the x-axis | -Inf | Inf | bfoot | hinge | angular velocity (rad/s) |
| 14 | velocity of the tip along the y-axis | -Inf | Inf | fthigh | hinge |angular velocity (rad/s) |
| 15 | angular velocity of front tip | -Inf | Inf | fshin | hinge | angular velocity (rad/s) |
| 16 | angular velocity of second rotor | -Inf | Inf | ffoot | hinge | angular velocity (rad/s) |
### Rewards
The reward consists of two parts:
- *forward_reward*: A reward of moving forward which is measured
as *`forward_reward_weight` * (x-coordinate before action - x-coordinate after action)/dt*. *dt* is
the time between actions and is dependent on the frame_skip parameter
(fixed to 5), where the frametime is 0.01 - making the
default *dt = 5 * 0.01 = 0.05*. This reward would be positive if the cheetah
runs forward (right).
- *ctrl_cost*: A cost for penalising the cheetah if it takes
actions that are too large. It is measured as *`ctrl_cost_weight` *
sum(action<sup>2</sup>)* where *`ctrl_cost_weight`* is a parameter set for the
control and has a default value of 0.1
The total reward returned is ***reward*** *=* *forward_reward - ctrl_cost* and `info` will also contain the individual reward terms
### Starting State
All observations start in state (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,) with a noise added to the
initial state for stochasticity. As seen before, the first 8 values in the
state are positional and the last 9 values are velocity. A uniform noise in
the range of [-`reset_noise_scale`, `reset_noise_scale`] is added to the positional values while a standard
normal noise with a mean of 0 and standard deviation of `reset_noise_scale` is added to the
initial velocity values of all zeros.
### Episode Termination
The episode terminates when the episode length is greater than 1000.
### Arguments
No additional arguments are currently supported in v2 and lower.
```
env = gym.make('HalfCheetah-v2')
```
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
```
env = gym.make('HalfCheetah-v3', ctrl_cost_weight=0.1, ....)
```
| Parameter | Type | Default |Description |
|-------------------------|------------|----------------------|-------------------------------|
| `xml_file` | **str** | `"half_cheetah.xml"` | Path to a MuJoCo model |
| `forward_reward_weight` | **float** | `1.0` | Weight for *forward_reward* term (see section on reward) |
| `ctrl_cost_weight` | **float** | `0.1` | Weight for *ctrl_cost* weight (see section on reward) |
| `reset_noise_scale` | **float** | `0.1` | Scale of random perturbations of initial position and velocity (see section on Starting State) |
| `exclude_current_positions_from_observation`| **bool** | `True` | Whether or not to omit the x-coordinate from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies |
### Version History
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
* v2: All continuous control environments now use mujoco_py >= 1.50
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
* v0: Initial versions release (1.0.0)
"""
def __init__(
self,
xml_file="half_cheetah.xml",
@@ -149,7 +30,7 @@ class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle):
exclude_current_positions_from_observation
)
mujoco_env.MujocoEnv.__init__(self, xml_file, 5)
mujoco_env.MujocoEnv.__init__(self, xml_file, 5, mujoco_bindings="mujoco_py")
def control_cost(self, action):
control_cost = self._ctrl_cost_weight * np.sum(np.square(action))

View File

@@ -0,0 +1,230 @@
__credits__ = ["Rushiv Arora"]

import numpy as np

from gym import utils
from gym.envs.mujoco import mujoco_env

# Default free-camera settings; `HalfCheetahEnv.viewer_setup` copies each
# entry onto the viewer camera.
DEFAULT_CAMERA_CONFIG = {
    "distance": 4.0,
}
class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle):
    """
    ### Description
    This environment is based on the work by P. Wawrzyński in
    ["A Cat-Like Robot Real-Time Learning to Run"](http://staff.elka.pw.edu.pl/~pwawrzyn/pub-s/0812_LSCLRR.pdf).
    The HalfCheetah is a 2-dimensional robot consisting of 9 links and 8
    joints connecting them (including two paws). The goal is to apply a torque
    on the joints to make the cheetah run forward (right) as fast as possible,
    with a positive reward allocated based on the distance moved forward and a
    negative reward allocated for moving backward. The torso and head of the
    cheetah are fixed, and the torque can only be applied on the other 6 joints
    over the front and back thighs (connecting to the torso), shins
    (connecting to the thighs) and feet (connecting to the shins).

    ### Action Space
    The agent takes a 6-element vector for actions.
    The action space is a continuous `(action, action, action, action, action, action)` all in `[-1.0, 1.0]`, where `action` represents the numerical torques applied between *links*

    | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
    |-------|--------------------------------------|---------------|----------------|---------------------------------------|-------|------|
    | 0 | Torque applied on the back thigh rotor | -1 | 1 | bthigh | hinge | torque (N m) |
    | 1 | Torque applied on the back shin rotor | -1 | 1 | bshin | hinge | torque (N m) |
    | 2 | Torque applied on the back foot rotor | -1 | 1 | bfoot | hinge | torque (N m) |
    | 3 | Torque applied on the front thigh rotor| -1 | 1 | fthigh | hinge | torque (N m) |
    | 4 | Torque applied on the front shin rotor | -1 | 1 | fshin | hinge | torque (N m) |
    | 5 | Torque applied on the front foot rotor | -1 | 1 | ffoot | hinge | torque (N m) |

    ### Observation Space
    The state space consists of positional values of different body parts of the
    cheetah, followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities.

    The full state is a `ndarray` with shape `(18,)` where the elements correspond to the following:

    | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
    |-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
    | 0 | x-coordinate of the front tip | -Inf | Inf | rootx | slide | position (m) |
    | 1 | z-coordinate of the front tip | -Inf | Inf | rootz | slide | position (m) |
    | 2 | angle of the front tip | -Inf | Inf | rooty | hinge | angle (rad) |
    | 3 | angle of the back thigh | -Inf | Inf | bthigh | hinge | angle (rad) |
    | 4 | angle of the back shin | -Inf | Inf | bshin | hinge | angle (rad) |
    | 5 | angle of the back foot | -Inf | Inf | bfoot | hinge | angle (rad) |
    | 6 | angle of the front thigh | -Inf | Inf | fthigh | hinge | angle (rad) |
    | 7 | angle of the front shin | -Inf | Inf | fshin | hinge | angle (rad) |
    | 8 | angle of the front foot | -Inf | Inf | ffoot | hinge | angle (rad) |
    | 9 | velocity of the front tip along the x-axis | -Inf | Inf | rootx | slide | velocity (m/s) |
    | 10 | velocity of the front tip along the z-axis | -Inf | Inf | rootz | slide | velocity (m/s) |
    | 11 | angular velocity of the front tip | -Inf | Inf | rooty | hinge | angular velocity (rad/s) |
    | 12 | angular velocity of the back thigh | -Inf | Inf | bthigh | hinge | angular velocity (rad/s) |
    | 13 | angular velocity of the back shin | -Inf | Inf | bshin | hinge | angular velocity (rad/s) |
    | 14 | angular velocity of the back foot | -Inf | Inf | bfoot | hinge | angular velocity (rad/s) |
    | 15 | angular velocity of the front thigh | -Inf | Inf | fthigh | hinge | angular velocity (rad/s) |
    | 16 | angular velocity of the front shin | -Inf | Inf | fshin | hinge | angular velocity (rad/s) |
    | 17 | angular velocity of the front foot | -Inf | Inf | ffoot | hinge | angular velocity (rad/s) |

    **Note:**
    In practice (and in the Gym implementation), the first positional element
    (the x-coordinate of the front tip) is omitted from the observation, since
    the reward function is calculated based on that value. This value is
    hidden from the algorithm, which in turn has to develop an abstract
    understanding of it from the observed rewards.
    Therefore, the observation space has shape `(17,)` and looks like:

    | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
    |-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
    | 0 | z-coordinate of the front tip | -Inf | Inf | rootz | slide | position (m) |
    | 1 | angle of the front tip | -Inf | Inf | rooty | hinge | angle (rad) |
    | 2 | angle of the back thigh | -Inf | Inf | bthigh | hinge | angle (rad) |
    | 3 | angle of the back shin | -Inf | Inf | bshin | hinge | angle (rad) |
    | 4 | angle of the back foot | -Inf | Inf | bfoot | hinge | angle (rad) |
    | 5 | angle of the front thigh | -Inf | Inf | fthigh | hinge | angle (rad) |
    | 6 | angle of the front shin | -Inf | Inf | fshin | hinge | angle (rad) |
    | 7 | angle of the front foot | -Inf | Inf | ffoot | hinge | angle (rad) |
    | 8 | velocity of the front tip along the x-axis | -Inf | Inf | rootx | slide | velocity (m/s) |
    | 9 | velocity of the front tip along the z-axis | -Inf | Inf | rootz | slide | velocity (m/s) |
    | 10 | angular velocity of the front tip | -Inf | Inf | rooty | hinge | angular velocity (rad/s) |
    | 11 | angular velocity of the back thigh | -Inf | Inf | bthigh | hinge | angular velocity (rad/s) |
    | 12 | angular velocity of the back shin | -Inf | Inf | bshin | hinge | angular velocity (rad/s) |
    | 13 | angular velocity of the back foot | -Inf | Inf | bfoot | hinge | angular velocity (rad/s) |
    | 14 | angular velocity of the front thigh | -Inf | Inf | fthigh | hinge |angular velocity (rad/s) |
    | 15 | angular velocity of the front shin | -Inf | Inf | fshin | hinge | angular velocity (rad/s) |
    | 16 | angular velocity of the front foot | -Inf | Inf | ffoot | hinge | angular velocity (rad/s) |

    ### Rewards
    The reward consists of two parts:
    - *reward_run*: A reward of moving forward which is measured
    as *(x-coordinate before action - x-coordinate after action)/dt*. *dt* is
    the time between actions and is dependent on the frame_skip parameter
    (default is 5), where the *dt* for one frame is 0.01 - making the
    default *dt = 5*0.01 = 0.05*. This reward would be positive if the cheetah
    runs forward (right) as desired.
    - *reward_control*: A negative reward for penalising the cheetah if it takes
    actions that are too large. It is measured as *-coefficient x
    sum(action<sup>2</sup>)* where *coefficient* is a parameter set for the
    control and has a default value of 0.1

    The total reward returned is ***reward*** *=* *reward_run + reward_control*

    ### Starting State
    All observations start in state (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
    0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,) with a noise added to the
    initial state for stochasticity. A uniform noise in
    the range of [-0.1, 0.1] is added to the positional values while a standard
    normal noise with a mean of 0 and standard deviation of 0.1 is added to the
    initial velocity values of all zeros.

    ### Episode Termination
    The episode terminates when the episode length is greater than 1000.

    ### Arguments
    No additional arguments are currently supported (in v2 and lower), but
    modifications can be made to the XML file in the assets folder at
    `gym/envs/mujoco/assets/half_cheetah.xml` (or by changing the path to a
    modified XML file in another folder).

    ```
    env = gym.make('HalfCheetah-v2')
    ```

    v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.

    ```
    env = gym.make('HalfCheetah-v3', ctrl_cost_weight=0.1, ....)
    ```

    ### Version History
    * v4: all mujoco environments now use the mujoco bindings in mujoco>=2.1.3
    * v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
    * v2: All continuous control environments now use mujoco_py >= 1.50
    * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
    * v0: Initial versions release (1.0.0)
    """

    def __init__(
        self,
        xml_file="half_cheetah.xml",
        forward_reward_weight=1.0,
        ctrl_cost_weight=0.1,
        reset_noise_scale=0.1,
        exclude_current_positions_from_observation=True,
    ):
        # Must run first, while locals() still holds exactly the constructor
        # arguments: EzPickle records them so the env can be pickled/copied.
        utils.EzPickle.__init__(**locals())

        self._forward_reward_weight = forward_reward_weight
        self._ctrl_cost_weight = ctrl_cost_weight

        self._reset_noise_scale = reset_noise_scale

        self._exclude_current_positions_from_observation = (
            exclude_current_positions_from_observation
        )

        mujoco_env.MujocoEnv.__init__(self, xml_file, 5)

    def control_cost(self, action):
        """Quadratic penalty on action magnitude, scaled by ctrl_cost_weight."""
        control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
        return control_cost

    def step(self, action):
        """Advance the simulation by frame_skip frames under `action`.

        Returns (observation, reward, done, info); done is always False here —
        episode length is capped by the registered TimeLimit wrapper instead.
        """
        x_position_before = self.data.qpos[0]
        self.do_simulation(action, self.frame_skip)
        x_position_after = self.data.qpos[0]
        # Forward velocity estimated by finite differences over one env step.
        x_velocity = (x_position_after - x_position_before) / self.dt

        ctrl_cost = self.control_cost(action)

        forward_reward = self._forward_reward_weight * x_velocity

        observation = self._get_obs()
        reward = forward_reward - ctrl_cost
        done = False
        info = {
            "x_position": x_position_after,
            "x_velocity": x_velocity,
            "reward_run": forward_reward,
            "reward_ctrl": -ctrl_cost,
        }

        return observation, reward, done, info

    def _get_obs(self):
        """Concatenate joint positions and velocities; the absolute
        x-coordinate is dropped unless configured otherwise."""
        position = self.data.qpos.flat.copy()
        velocity = self.data.qvel.flat.copy()

        if self._exclude_current_positions_from_observation:
            position = position[1:]

        observation = np.concatenate((position, velocity)).ravel()
        return observation

    def reset_model(self):
        """Reset joints to the initial pose plus noise (uniform on positions,
        Gaussian on velocities) and return the first observation."""
        noise_low = -self._reset_noise_scale
        noise_high = self._reset_noise_scale

        qpos = self.init_qpos + self.np_random.uniform(
            low=noise_low, high=noise_high, size=self.model.nq
        )
        qvel = (
            self.init_qvel
            + self._reset_noise_scale * self.np_random.standard_normal(self.model.nv)
        )

        self.set_state(qpos, qvel)

        observation = self._get_obs()
        return observation

    def viewer_setup(self):
        """Apply DEFAULT_CAMERA_CONFIG to the viewer camera."""
        for key, value in DEFAULT_CAMERA_CONFIG.items():
            if isinstance(value, np.ndarray):
                # Array-valued settings must be written in place.
                getattr(self.viewer.cam, key)[:] = value
            else:
                setattr(self.viewer.cam, key, value)

View File

@@ -6,7 +6,9 @@ from gym.envs.mujoco import mujoco_env
class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle):
def __init__(self):
mujoco_env.MujocoEnv.__init__(self, "hopper.xml", 4)
mujoco_env.MujocoEnv.__init__(
self, "hopper.xml", 4, mujoco_bindings="mujoco_py"
)
utils.EzPickle.__init__(self)
def step(self, a):

View File

@@ -14,130 +14,6 @@ DEFAULT_CAMERA_CONFIG = {
class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle):
"""
### Description
This environment is based on the work done by Erez, Tassa, and Todorov in
["Infinite Horizon Model Predictive Control for Nonlinear Periodic Tasks"](http://www.roboticsproceedings.org/rss07/p10.pdf). The environment aims to
increase the number of independent state and control variables as compared to
the classic control environments. The hopper is a two-dimensional
one-legged figure that consist of four main body parts - the torso at the
top, the thigh in the middle, the leg in the bottom, and a single foot on
which the entire body rests. The goal is to make hops that move in the
forward (right) direction by applying torques on the three hinges
connecting the four body parts.
### Action Space
The action space is a `Box(-1, 1, (3,), float32)`. An action represents the torques applied between *links*
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|-----|------------------------------------|-------------|-------------|----------------------------------|-------|--------------|
| 0 | Torque applied on the thigh rotor | -1 | 1 | thigh_joint | hinge | torque (N m) |
| 1 | Torque applied on the leg rotor | -1 | 1 | leg_joint | hinge | torque (N m) |
| 3 | Torque applied on the foot rotor | -1 | 1 | foot_joint | hinge | torque (N m) |
### Observation Space
Observations consist of positional values of different body parts of the
hopper, followed by the velocities of those individual parts
(their derivatives) with all the positions ordered before all the velocities.
By default, observations do not include the x-coordinate of the hopper. It may
be included by passing `exclude_current_positions_from_observation=False` during construction.
In that case, the observation space will have 12 dimensions where the first dimension
represents the x-coordinate of the hopper.
Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x-coordinate
will be returned in `info` with key `"x_position"`.
However, by default, the observation is a `ndarray` with shape `(11,)` where the elements
correspond to the following:
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
| 0 | z-coordinate of the top (height of hopper) | -Inf | Inf | rootz | slide | position (m) |
| 1 | angle of the top | -Inf | Inf | rooty | hinge | angle (rad) |
| 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) |
| 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) |
| 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) |
| 5 | velocity of the x-coordinate of the top | -Inf | Inf | rootx | slide | velocity (m/s) |
| 6 | velocity of the z-coordinate (height) of the top | -Inf | Inf | rootz | slide | velocity (m/s) |
| 7 | angular velocity of the angle of the top | -Inf | Inf | rooty | hinge | angular velocity (rad/s) |
| 8 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) |
| 9 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) |
| 10 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) |
### Rewards
The reward consists of three parts:
- *healthy_reward*: Every timestep that the hopper is healthy (see definition in section "Episode Termination"), it gets a reward of fixed value `healthy_reward`.
- *forward_reward*: A reward of hopping forward which is measured
as *`forward_reward_weight` * (x-coordinate before action - x-coordinate after action)/dt*. *dt* is
the time between actions and is dependent on the frame_skip parameter
(fixed to 4), where the frametime is 0.002 - making the
default *dt = 4 * 0.002 = 0.008*. This reward would be positive if the hopper
hops forward (positive x direction).
- *ctrl_cost*: A cost for penalising the hopper if it takes
actions that are too large. It is measured as *`ctrl_cost_weight` *
sum(action<sup>2</sup>)* where *`ctrl_cost_weight`* is a parameter set for the
control and has a default value of 0.001
The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost* and `info` will also contain the individual reward terms
### Starting State
All observations start in state
(0.0, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) with a uniform noise
in the range of [-`reset_noise_scale`, `reset_noise_scale`] added to the values for stochasticity.
### Episode Termination
The hopper is said to be unhealthy if any of the following happens:
1. An element of `observation[1:]` (if `exclude_current_positions_from_observation=True`, else `observation[2:]`) is no longer contained in the closed interval specified by the argument `healthy_state_range`
2. The height of the hopper (`observation[0]` if `exclude_current_positions_from_observation=True`, else `observation[1]`) is no longer contained in the closed interval specified by the argument `healthy_z_range` (usually meaning that it has fallen)
3. The angle (`observation[1]` if `exclude_current_positions_from_observation=True`, else `observation[2]`) is no longer contained in the closed interval specified by the argument `healthy_angle_range`
If `terminate_when_unhealthy=True` is passed during construction (which is the default),
the episode terminates when any of the following happens:
1. The episode duration reaches a 1000 timesteps
2. The hopper is unhealthy
If `terminate_when_unhealthy=False` is passed, the episode is terminated only when 1000 timesteps are exceeded.
### Arguments
No additional arguments are currently supported in v2 and lower.
```
env = gym.make('Hopper-v2')
```
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
```
env = gym.make('Hopper-v3', ctrl_cost_weight=0.1, ....)
```
| Parameter | Type | Default |Description |
|-------------------------|------------|----------------|-------------------------------|
| `xml_file` | **str** | `"hopper.xml"` | Path to a MuJoCo model |
| `forward_reward_weight` | **float** | `1.0` | Weight for *forward_reward* term (see section on reward) |
| `ctrl_cost_weight` | **float** | `0.001` | Weight for *ctrl_cost* reward (see section on reward) |
| `healthy_reward` | **float** | `1` | Constant reward given if the ant is "healthy" after timestep |
| `terminate_when_unhealthy` | **bool**| `True` | If true, issue a done signal if the hopper is no longer healthy |
| `healthy_state_range` | **tuple** | `(-100, 100)` | The elements of `observation[1:]` (if `exclude_current_positions_from_observation=True`, else `observation[2:]`) must be in this range for the hopper to be considered healthy |
| `healthy_z_range` | **tuple** | `(0.7, float("inf"))` | The z-coordinate must be in this range for the hopper to be considered healthy |
| `healthy_angle_range` | **tuple** | `(-0.2, 0.2)` | The angle given by `observation[1]` (if `exclude_current_positions_from_observation=True`, else `observation[2]`) must be in this range for the hopper to be considered healthy |
| `reset_noise_scale` | **float** | `5e-3` | Scale of random perturbations of initial position and velocity (see section on Starting State) |
| `exclude_current_positions_from_observation`| **bool** | `True`| Whether or not to omit the x-coordinate from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies |
### Version History
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
* v2: All continuous control environments now use mujoco_py >= 1.50
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
* v0: Initial versions release (1.0.0)
"""
def __init__(
self,
xml_file="hopper.xml",
@@ -170,7 +46,7 @@ class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle):
exclude_current_positions_from_observation
)
mujoco_env.MujocoEnv.__init__(self, xml_file, 4)
mujoco_env.MujocoEnv.__init__(self, xml_file, 4, mujoco_bindings="mujoco_py")
@property
def healthy_reward(self):

View File

@@ -0,0 +1,263 @@
import numpy as np

from gym import utils
from gym.envs.mujoco import mujoco_env

# Default camera configuration for the MuJoCo viewer: track body 2 from 3 m
# away, looking at a point 1.15 m above the ground, elevation -20 degrees.
DEFAULT_CAMERA_CONFIG = {
    "trackbodyid": 2,
    "distance": 3.0,
    "lookat": np.array((0.0, 0.0, 1.15)),
    "elevation": -20.0,
}
class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle):
"""
### Description
This environment is based on the work done by Erez, Tassa, and Todorov in
["Infinite Horizon Model Predictive Control for Nonlinear Periodic Tasks"](http://www.roboticsproceedings.org/rss07/p10.pdf). The environment aims to
increase the number of independent state and control variables as compared to
the classic control environments. The hopper is a two-dimensional
one-legged figure that consist of four main body parts - the torso at the
top, the thigh in the middle, the leg in the bottom, and a single foot on
which the entire body rests. The goal is to make hops that move in the
forward (right) direction by applying torques on the three hinges
connecting the four body parts.
### Action Space
The agent take a 3-element vector for actions.
The action space is a continuous `(action, action, action)` all in `[-1, 1]`
, where `action` represents the numerical torques applied between *links*
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|-----|------------------------------------|-------------|-------------|----------------------------------|-------|--------------|
| 0 | Torque applied on the thigh rotor | -1 | 1 | thigh_joint | hinge | torque (N m) |
| 1 | Torque applied on the leg rotor | -1 | 1 | leg_joint | hinge | torque (N m) |
| 3 | Torque applied on the foot rotor | -1 | 1 | foot_joint | hinge | torque (N m) |
### Observation Space
The state space consists of positional values of different body parts of the
hopper, followed by the velocities of those individual parts
(their derivatives) with all the positions ordered before all the velocities.
The observation is a `ndarray` with shape `(11,)` where the elements
correspond to the following:
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
| 0 | x-coordinate of the top | -Inf | Inf | rootx | slide | position (m) |
| 1 | z-coordinate of the top (height of hopper) | -Inf | Inf | rootz | slide | position (m) |
| 2 | angle of the top | -Inf | Inf | rooty | hinge | angle (rad) |
| 3 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) |
| 4 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) |
| 5 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) |
| 6 | velocity of the x-coordinate of the top | -Inf | Inf | rootx | slide | velocity (m/s) |
| 7 | velocity of the z-coordinate (height) of the top | -Inf | Inf | rootz | slide | velocity (m/s) |
| 8 | angular velocity of the angle of the top | -Inf | Inf | rooty | hinge | angular velocity (rad/s) |
| 9 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) |
| 10 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) |
| 11 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) |
**Note:**
In practice (and Gym implementation), the first positional element is
omitted from the state space since the reward function is calculated based
on that value. This value is hidden from the algorithm, which in turn has
to develop an abstract understanding of it from the observed rewards.
Therefore, observation space has shape `(11,)` instead of `(12,)` and looks like:
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
| 0 | z-coordinate of the top (height of hopper) | -Inf | Inf | rootz | slide | position (m) |
| 1 | angle of the top | -Inf | Inf | rooty | hinge | angle (rad) |
| 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) |
| 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) |
| 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) |
| 5 | velocity of the x-coordinate of the top | -Inf | Inf | rootx | slide | velocity (m/s) |
| 6 | velocity of the z-coordinate (height) of the top | -Inf | Inf | rootz | slide | velocity (m/s) |
| 7 | angular velocity of the angle of the top | -Inf | Inf | rooty | hinge | angular velocity (rad/s) |
| 8 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) |
| 9 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) |
| 10 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) |
### Rewards
The reward consists of three parts:
- *alive bonus*: Every timestep that the hopper is alive, it gets a reward of 1,
- *reward_forward*: A reward of hopping forward which is measured
as *(x-coordinate before action - x-coordinate after action)/dt*. *dt* is
the time between actions and is dependent on the frame_skip parameter
(default is 4), where the *dt* for one frame is 0.002 - making the
default *dt = 4*0.002 = 0.008*. This reward would be positive if the hopper
hops forward (right) desired.
- *reward_control*: A negative reward for penalising the hopper if it takes
actions that are too large. It is measured as *-coefficient **x**
sum(action<sup>2</sup>)* where *coefficient* is a parameter set for the
control and has a default value of 0.001
The total reward returned is ***reward*** *=* *alive bonus + reward_forward + reward_control*
### Starting State
All observations start in state
(0.0, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) with a uniform noise
in the range of [-0.005, 0.005] added to the values for stochasticity.
### Episode Termination
The episode terminates when any of the following happens:
1. The episode duration reaches a 1000 timesteps
2. Any of the state space values is no longer finite
3. The absolute value of any of the state variable indexed (angle and beyond) is greater than 100
4. The height of the hopper (index 1) becomes smaller than 0.7 metres (hopper has fallen).
5. The absolute value of the angle (index 2) becomes greater than 0.2 radians (hopper has tipped over).
### Arguments
No additional arguments are currently supported (in v2 and lower), but
modifications can be made to the XML file in the assets folder
(or by changing the path to a modified XML file in another folder).
```
env = gym.make('Hopper-v2')
```
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
```
env = gym.make('Hopper-v3', ctrl_cost_weight=0.1, ....)
```
### Version History
* v4: all mujoco environments now use the mujoco bindings in mujoco>=2.1.3
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
* v2: All continuous control environments now use mujoco_py >= 1.50
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
* v0: Initial versions release (1.0.0)
"""
def __init__(
    self,
    xml_file="hopper.xml",
    forward_reward_weight=1.0,
    ctrl_cost_weight=1e-3,
    healthy_reward=1.0,
    terminate_when_unhealthy=True,
    healthy_state_range=(-100.0, 100.0),
    healthy_z_range=(0.7, float("inf")),
    healthy_angle_range=(-0.2, 0.2),
    reset_noise_scale=5e-3,
    exclude_current_positions_from_observation=True,
):
    """Create a Hopper environment.

    Args:
        xml_file: path to the MuJoCo model XML loaded by the base class.
        forward_reward_weight: weight of the forward-velocity reward term.
        ctrl_cost_weight: weight of the quadratic control-cost penalty.
        healthy_reward: constant bonus paid while the hopper is healthy
            (see ``healthy_reward`` property).
        terminate_when_unhealthy: if True, ``done`` is signalled as soon as
            the hopper leaves any of the healthy ranges below.
        healthy_state_range: allowed range for every state element after
            the first two (used by ``is_healthy``).
        healthy_z_range: allowed range for the torso height ``qpos[1]``.
        healthy_angle_range: allowed range for the torso angle ``qpos[2]``.
        reset_noise_scale: half-width of the uniform noise added to initial
            positions and velocities in ``reset_model``.
        exclude_current_positions_from_observation: if True, drop the
            x-coordinate from the observation returned by ``_get_obs``.
    """
    # NOTE: must stay the first statement — **locals() captures exactly the
    # constructor arguments (plus self) for EzPickle; any local defined
    # earlier would leak into the pickled kwargs.
    utils.EzPickle.__init__(**locals())
    self._forward_reward_weight = forward_reward_weight
    self._ctrl_cost_weight = ctrl_cost_weight
    self._healthy_reward = healthy_reward
    self._terminate_when_unhealthy = terminate_when_unhealthy
    self._healthy_state_range = healthy_state_range
    self._healthy_z_range = healthy_z_range
    self._healthy_angle_range = healthy_angle_range
    self._reset_noise_scale = reset_noise_scale
    self._exclude_current_positions_from_observation = (
        exclude_current_positions_from_observation
    )
    # frame_skip=4: each env step advances the simulation by 4 frames.
    mujoco_env.MujocoEnv.__init__(self, xml_file, 4)
@property
def healthy_reward(self):
    """Alive bonus: ``_healthy_reward`` while the hopper is healthy, and
    unconditionally when episodes terminate on unhealthy states."""
    alive = bool(self.is_healthy or self._terminate_when_unhealthy)
    return float(alive) * self._healthy_reward
def control_cost(self, action):
    """Quadratic penalty on the magnitude of *action*, scaled by the
    configured control-cost weight."""
    return self._ctrl_cost_weight * np.square(action).sum()
@property
def is_healthy(self):
    """True while torso height, torso angle, and all remaining state
    elements each lie strictly inside their configured healthy ranges."""
    z, angle = self.data.qpos[1:3]
    rest = self.state_vector()[2:]

    lo, hi = self._healthy_state_range
    state_ok = bool(np.all((lo < rest) & (rest < hi)))
    min_z, max_z = self._healthy_z_range
    z_ok = min_z < z < max_z
    min_angle, max_angle = self._healthy_angle_range
    angle_ok = min_angle < angle < max_angle

    return state_ok and z_ok and angle_ok
@property
def done(self):
    """Episode ends when the hopper is unhealthy, unless termination on
    unhealthy states is disabled."""
    if not self._terminate_when_unhealthy:
        return False
    return not self.is_healthy
def _get_obs(self):
position = self.data.qpos.flat.copy()
velocity = np.clip(self.data.qvel.flat.copy(), -10, 10)
if self._exclude_current_positions_from_observation:
position = position[1:]
observation = np.concatenate((position, velocity)).ravel()
return observation
def step(self, action):
    """Advance the simulation by one control step.

    Returns the (observation, reward, done, info) tuple, where reward is
    forward progress plus the alive bonus minus the control cost.
    """
    x_before = self.data.qpos[0]
    self.do_simulation(action, self.frame_skip)
    x_after = self.data.qpos[0]
    x_velocity = (x_after - x_before) / self.dt

    forward_reward = self._forward_reward_weight * x_velocity
    reward = forward_reward + self.healthy_reward - self.control_cost(action)

    observation = self._get_obs()
    info = {
        "x_position": x_after,
        "x_velocity": x_velocity,
    }
    return observation, reward, self.done, info
def reset_model(self):
    """Reset joints to their initial values plus uniform noise and return
    the resulting observation.

    Positions are sampled before velocities so the RNG stream matches the
    original implementation.
    """
    scale = self._reset_noise_scale
    qpos = self.init_qpos + self.np_random.uniform(
        low=-scale, high=scale, size=self.model.nq
    )
    qvel = self.init_qvel + self.np_random.uniform(
        low=-scale, high=scale, size=self.model.nv
    )
    self.set_state(qpos, qvel)
    return self._get_obs()
def viewer_setup(self):
    """Apply the module's default camera configuration to the viewer.

    Array-valued settings (e.g. ``lookat``) are copied element-wise into
    the existing camera buffer; scalars are assigned directly.
    """
    for attr, value in DEFAULT_CAMERA_CONFIG.items():
        if not isinstance(value, np.ndarray):
            setattr(self.viewer.cam, attr, value)
        else:
            getattr(self.viewer.cam, attr)[:] = value

View File

@@ -12,7 +12,9 @@ def mass_center(model, sim):
class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle):
def __init__(self):
mujoco_env.MujocoEnv.__init__(self, "humanoid.xml", 5)
mujoco_env.MujocoEnv.__init__(
self, "humanoid.xml", 5, mujoco_bindings="mujoco_py"
)
utils.EzPickle.__init__(self)
def _get_obs(self):

View File

@@ -18,198 +18,6 @@ def mass_center(model, sim):
class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle):
"""
### Description
This environment is based on the environment introduced by Tassa, Erez and Todorov
in ["Synthesis and stabilization of complex behaviors through online trajectory optimization"](https://ieeexplore.ieee.org/document/6386025).
The 3D bipedal robot is designed to simulate a human. It has a torso (abdomen) with a pair of
legs and arms. The legs each consist of two links, and so the arms (representing the knees and
elbows respectively). The goal of the environment is to walk forward as fast as possible without falling over.
### Action Space
The action space is a `Box(-1, 1, (17,), float32)`. An action represents the torques applied at the hinge joints.
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|-----|----------------------|---------------|----------------|---------------------------------------|-------|------|
| 0 | Torque applied on the hinge in the y-coordinate of the abdomen | -0.4 | 0.4 | hip_1 (front_left_leg) | hinge | torque (N m) |
| 1 | Torque applied on the hinge in the z-coordinate of the abdomen | -0.4 | 0.4 | angle_1 (front_left_leg) | hinge | torque (N m) |
| 2 | Torque applied on the hinge in the x-coordinate of the abdomen | -0.4 | 0.4 | hip_2 (front_right_leg) | hinge | torque (N m) |
| 3 | Torque applied on the rotor between torso/abdomen and the right hip (x-coordinate) | -0.4 | 0.4 | right_hip_x (right_thigh) | hinge | torque (N m) |
| 4 | Torque applied on the rotor between torso/abdomen and the right hip (z-coordinate) | -0.4 | 0.4 | right_hip_z (right_thigh) | hinge | torque (N m) |
| 5 | Torque applied on the rotor between torso/abdomen and the right hip (y-coordinate) | -0.4 | 0.4 | right_hip_y (right_thigh) | hinge | torque (N m) |
| 6 | Torque applied on the rotor between the right hip/thigh and the right shin | -0.4 | 0.4 | right_knee | hinge | torque (N m) |
| 7 | Torque applied on the rotor between torso/abdomen and the left hip (x-coordinate) | -0.4 | 0.4 | left_hip_x (left_thigh) | hinge | torque (N m) |
| 8 | Torque applied on the rotor between torso/abdomen and the left hip (z-coordinate) | -0.4 | 0.4 | left_hip_z (left_thigh) | hinge | torque (N m) |
| 9 | Torque applied on the rotor between torso/abdomen and the left hip (y-coordinate) | -0.4 | 0.4 | left_hip_y (left_thigh) | hinge | torque (N m) |
| 10 | Torque applied on the rotor between the left hip/thigh and the left shin | -0.4 | 0.4 | left_knee | hinge | torque (N m) |
| 11 | Torque applied on the rotor between the torso and right upper arm (coordinate -1) | -0.4 | 0.4 | right_shoulder1 | hinge | torque (N m) |
| 12 | Torque applied on the rotor between the torso and right upper arm (coordinate -2) | -0.4 | 0.4 | right_shoulder2 | hinge | torque (N m) |
| 13 | Torque applied on the rotor between the right upper arm and right lower arm | -0.4 | 0.4 | right_elbow | hinge | torque (N m) |
| 14 | Torque applied on the rotor between the torso and left upper arm (coordinate -1) | -0.4 | 0.4 | left_shoulder1 | hinge | torque (N m) |
| 15 | Torque applied on the rotor between the torso and left upper arm (coordinate -2) | -0.4 | 0.4 | left_shoulder2 | hinge | torque (N m) |
| 16 | Torque applied on the rotor between the left upper arm and left lower arm | -0.4 | 0.4 | left_elbow | hinge | torque (N m) |
### Observation Space
Observations consist of positional values of different body parts of the Humanoid,
followed by the velocities of those individual parts (their derivatives) with all the
positions ordered before all the velocities.
By default, observations do not include the x- and y-coordinates of the torso. These may
be included by passing `exclude_current_positions_from_observation=False` during construction.
In that case, the observation space will have 378 dimensions where the first two dimensions
represent the x- and y-coordinates of the torso.
Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates
will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively.
However, by default, the observation is a `ndarray` with shape `(376,)` where the elements correspond to the following:
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|-----|---------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
| 0 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) |
| 1 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
| 2 | y-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
| 3 | z-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
| 4 | w-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
| 5 | z-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | angle (rad) |
| 6 | y-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | angle (rad) |
| 7 | x-angle of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angle (rad) |
| 8 | x-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | angle (rad) |
| 9 | z-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | angle (rad) |
| 19 | y-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | angle (rad) |
| 11 | angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | angle (rad) |
| 12 | x-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | angle (rad) |
| 13 | z-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | angle (rad) |
| 14 | y-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | angle (rad) |
| 15 | angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | angle (rad) |
| 16 | coordinate-1 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | angle (rad) |
| 17 | coordinate-2 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | angle (rad) |
| 18 | angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | angle (rad) |
| 19 | coordinate-1 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | angle (rad) |
| 20 | coordinate-2 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | angle (rad) |
| 21 | angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angle (rad) |
| 22 | x-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
| 23 | y-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
| 24 | z-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
| 25 | x-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
| 26 | y-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
| 27 | z-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
| 28 | z-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | anglular velocity (rad/s) |
| 29 | y-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | anglular velocity (rad/s) |
| 30 | x-coordinate of angular velocity of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | aanglular velocity (rad/s) |
| 31 | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | anglular velocity (rad/s) |
| 32 | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | anglular velocity (rad/s) |
| 33 | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | anglular velocity (rad/s) |
| 34 | angular velocity of the angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | anglular velocity (rad/s) |
| 35 | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | anglular velocity (rad/s) |
| 36 | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | anglular velocity (rad/s) |
| 37 | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | anglular velocity (rad/s) |
| 38 | angular velocity of the angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | anglular velocity (rad/s) |
| 39 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | anglular velocity (rad/s) |
| 40 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | anglular velocity (rad/s) |
| 41 | angular velocity of the angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | anglular velocity (rad/s) |
| 42 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | anglular velocity (rad/s) |
| 43 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | anglular velocity (rad/s) |
| 44 | angular velocitty of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | anglular velocity (rad/s) |
Additionally, after all the positional and velocity based values in the table,
the observation contains (in order):
- *cinert:* Mass and inertia of a single rigid body relative to the center of mass
(this is an intermediate result of transition). It has shape 14*10 (*nbody * 10*)
and hence adds to another 140 elements in the state space.
- *cvel:* Center of mass based velocity. It has shape 14 * 6 (*nbody * 6*) and hence
adds another 84 elements in the state space
- *qfrc_actuator:* Constraint force generated as the actuator force. This has shape
`(23,)` *(nv * 1)* and hence adds another 23 elements to the state space.
- *cfrc_ext:* This is the center of mass based external force on the body. It has shape
14 * 6 (*nbody * 6*) and hence adds to another 84 elements in the state space.
where *nbody* stands for the number of bodies in the robot and *nv* stands for the
number of degrees of freedom (*= dim(qvel)*)
The (x,y,z) coordinates are translational DOFs while the orientations are rotational
DOFs expressed as quaternions. One can read more about free joints on the
[Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html).
**Note:** There have been reported issues that using a Mujoco-Py version > 2.0
results in the contact forces always being 0. As such we recommend to use a Mujoco-Py
version < 2.0 when using the Humanoid environment if you would like to report results
with contact forces (if contact forces are not used in your experiments, you can use
version > 2.0).
### Rewards
The reward consists of three parts:
- *healthy_reward*: Every timestep that the humanoid is alive (see section Episode Termination for definition), it gets a reward of fixed value `healthy_reward`
- *forward_reward*: A reward of walking forward which is measured as *`forward_reward_weight` *
(average center of mass before action - average center of mass after action)/dt*.
*dt* is the time between actions and is dependent on the frame_skip parameter
(default is 5), where the frametime is 0.003 - making the default *dt = 5 * 0.003 = 0.015*.
This reward would be positive if the humanoid walks forward (in positive x-direction). The calculation
for the center of mass is defined in the `.py` file for the Humanoid.
- *ctrl_cost*: A negative reward for penalising the humanoid if it has too
large of a control force. If there are *nu* actuators/controls, then the control has
shape `nu x 1`. It is measured as *`ctrl_cost_weight` * sum(control<sup>2</sup>)*.
- *contact_cost*: A negative reward for penalising the humanoid if the external
contact force is too large. It is calculated by clipping
*`contact_cost_weight` * sum(external contact force<sup>2</sup>)* to the interval specified by `contact_cost_range`.
The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost - contact_cost* and `info` will also contain the individual reward terms
### Starting State
All observations start in state
(0.0, 0.0, 1.4, 1.0, 0.0 ... 0.0) with a uniform noise in the range
of [-`reset_noise_scale`, `reset_noise_scale`] added to the positional and velocity values (values in the table)
for stochasticity. Note that the initial z coordinate is intentionally
selected to be high, thereby indicating a standing up humanoid. The initial
orientation is designed to make it face forward as well.
### Episode Termination
The humanoid is said to be unhealthy if the z-position of the torso is no longer contained in the
closed interval specified by the argument `healthy_z_range`.
If `terminate_when_unhealthy=True` is passed during construction (which is the default),
the episode terminates when any of the following happens:
1. The episode duration reaches a 1000 timesteps
3. The humanoid is unhealthy
If `terminate_when_unhealthy=False` is passed, the episode is terminated only when 1000 timesteps are exceeded.
### Arguments
No additional arguments are currently supported in v2 and lower.
```
env = gym.make('Humanoid-v2')
```
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
```
env = gym.make('Humanoid-v3', ctrl_cost_weight=0.1, ....)
```
| Parameter | Type | Default |Description |
|-------------------------|------------|-------------------|-------------------------------|
| `xml_file` | **str** | `"humanoid.xml"` | Path to a MuJoCo model |
| `forward_reward_weight` | **float** | `1.25` | Weight for *forward_reward* term (see section on reward) |
| `ctrl_cost_weight` | **float** | `0.1` | Weight for *ctrl_cost* term (see section on reward) |
| `contact_cost_weight` | **float** | `5e-7` | Weight for *contact_cost* term (see section on reward) |
| `healthy_reward` | **float** | `5.0` | Constant reward given if the humanoid is "healthy" after timestep |
| `terminate_when_unhealthy` | **bool**| `True` | If true, issue a done signal if the z-coordinate of the torso is no longer in the `healthy_z_range` |
| `healthy_z_range` | **tuple** | `(1.0, 2.0)` | The humanoid is considered healthy if the z-coordinate of the torso is in this range |
| `reset_noise_scale` | **float** | `1e-2` | Scale of random perturbations of initial position and velocity (see section on Starting State) |
| `exclude_current_positions_from_observation`| **bool** | `True`| Whether or not to omit the x- and y-coordinates from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies |
### Version History
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
* v2: All continuous control environments now use mujoco_py >= 1.50
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
* v0: Initial versions release (1.0.0)
"""
def __init__(
self,
xml_file="humanoid.xml",
@@ -239,7 +47,7 @@ class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle):
exclude_current_positions_from_observation
)
mujoco_env.MujocoEnv.__init__(self, xml_file, 5)
mujoco_env.MujocoEnv.__init__(self, xml_file, 5, mujoco_bindings="mujoco_py")
@property
def healthy_reward(self):

View File

@@ -0,0 +1,331 @@
import numpy as np
from gym import utils
from gym.envs.mujoco import mujoco_env
# Default camera settings for rendering this environment.
# NOTE(review): keys are assumed to map onto MuJoCo camera attributes
# (trackbodyid, distance, lookat, elevation) applied during viewer setup —
# confirm against the viewer code elsewhere in this module.
DEFAULT_CAMERA_CONFIG = {
    "trackbodyid": 1,
    "distance": 4.0,
    "lookat": np.array((0.0, 0.0, 2.0)),
    "elevation": -20.0,
}
def mass_center(model, data):
    """Return the (x, y) position of the robot's overall center of mass,
    computed as the body-mass-weighted mean of the body CoM positions."""
    masses = model.body_mass.reshape(-1, 1)
    weighted_sum = np.sum(masses * data.xipos, axis=0)
    com = weighted_sum / np.sum(masses)
    return com[0:2].copy()
class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle):
"""
### Description
This environment is based on the environment introduced by Tassa, Erez and Todorov
in ["Synthesis and stabilization of complex behaviors through online trajectory optimization"](https://ieeexplore.ieee.org/document/6386025).
The 3D bipedal robot is designed to simulate a human. It has a torso (abdomen) with a pair of
legs and arms. The legs each consist of two links, and so the arms (representing the knees and
elbows respectively). The goal of the environment is to walk forward as fast as possible without falling over.
### Action Space
The agent take a 17-element vector for actions.
The action space is a continuous `(action, ...)` all in `[-1, 1]`, where `action`
represents the numerical torques applied at the hinge joints.
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|-----|----------------------|---------------|----------------|---------------------------------------|-------|------|
| 0 | Torque applied on the hinge in the y-coordinate of the abdomen | -0.4 | 0.4 | hip_1 (front_left_leg) | hinge | torque (N m) |
| 1 | Torque applied on the hinge in the z-coordinate of the abdomen | -0.4 | 0.4 | angle_1 (front_left_leg) | hinge | torque (N m) |
| 2 | Torque applied on the hinge in the x-coordinate of the abdomen | -0.4 | 0.4 | hip_2 (front_right_leg) | hinge | torque (N m) |
| 3 | Torque applied on the rotor between torso/abdomen and the right hip (x-coordinate) | -0.4 | 0.4 | right_hip_x (right_thigh) | hinge | torque (N m) |
| 4 | Torque applied on the rotor between torso/abdomen and the right hip (z-coordinate) | -0.4 | 0.4 | right_hip_z (right_thigh) | hinge | torque (N m) |
| 5 | Torque applied on the rotor between torso/abdomen and the right hip (y-coordinate) | -0.4 | 0.4 | right_hip_y (right_thigh) | hinge | torque (N m) |
| 6 | Torque applied on the rotor between the right hip/thigh and the right shin | -0.4 | 0.4 | right_knee | hinge | torque (N m) |
| 7 | Torque applied on the rotor between torso/abdomen and the left hip (x-coordinate) | -0.4 | 0.4 | left_hip_x (left_thigh) | hinge | torque (N m) |
| 8 | Torque applied on the rotor between torso/abdomen and the left hip (z-coordinate) | -0.4 | 0.4 | left_hip_z (left_thigh) | hinge | torque (N m) |
| 9 | Torque applied on the rotor between torso/abdomen and the left hip (y-coordinate) | -0.4 | 0.4 | left_hip_y (left_thigh) | hinge | torque (N m) |
| 10 | Torque applied on the rotor between the left hip/thigh and the left shin | -0.4 | 0.4 | left_knee | hinge | torque (N m) |
| 11 | Torque applied on the rotor between the torso and right upper arm (coordinate -1) | -0.4 | 0.4 | right_shoulder1 | hinge | torque (N m) |
| 12 | Torque applied on the rotor between the torso and right upper arm (coordinate -2) | -0.4 | 0.4 | right_shoulder2 | hinge | torque (N m) |
| 13 | Torque applied on the rotor between the right upper arm and right lower arm | -0.4 | 0.4 | right_elbow | hinge | torque (N m) |
| 14 | Torque applied on the rotor between the torso and left upper arm (coordinate -1) | -0.4 | 0.4 | left_shoulder1 | hinge | torque (N m) |
| 15 | Torque applied on the rotor between the torso and left upper arm (coordinate -2) | -0.4 | 0.4 | left_shoulder2 | hinge | torque (N m) |
| 16 | Torque applied on the rotor between the left upper arm and left lower arm | -0.4 | 0.4 | left_elbow | hinge | torque (N m) |
### Observation Space
The state space consists of positional values of different body parts of the Humanoid,
followed by the velocities of those individual parts (their derivatives) with all the
positions ordered before all the velocities.
The observation is a `ndarray` with shape `(376,)` where the elements correspond to the following:
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|-----|---------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
| 0 | x-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) |
| 1 | y-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) |
| 2 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) |
| 3 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
| 4 | y-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
| 5 | z-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
| 6 | w-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
| 7 | z-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | angle (rad) |
| 8 | y-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | angle (rad) |
| 9 | x-angle of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angle (rad) |
| 10 | x-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | angle (rad) |
| 11 | z-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | angle (rad) |
| 12 | y-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | angle (rad) |
| 13 | angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | angle (rad) |
| 14 | x-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | angle (rad) |
| 15 | z-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | angle (rad) |
| 16 | y-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | angle (rad) |
| 17 | angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | angle (rad) |
| 18 | coordinate-1 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | angle (rad) |
| 19 | coordinate-2 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | angle (rad) |
| 20 | angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | angle (rad) |
| 21 | coordinate-1 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | angle (rad) |
| 22 | coordinate-2 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | angle (rad) |
| 23 | angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angle (rad) |
| 24 | x-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
| 25 | y-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
| 26 | z-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
| 27 | x-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
| 28 | y-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
| 29 | z-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
| 30 | z-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | anglular velocity (rad/s) |
| 31 | y-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | anglular velocity (rad/s) |
| 32 | x-coordinate of angular velocity of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angular velocity (rad/s) |
| 33 | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | anglular velocity (rad/s) |
| 34 | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | anglular velocity (rad/s) |
| 35 | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | anglular velocity (rad/s) |
| 36 | angular velocity of the angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | anglular velocity (rad/s) |
| 37 | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | anglular velocity (rad/s) |
| 38 | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | anglular velocity (rad/s) |
| 39 | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | anglular velocity (rad/s) |
| 40 | angular velocity of the angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | anglular velocity (rad/s) |
| 41 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | anglular velocity (rad/s) |
| 42 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | anglular velocity (rad/s) |
| 43 | angular velocity of the angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | anglular velocity (rad/s) |
| 44 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | anglular velocity (rad/s) |
| 45 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | anglular velocity (rad/s) |
| 46 | angular velocity of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angular velocity (rad/s) |
Additionally, after all the positional and velocity based values in the table,
the state_space consists of (in order):
- *cinert:* Mass and inertia of a single rigid body relative to the center of mass
(this is an intermediate result of transition). It has shape 14*10 (*nbody * 10*)
and hence adds to another 140 elements in the state space.
- *cvel:* Center of mass based velocity. It has shape 14 * 6 (*nbody * 6*) and hence
adds another 84 elements in the state space
- *qfrc_actuator:* Constraint force generated as the actuator force. This has shape
`(23,)` *(nv * 1)* and hence adds another 23 elements to the state space.
- *cfrc_ext:* This is the center of mass based external force on the body. It has shape
14 * 6 (*nbody * 6*) and hence adds to another 84 elements in the state space.
where *nbody* stands for the number of bodies in the robot and *nv* stands for the
number of degrees of freedom (*= dim(qvel)*)
The (x,y,z) coordinates are translational DOFs while the orientations are rotational
DOFs expressed as quaternions. One can read more about free joints on the
[Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html).
**Note:** There are 47 elements in the table above - giving rise to `(378,)`
elements in the state space. In practice (and Gym implementation), the first two
positional elements are omitted from the state space since the reward function is
calculated based on the x-coordinate value. This value is hidden from the algorithm,
which in turn has to develop an abstract understanding of it from the observed rewards.
Therefore, observation space has shape `(376,)` instead of `(378,)` and the table should
not have the first two rows.
**Note:** Humanoid-v4 environment no longer has the following contact forces issue.
If using previous Humanoid versions from v4, there have been reported issues that using
a Mujoco-Py version > 2.0 results in the contact forces always being 0. As such we recommend
to use a Mujoco-Py version < 2.0 when using the Humanoid environment if you would like to report
results with contact forces (if contact forces are not used in your experiments, you can use
version > 2.0).
### Rewards
The reward consists of three parts:
- *alive_bonus*: Every timestep that the humanoid is alive, it gets a reward of 5.
- *lin_vel_cost*: A reward of walking forward which is measured as *1.25 *
(average center of mass before action - average center of mass after action)/dt*.
*dt* is the time between actions and is dependent on the frame_skip parameter
(default is 5), where the *dt* for one frame is 0.003 - making the default *dt = 5 * 0.003 = 0.015*.
This reward would be positive if the humanoid walks forward (right) desired. The calculation
for the center of mass is defined in the `.py` file for the Humanoid.
- *quad_ctrl_cost*: A negative reward for penalising the humanoid if it has too
large of a control force. If there are *nu* actuators/controls, then the control has
shape `nu x 1`. It is measured as *0.1 **x** sum(control<sup>2</sup>)*.
- *quad_impact_cost*: A negative reward for penalising the humanoid if the external
contact force is too large. It is calculated as
*min(0.5 * 0.000001 * sum(external contact force<sup>2</sup>), 10)*.
The total reward returned is ***reward*** *=* *alive_bonus + lin_vel_cost - quad_ctrl_cost - quad_impact_cost*
### Starting State
All observations start in state
(0.0, 0.0, 1.4, 1.0, 0.0 ... 0.0) with a uniform noise in the range
of [-0.01, 0.01] added to the positional and velocity values (values in the table)
for stochasticity. Note that the initial z coordinate is intentionally
selected to be high, thereby indicating a standing up humanoid. The initial
orientation is designed to make it face forward as well.
### Episode Termination
The episode terminates when any of the following happens:
1. The episode duration reaches a 1000 timesteps
2. Any of the state space values is no longer finite
3. The z-coordinate of the torso (index 0 in state space OR index 2 in the table) is **not** in the range `[1.0, 2.0]` (the humanoid has fallen or is about to fall beyond recovery).
### Arguments
No additional arguments are currently supported (in v2 and lower), but
modifications can be made to the XML file in the assets folder (or by changing
the path to a modified XML file in another folder).
```
env = gym.make('Humanoid-v2')
```
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
```
env = gym.make('Humanoid-v3', ctrl_cost_weight=0.1, ....)
```
### Version History
* v4: all mujoco environments now use the mujoco bindings in mujoco>=2.1.3
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
* v2: All continuous control environments now use mujoco_py >= 1.50
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
* v0: Initial versions release (1.0.0)
"""
    def __init__(
        self,
        xml_file="humanoid.xml",
        forward_reward_weight=1.25,
        ctrl_cost_weight=0.1,
        healthy_reward=5.0,
        terminate_when_unhealthy=True,
        healthy_z_range=(1.0, 2.0),
        reset_noise_scale=1e-2,
        exclude_current_positions_from_observation=True,
    ):
        """Configure reward weights, termination bounds and noise, then load the model.

        Args:
            xml_file: MJCF model to load (resolved by MujocoEnv).
            forward_reward_weight: scale on the x-velocity reward term.
            ctrl_cost_weight: scale on the quadratic control penalty.
            healthy_reward: constant alive bonus paid while healthy.
            terminate_when_unhealthy: end the episode when the torso leaves
                `healthy_z_range`.
            healthy_z_range: (min_z, max_z) bounds on the torso height.
            reset_noise_scale: half-width of the uniform noise added to the
                initial qpos/qvel on reset.
            exclude_current_positions_from_observation: drop the global x/y of
                the torso from observations for position-agnostic policies.
        """
        # NOTE: EzPickle snapshots this frame's locals() (self + all ctor
        # arguments) for pickling, so this call must stay first, before any
        # extra local names are introduced in this frame.
        utils.EzPickle.__init__(**locals())

        self._forward_reward_weight = forward_reward_weight
        self._ctrl_cost_weight = ctrl_cost_weight
        self._healthy_reward = healthy_reward
        self._terminate_when_unhealthy = terminate_when_unhealthy
        self._healthy_z_range = healthy_z_range

        self._reset_noise_scale = reset_noise_scale

        self._exclude_current_positions_from_observation = (
            exclude_current_positions_from_observation
        )

        # 5 = frame skip: each env step advances 5 physics frames.
        mujoco_env.MujocoEnv.__init__(self, xml_file, 5)
@property
def healthy_reward(self):
return (
float(self.is_healthy or self._terminate_when_unhealthy)
* self._healthy_reward
)
def control_cost(self, action):
control_cost = self._ctrl_cost_weight * np.sum(np.square(self.data.ctrl))
return control_cost
@property
def is_healthy(self):
min_z, max_z = self._healthy_z_range
is_healthy = min_z < self.data.qpos[2] < max_z
return is_healthy
@property
def done(self):
done = (not self.is_healthy) if self._terminate_when_unhealthy else False
return done
def _get_obs(self):
position = self.data.qpos.flat.copy()
velocity = self.data.qvel.flat.copy()
com_inertia = self.data.cinert.flat.copy()
com_velocity = self.data.cvel.flat.copy()
actuator_forces = self.data.qfrc_actuator.flat.copy()
external_contact_forces = self.data.cfrc_ext.flat.copy()
if self._exclude_current_positions_from_observation:
position = position[2:]
return np.concatenate(
(
position,
velocity,
com_inertia,
com_velocity,
actuator_forces,
external_contact_forces,
)
)
def step(self, action):
xy_position_before = mass_center(self.model, self.data)
self.do_simulation(action, self.frame_skip)
xy_position_after = mass_center(self.model, self.data)
xy_velocity = (xy_position_after - xy_position_before) / self.dt
x_velocity, y_velocity = xy_velocity
ctrl_cost = self.control_cost(action)
forward_reward = self._forward_reward_weight * x_velocity
healthy_reward = self.healthy_reward
rewards = forward_reward + healthy_reward
observation = self._get_obs()
reward = rewards - ctrl_cost
done = self.done
info = {
"reward_linvel": forward_reward,
"reward_quadctrl": -ctrl_cost,
"reward_alive": healthy_reward,
"x_position": xy_position_after[0],
"y_position": xy_position_after[1],
"distance_from_origin": np.linalg.norm(xy_position_after, ord=2),
"x_velocity": x_velocity,
"y_velocity": y_velocity,
"forward_reward": forward_reward,
}
return observation, reward, done, info
def reset_model(self):
noise_low = -self._reset_noise_scale
noise_high = self._reset_noise_scale
qpos = self.init_qpos + self.np_random.uniform(
low=noise_low, high=noise_high, size=self.model.nq
)
qvel = self.init_qvel + self.np_random.uniform(
low=noise_low, high=noise_high, size=self.model.nv
)
self.set_state(qpos, qvel)
observation = self._get_obs()
return observation
def viewer_setup(self):
for key, value in DEFAULT_CAMERA_CONFIG.items():
if isinstance(value, np.ndarray):
getattr(self.viewer.cam, key)[:] = value
else:
setattr(self.viewer.cam, key, value)

View File

@@ -5,178 +5,10 @@ from gym.envs.mujoco import mujoco_env
class HumanoidStandupEnv(mujoco_env.MujocoEnv, utils.EzPickle):
"""
### Description
This environment is based on the environment introduced by Tassa, Erez and Todorov
in ["Synthesis and stabilization of complex behaviors through online trajectory optimization"](https://ieeexplore.ieee.org/document/6386025).
The 3D bipedal robot is designed to simulate a human. It has a torso (abdomen) with a
pair of legs and arms. The legs each consist of two links, and so the arms (representing the
knees and elbows respectively). The environment starts with the humanoid lying on the ground,
and then the goal of the environment is to make the humanoid standup and then keep it standing
by applying torques on the various hinges.
### Action Space
The agent take a 17-element vector for actions.
The action space is a continuous `(action, ...)` all in `[-1, 1]`, where `action`
represents the numerical torques applied at the hinge joints.
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|-----|----------------------|---------------|----------------|---------------------------------------|-------|------|
| 0 | Torque applied on the hinge in the y-coordinate of the abdomen | -0.4 | 0.4 | hip_1 (front_left_leg) | hinge | torque (N m) |
| 1 | Torque applied on the hinge in the z-coordinate of the abdomen | -0.4 | 0.4 | angle_1 (front_left_leg) | hinge | torque (N m) |
| 2 | Torque applied on the hinge in the x-coordinate of the abdomen | -0.4 | 0.4 | hip_2 (front_right_leg) | hinge | torque (N m) |
| 3 | Torque applied on the rotor between torso/abdomen and the right hip (x-coordinate) | -0.4 | 0.4 | right_hip_x (right_thigh) | hinge | torque (N m) |
| 4 | Torque applied on the rotor between torso/abdomen and the right hip (z-coordinate) | -0.4 | 0.4 | right_hip_z (right_thigh) | hinge | torque (N m) |
| 5 | Torque applied on the rotor between torso/abdomen and the right hip (y-coordinate) | -0.4 | 0.4 | right_hip_y (right_thigh) | hinge | torque (N m) |
| 6 | Torque applied on the rotor between the right hip/thigh and the right shin | -0.4 | 0.4 | right_knee | hinge | torque (N m) |
| 7 | Torque applied on the rotor between torso/abdomen and the left hip (x-coordinate) | -0.4 | 0.4 | left_hip_x (left_thigh) | hinge | torque (N m) |
| 8 | Torque applied on the rotor between torso/abdomen and the left hip (z-coordinate) | -0.4 | 0.4 | left_hip_z (left_thigh) | hinge | torque (N m) |
| 9 | Torque applied on the rotor between torso/abdomen and the left hip (y-coordinate) | -0.4 | 0.4 | left_hip_y (left_thigh) | hinge | torque (N m) |
| 10 | Torque applied on the rotor between the left hip/thigh and the left shin | -0.4 | 0.4 | left_knee | hinge | torque (N m) |
| 11 | Torque applied on the rotor between the torso and right upper arm (coordinate -1) | -0.4 | 0.4 | right_shoulder1 | hinge | torque (N m) |
| 12 | Torque applied on the rotor between the torso and right upper arm (coordinate -2) | -0.4 | 0.4 | right_shoulder2 | hinge | torque (N m) |
| 13 | Torque applied on the rotor between the right upper arm and right lower arm | -0.4 | 0.4 | right_elbow | hinge | torque (N m) |
| 14 | Torque applied on the rotor between the torso and left upper arm (coordinate -1) | -0.4 | 0.4 | left_shoulder1 | hinge | torque (N m) |
| 15 | Torque applied on the rotor between the torso and left upper arm (coordinate -2) | -0.4 | 0.4 | left_shoulder2 | hinge | torque (N m) |
| 16 | Torque applied on the rotor between the left upper arm and left lower arm | -0.4 | 0.4 | left_elbow | hinge | torque (N m) |
### Observation Space
The state space consists of positional values of different body parts of the Humanoid,
followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities.
**Note:** The x- and y-coordinates of the torso are being omitted to produce position-agnostic behavior in policies
The observation is a `ndarray` with shape `(376,)` where the elements correspond to the following:
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|-----|---------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
| 0 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) |
| 1 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
| 2 | y-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
| 3 | z-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
| 4 | w-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
| 5 | z-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | angle (rad) |
| 6 | y-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | angle (rad) |
| 7 | x-angle of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angle (rad) |
| 8 | x-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | angle (rad) |
| 9 | z-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | angle (rad) |
| 10 | y-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | angle (rad) |
| 11 | angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | angle (rad) |
| 12 | x-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | angle (rad) |
| 13 | z-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | angle (rad) |
| 14 | y-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | angle (rad) |
| 15 | angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | angle (rad) |
| 16 | coordinate-1 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | angle (rad) |
| 17 | coordinate-2 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | angle (rad) |
| 18 | angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | angle (rad) |
| 19 | coordinate-1 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | angle (rad) |
| 20 | coordinate-2 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | angle (rad) |
| 21 | angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angle (rad) |
| 22 | x-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
| 23 | y-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
| 24 | z-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
| 25 | x-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
| 26 | y-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
| 27 | z-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
| 28 | z-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | anglular velocity (rad/s) |
| 29 | y-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | anglular velocity (rad/s) |
| 30 | x-coordinate of angular velocity of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angular velocity (rad/s) |
| 31 | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | anglular velocity (rad/s) |
| 32 | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | anglular velocity (rad/s) |
| 33 | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | anglular velocity (rad/s) |
| 35 | angular velocity of the angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | anglular velocity (rad/s) |
| 36 | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | anglular velocity (rad/s) |
| 37 | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | anglular velocity (rad/s) |
| 38 | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | anglular velocity (rad/s) |
| 39 | angular velocity of the angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | anglular velocity (rad/s) |
| 40 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | anglular velocity (rad/s) |
| 41 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | anglular velocity (rad/s) |
| 42 | angular velocity of the angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | anglular velocity (rad/s) |
| 43 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | anglular velocity (rad/s) |
| 44 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | anglular velocity (rad/s) |
| 45 | angular velocity of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angular velocity (rad/s) |
Additionally, after all the positional and velocity based values in the table,
the state_space consists of (in order):
- *cinert:* Mass and inertia of a single rigid body relative to the center of mass
(this is an intermediate result of transition). It has shape 14*10 (*nbody * 10*)
and hence adds to another 140 elements in the state space.
- *cvel:* Center of mass based velocity. It has shape 14 * 6 (*nbody * 6*) and hence
adds another 84 elements in the state space
- *qfrc_actuator:* Constraint force generated as the actuator force. This has shape
`(23,)` *(nv * 1)* and hence adds another 23 elements to the state space.
- *cfrc_ext:* This is the center of mass based external force on the body. It has shape
14 * 6 (*nbody * 6*) and hence adds to another 84 elements in the state space.
where *nbody* stands for the number of bodies in the robot and *nv* stands for the number
of degrees of freedom (*= dim(qvel)*)
The (x,y,z) coordinates are translational DOFs while the orientations are rotational
DOFs expressed as quaternions. One can read more about free joints on the
[Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html).
**Note:** There have been reported issues that using a Mujoco-Py version > 2.0 results
in the contact forces always being 0. As such we recommend to use a Mujoco-Py version < 2.0
when using the Humanoid environment if you would like to report results with contact forces
(if contact forces are not used in your experiments, you can use version > 2.0).
### Rewards
The reward consists of three parts:
- *uph_cost*: A reward for moving upward (in an attempt to stand up). This is not a relative
reward which measures how much upward it has moved from the last timestep, but it is an
absolute reward which measures how much upward the Humanoid has moved overall. It is
measured as *(z coordinate after action - 0)/(atomic timestep)*, where *z coordinate after
action* is index 0 in the state/index 2 in the table, and *atomic timestep* is the time for
one frame of movement even though the simulation has a framerate of 5 (done in order to inflate
rewards a little for faster learning).
- *quad_ctrl_cost*: A negative reward for penalising the humanoid if it has too large of
a control force. If there are *nu* actuators/controls, then the control has shape `nu x 1`.
It is measured as *0.1 **x** sum(control<sup>2</sup>)*.
- *quad_impact_cost*: A negative reward for penalising the humanoid if the external
contact force is too large. It is calculated as *min(0.5 * 0.000001 * sum(external
contact force<sup>2</sup>), 10)*.
The total reward returned is ***reward*** *=* *uph_cost + 1 - quad_ctrl_cost - quad_impact_cost*
### Starting State
All observations start in state
(0.0, 0.0, 0.105, 1.0, 0.0 ... 0.0) with a uniform noise in the range of
[-0.01, 0.01] added to the positional and velocity values (values in the table)
for stochasticity. Note that the initial z coordinate is intentionally selected
to be low, thereby indicating a laying down humanoid. The initial orientation is
designed to make it face forward as well.
### Episode Termination
The episode terminates when any of the following happens:
1. The episode duration reaches a 1000 timesteps
2. Any of the state space values is no longer finite
### Arguments
No additional arguments are currently supported.
```
env = gym.make('HumanoidStandup-v2')
```
There is no v3 for HumanoidStandup, unlike the robot environments where a v3 and
beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
### Version History
* v2: All continuous control environments now use mujoco_py >= 1.50
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
* v0: Initial versions release (1.0.0)
"""
def __init__(self):
mujoco_env.MujocoEnv.__init__(self, "humanoidstandup.xml", 5)
mujoco_env.MujocoEnv.__init__(
self, "humanoidstandup.xml", 5, mujoco_bindings="mujoco_py"
)
utils.EzPickle.__init__(self)
def _get_obs(self):

View File

@@ -0,0 +1,247 @@
import numpy as np
from gym import utils
from gym.envs.mujoco import mujoco_env
class HumanoidStandupEnv(mujoco_env.MujocoEnv, utils.EzPickle):
"""
### Description
This environment is based on the environment introduced by Tassa, Erez and Todorov
in ["Synthesis and stabilization of complex behaviors through online trajectory optimization"](https://ieeexplore.ieee.org/document/6386025).
The 3D bipedal robot is designed to simulate a human. It has a torso (abdomen) with a
pair of legs and arms. The legs each consist of two links, and so the arms (representing the
knees and elbows respectively). The environment starts with the humanoid layiing on the ground,
and then the goal of the environment is to make the humanoid standup and then keep it standing
by applying torques on the various hinges.
### Action Space
The agent take a 17-element vector for actions.
The action space is a continuous `(action, ...)` all in `[-1, 1]`, where `action`
represents the numerical torques applied at the hinge joints.
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|-----|----------------------|---------------|----------------|---------------------------------------|-------|------|
| 0 | Torque applied on the hinge in the y-coordinate of the abdomen | -0.4 | 0.4 | hip_1 (front_left_leg) | hinge | torque (N m) |
| 1 | Torque applied on the hinge in the z-coordinate of the abdomen | -0.4 | 0.4 | angle_1 (front_left_leg) | hinge | torque (N m) |
| 2 | Torque applied on the hinge in the x-coordinate of the abdomen | -0.4 | 0.4 | hip_2 (front_right_leg) | hinge | torque (N m) |
| 3 | Torque applied on the rotor between torso/abdomen and the right hip (x-coordinate) | -0.4 | 0.4 | right_hip_x (right_thigh) | hinge | torque (N m) |
| 4 | Torque applied on the rotor between torso/abdomen and the right hip (z-coordinate) | -0.4 | 0.4 | right_hip_z (right_thigh) | hinge | torque (N m) |
| 5 | Torque applied on the rotor between torso/abdomen and the right hip (y-coordinate) | -0.4 | 0.4 | right_hip_y (right_thigh) | hinge | torque (N m) |
| 6 | Torque applied on the rotor between the right hip/thigh and the right shin | -0.4 | 0.4 | right_knee | hinge | torque (N m) |
| 7 | Torque applied on the rotor between torso/abdomen and the left hip (x-coordinate) | -0.4 | 0.4 | left_hip_x (left_thigh) | hinge | torque (N m) |
| 8 | Torque applied on the rotor between torso/abdomen and the left hip (z-coordinate) | -0.4 | 0.4 | left_hip_z (left_thigh) | hinge | torque (N m) |
| 9 | Torque applied on the rotor between torso/abdomen and the left hip (y-coordinate) | -0.4 | 0.4 | left_hip_y (left_thigh) | hinge | torque (N m) |
| 10 | Torque applied on the rotor between the left hip/thigh and the left shin | -0.4 | 0.4 | left_knee | hinge | torque (N m) |
| 11 | Torque applied on the rotor between the torso and right upper arm (coordinate -1) | -0.4 | 0.4 | right_shoulder1 | hinge | torque (N m) |
| 12 | Torque applied on the rotor between the torso and right upper arm (coordinate -2) | -0.4 | 0.4 | right_shoulder2 | hinge | torque (N m) |
| 13 | Torque applied on the rotor between the right upper arm and right lower arm | -0.4 | 0.4 | right_elbow | hinge | torque (N m) |
| 14 | Torque applied on the rotor between the torso and left upper arm (coordinate -1) | -0.4 | 0.4 | left_shoulder1 | hinge | torque (N m) |
| 15 | Torque applied on the rotor between the torso and left upper arm (coordinate -2) | -0.4 | 0.4 | left_shoulder2 | hinge | torque (N m) |
| 16 | Torque applied on the rotor between the left upper arm and left lower arm | -0.4 | 0.4 | left_elbow | hinge | torque (N m) |
### Observation Space
The state space consists of positional values of different body parts of the Humanoid,
followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities.
The observation is a `ndarray` with shape `(376,)` where the elements correspond to the following:
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|-----|---------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
| 0 | x-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) |
| 1 | y-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) |
| 2 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) |
| 3 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
| 4 | y-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
| 5 | z-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
| 6 | w-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) |
| 7 | z-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | angle (rad) |
| 8 | y-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | angle (rad) |
| 9 | x-angle of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angle (rad) |
| 10 | x-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | angle (rad) |
| 11 | z-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | angle (rad) |
| 12 | y-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | angle (rad) |
| 13 | angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | angle (rad) |
| 14 | x-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | angle (rad) |
| 15 | z-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | angle (rad) |
| 16 | y-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | angle (rad) |
| 17 | angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | angle (rad) |
| 18 | coordinate-1 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | angle (rad) |
| 19 | coordinate-2 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | angle (rad) |
| 20 | angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | angle (rad) |
| 21 | coordinate-1 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | angle (rad) |
| 22 | coordinate-2 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | angle (rad) |
| 23 | angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angle (rad) |
| 24 | x-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
| 25 | y-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
| 26 | z-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) |
| 27 | x-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
| 28 | y-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
| 29 | z-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) |
| 30 | z-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | anglular velocity (rad/s) |
| 31 | y-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | anglular velocity (rad/s) |
| 32 | x-coordinate of angular velocity of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | aanglular velocity (rad/s) |
| 33 | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | anglular velocity (rad/s) |
| 34 | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | anglular velocity (rad/s) |
| 35 | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | anglular velocity (rad/s) |
| 36 | angular velocity of the angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | anglular velocity (rad/s) |
| 37 | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | anglular velocity (rad/s) |
| 38 | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | anglular velocity (rad/s) |
| 39 | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | anglular velocity (rad/s) |
| 40 | angular velocity of the angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | anglular velocity (rad/s) |
| 41 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | anglular velocity (rad/s) |
| 42 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | anglular velocity (rad/s) |
| 43 | angular velocity of the angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | anglular velocity (rad/s) |
| 44 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | anglular velocity (rad/s) |
| 45 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | anglular velocity (rad/s) |
| 46 | angular velocitty of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | anglular velocity (rad/s) |
Additionally, after all the positional and velocity based values in the table,
the state_space consists of (in order):
- *cinert:* Mass and inertia of a single rigid body relative to the center of mass
(this is an intermediate result of transition). It has shape 14*10 (*nbody * 10*)
and hence adds to another 140 elements in the state space.
- *cvel:* Center of mass based velocity. It has shape 14 * 6 (*nbody * 6*) and hence
adds another 84 elements in the state space
- *qfrc_actuator:* Constraint force generated as the actuator force. This has shape
`(23,)` *(nv * 1)* and hence adds another 23 elements to the state space.
- *cfrc_ext:* This is the center of mass based external force on the body. It has shape
14 * 6 (*nbody * 6*) and hence adds to another 84 elements in the state space.
where *nbody* stands for the number of bodies in the robot and *nv* stands for the number
of degrees of freedom (*= dim(qvel)*)
The (x,y,z) coordinates are translational DOFs while the orientations are rotational
DOFs expressed as quaternions. One can read more about free joints on the
[Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html).
**Note:** There are 47 elements in the table above - giving rise to `(378,)`
elements in the state space. In practice (and Gym implementation), the first two
positional elements are omitted from the state space since the reward function is
calculated based on the x-coordinate value. This value is hidden from the algorithm,
which in turn has to develop an abstract understanding of it from the observed rewards.
Therefore, observation space has shape `(376,)` instead of `(378,)` and the table should not have the first two rows.
**Note:** HumanoidStandup-v4 environment no longer has the following contact forces issue.
If using previous HumanoidStandup versions from v4, there have been reported issues that using a Mujoco-Py version > 2.0 results
in the contact forces always being 0. As such we recommend to use a Mujoco-Py version < 2.0
when using the Humanoid environment if you would like to report results with contact forces
(if contact forces are not used in your experiments, you can use version > 2.0).
### Rewards
The reward consists of three parts:
- *uph_cost*: A reward for moving upward (in an attempt to stand up). This is not a relative
reward which meaures how much upward it has moved from the last timestep, but it is an
absolute reward which measures how much upward the Humanoid has moved overall. It is
measured as *(z coordinate after action - 0)/(atomic timestep)*, where *z coordinate after
action* is index 0 in the state/index 2 in the table, and *atomic timestep* is the time for
one frame of movement even though the simulation has a framerate of 5 (done in order to inflate
rewards a little for fassteer learning).
- *quad_ctrl_cost*: A negative reward for penalising the humanoid if it has too large of
a control force. If there are *nu* actuators/controls, then the control has shape `nu x 1`.
It is measured as *0.1 **x** sum(control<sup>2</sup>)*.
- *quad_impact_cost*: A negative reward for penalising the humanoid if the external
contact force is too large. It is calculated as *min(0.5 * 0.000001 * sum(external
contact force<sup>2</sup>), 10)*.
The total reward returned is ***reward*** *=* *uph_cost + 1 - quad_ctrl_cost - quad_impact_cost*
### Starting State
All observations start in state
(0.0, 0.0, 0.105, 1.0, 0.0 ... 0.0) with a uniform noise in the range of
[-0.01, 0.01] added to the positional and velocity values (values in the table)
for stochasticity. Note that the initial z coordinate is intentionally selected
to be low, thereby indicating a laying down humanoid. The initial orientation is
designed to make it face forward as well.
### Episode Termination
The episode terminates when any of the following happens:
1. The episode duration reaches a 1000 timesteps
2. Any of the state space values is no longer finite
### Arguments
No additional arguments are currently supported (in v2 and lower), but
modifications can be made to the XML file in the assets folder
(or by changing the path to a modified XML file in another folder)..
```
env = gym.make('HumanoidStandup-v2')
```
There is no v3 for HumanoidStandup, unlike the robot environments where a v3 and
beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
### Version History
* v4: all mujoco environments now use the mujoco binidings in mujoco>=2.1.3
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
* v2: All continuous control environments now use mujoco_py >= 1.50
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
* v0: Initial versions release (1.0.0)
"""
def __init__(self):
    # Load the humanoid standup model; each env step advances 5 simulator frames.
    mujoco_env.MujocoEnv.__init__(self, "humanoidstandup.xml", 5)
    utils.EzPickle.__init__(self)
def _get_obs(self):
data = self.data
return np.concatenate(
[
data.qpos.flat[2:],
data.qvel.flat,
data.cinert.flat,
data.cvel.flat,
data.qfrc_actuator.flat,
data.cfrc_ext.flat,
]
)
def step(self, a):
    """Advance the simulation by one environment step.

    Returns the tuple ``(observation, reward, done, info)`` where the
    reward is ``uph_cost - quad_ctrl_cost - quad_impact_cost + 1``.
    """
    self.do_simulation(a, self.frame_skip)
    data = self.data
    # Height-based reward: how fast the torso's z coordinate rises.
    uph_cost = (data.qpos[2] - 0) / self.model.opt.timestep
    # Quadratic penalty on the control signal.
    quad_ctrl_cost = 0.1 * np.square(data.ctrl).sum()
    # Quadratic penalty on external contact forces, capped at 10.
    quad_impact_cost = min(0.5e-6 * np.square(data.cfrc_ext).sum(), 10)
    reward = 1 + uph_cost - quad_ctrl_cost - quad_impact_cost
    done = False  # this task never terminates early
    info = {
        "reward_linup": uph_cost,
        "reward_quadctrl": -quad_ctrl_cost,
        "reward_impact": -quad_impact_cost,
    }
    return self._get_obs(), reward, done, info
def reset_model(self):
    """Reset to the initial pose with small uniform noise for stochasticity."""
    noise = 0.01
    # Draw position noise first, then velocity noise (RNG draw order matters
    # for reproducibility with a seeded generator).
    qpos = self.init_qpos + self.np_random.uniform(
        low=-noise, high=noise, size=self.model.nq
    )
    qvel = self.init_qvel + self.np_random.uniform(
        low=-noise, high=noise, size=self.model.nv
    )
    self.set_state(qpos, qvel)
    return self._get_obs()
def viewer_setup(self):
    # Camera follows body id 1 at the full model extent, looking at a fixed
    # height (z = 0.8925) with a slight downward tilt.
    self.viewer.cam.trackbodyid = 1
    self.viewer.cam.distance = self.model.stat.extent * 1.0
    self.viewer.cam.lookat[2] = 0.8925
    self.viewer.cam.elevation = -20

View File

@@ -5,113 +5,10 @@ from gym.envs.mujoco import mujoco_env
class InvertedDoublePendulumEnv(mujoco_env.MujocoEnv, utils.EzPickle):
"""
### Description
This environment originates from control theory and builds on the cartpole
environment based on the work done by Barto, Sutton, and Anderson in
["Neuronlike adaptive elements that can solve difficult learning control problems"](https://ieeexplore.ieee.org/document/6313077),
powered by the Mujoco physics simulator - allowing for more complex experiments
(such as varying the effects of gravity or constraints). This environment involves a cart that can
moved linearly, with a pole fixed on it and a second pole fixed on the other end of the first one
(leaving the second pole as the only one with one free end). The cart can be pushed left or right,
and the goal is to balance the second pole on top of the first pole, which is in turn on top of the
cart, by applying continuous forces on the cart.
### Action Space
The agent take a 1-element vector for actions.
The action space is a continuous `(action)` in `[-1, 1]`, where `action` represents the
numerical force applied to the cart (with magnitude representing the amount of force and
sign representing the direction)
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|-----|---------------------------|-------------|-------------|----------------------------------|-------|-----------|
| 0 | Force applied on the cart | -1 | 1 | slider | slide | Force (N) |
### Observation Space
The state space consists of positional values of different body parts of the pendulum system,
followed by the velocities of those individual parts (their derivatives) with all the
positions ordered before all the velocities.
The observation is a `ndarray` with shape `(11,)` where the elements correspond to the following:
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
| 0 | position of the cart along the linear surface | -Inf | Inf | slider | slide | position (m) |
| 1 | sine of the angle between the cart and the first pole | -Inf | Inf | sin(hinge) | hinge | unitless |
| 2 | sine of the angle between the two poles | -Inf | Inf | sin(hinge2) | hinge | unitless |
| 3 | cosine of the angle between the cart and the first pole | -Inf | Inf | cos(hinge) | hinge | unitless |
| 4 | cosine of the angle between the two poles | -Inf | Inf | cos(hinge2) | hinge | unitless |
| 5 | velocity of the cart | -Inf | Inf | slider | slide | velocity (m/s) |
| 6 | angular velocity of the angle between the cart and the first pole | -Inf | Inf | hinge | hinge | angular velocity (rad/s) |
| 7 | angular velocity of the angle between the two poles | -Inf | Inf | hinge2 | hinge | angular velocity (rad/s) |
| 8 | constraint force - 1 | -Inf | Inf | | | Force (N) |
| 9 | constraint force - 2 | -Inf | Inf | | | Force (N) |
| 10 | constraint force - 3 | -Inf | Inf | | | Force (N) |
There is physical contact between the robots and their environment - and Mujoco
attempts at getting realistic physics simulations for the possible physical contact
dynamics by aiming for physical accuracy and computational efficiency.
There is one constraint force for contacts for each degree of freedom (3).
The approach and handling of constraints by Mujoco is unique to the simulator
and is based on their research. One can find more information in their
[*documentation*](https://mujoco.readthedocs.io/en/latest/computation.html)
or in their paper
["Analytically-invertible dynamics with contacts and constraints: Theory and implementation in MuJoCo"](https://homes.cs.washington.edu/~todorov/papers/TodorovICRA14.pdf).
### Rewards
The reward consists of two parts:
- *alive_bonus*: The goal is to make the second inverted pendulum stand upright
(within a certain angle limit) as long as possible - as such a reward of +10 is awarded
for each timestep that the second pole is upright.
- *distance_penalty*: This reward is a measure of how far the *tip* of the second pendulum
(the only free end) moves, and it is calculated as
*0.01 * x<sup>2</sup> + (y - 2)<sup>2</sup>*, where *x* is the x-coordinate of the tip
and *y* is the y-coordinate of the tip of the second pole.
- *velocity_penalty*: A negative reward for penalising the agent if it moves too
fast *0.001 * v<sub>1</sub><sup>2</sup> + 0.005 * v<sub>2</sub> <sup>2</sup>*
The total reward returned is ***reward*** *=* *alive_bonus - distance_penalty - velocity_penalty*
### Starting State
All observations start in state
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) with a uniform noise in the range
of [-0.1, 0.1] added to the positional values (cart position and pole angles) and standard
normal force with a standard deviation of 0.1 added to the velocity values for stochasticity.
### Episode Termination
The episode terminates when any of the following happens:
1. The episode duration reaches 1000 timesteps.
2. Any of the state space values is no longer finite.
3. The y_coordinate of the tip of the second pole *is less than or equal* to 1. The maximum standing height of the system is 1.196 m when all the parts are perpendicularly vertical on top of each other).
### Arguments
No additional arguments are currently supported.
```
env = gym.make('InvertedDoublePendulum-v2')
```
There is no v3 for InvertedPendulum, unlike the robot environments where a v3 and
beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
### Version History
* v2: All continuous control environments now use mujoco_py >= 1.50
* v1: max_time_steps raised to 1000 for robot based tasks (including inverted pendulum)
* v0: Initial versions release (1.0.0)
"""
def __init__(self):
mujoco_env.MujocoEnv.__init__(self, "inverted_double_pendulum.xml", 5)
mujoco_env.MujocoEnv.__init__(
self, "inverted_double_pendulum.xml", 5, mujoco_bindings="mujoco_py"
)
utils.EzPickle.__init__(self)
def step(self, action):

View File

@@ -0,0 +1,156 @@
import numpy as np
from gym import utils
from gym.envs.mujoco import mujoco_env
class InvertedDoublePendulumEnv(mujoco_env.MujocoEnv, utils.EzPickle):
    """
    ### Description
    This environment originates from control theory and builds on the cartpole
    environment based on the work done by Barto, Sutton, and Anderson in
    ["Neuronlike adaptive elements that can solve difficult learning control problems"](https://ieeexplore.ieee.org/document/6313077),
    powered by the Mujoco physics simulator - allowing for more complex experiments
    (such as varying the effects of gravity or constraints). This environment involves a cart that can be
    moved linearly, with a pole fixed on it and a second pole fixed on the other end of the first one
    (leaving the second pole as the only one with one free end). The cart can be pushed left or right,
    and the goal is to balance the second pole on top of the first pole, which is in turn on top of the
    cart, by applying continuous forces on the cart.
    ### Action Space
    The agent takes a 1-element vector for actions.
    The action space is a continuous `(action)` in `[-1, 1]`, where `action` represents the
    numerical force applied to the cart (with magnitude representing the amount of force and
    sign representing the direction)
    | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
    |-----|---------------------------|-------------|-------------|----------------------------------|-------|-----------|
    | 0 | Force applied on the cart | -1 | 1 | slider | slide | Force (N) |
    ### Observation Space
    The state space consists of positional values of different body parts of the pendulum system,
    followed by the velocities of those individual parts (their derivatives) with all the
    positions ordered before all the velocities.
    The observation is a `ndarray` with shape `(11,)` where the elements correspond to the following:
    | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
    |-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
    | 0 | position of the cart along the linear surface | -Inf | Inf | slider | slide | position (m) |
    | 1 | sine of the angle between the cart and the first pole | -Inf | Inf | sin(hinge) | hinge | unitless |
    | 2 | sine of the angle between the two poles | -Inf | Inf | sin(hinge2) | hinge | unitless |
    | 3 | cosine of the angle between the cart and the first pole | -Inf | Inf | cos(hinge) | hinge | unitless |
    | 4 | cosine of the angle between the two poles | -Inf | Inf | cos(hinge2) | hinge | unitless |
    | 5 | velocity of the cart | -Inf | Inf | slider | slide | velocity (m/s) |
    | 6 | angular velocity of the angle between the cart and the first pole | -Inf | Inf | hinge | hinge | angular velocity (rad/s) |
    | 7 | angular velocity of the angle between the two poles | -Inf | Inf | hinge2 | hinge | angular velocity (rad/s) |
    | 8 | constraint force - 1 | -Inf | Inf | | | Force (N) |
    | 9 | constraint force - 2 | -Inf | Inf | | | Force (N) |
    | 10 | constraint force - 3 | -Inf | Inf | | | Force (N) |
    There is physical contact between the robots and their environment - and Mujoco
    attempts at getting realistic physics simulations for the possible physical contact
    dynamics by aiming for physical accuracy and computational efficiency.
    There is one constraint force for contacts for each degree of freedom (3).
    The approach and handling of constraints by Mujoco is unique to the simulator
    and is based on their research. One can find more information in their
    [*documentation*](https://mujoco.readthedocs.io/en/latest/computation.html)
    or in their paper
    ["Analytically-invertible dynamics with contacts and constraints: Theory and implementation in MuJoCo"](https://homes.cs.washington.edu/~todorov/papers/TodorovICRA14.pdf).
    ### Rewards
    The reward consists of two parts:
    - *alive_bonus*: The goal is to make the second inverted pendulum stand upright
    (within a certain angle limit) as long as possible - as such a reward of +10 is awarded
    for each timestep that the second pole is upright.
    - *distance_penalty*: This reward is a measure of how far the *tip* of the second pendulum
    (the only free end) moves, and it is calculated as
    *0.01 * x<sup>2</sup> + (y - 2)<sup>2</sup>*, where *x* is the x-coordinate of the tip
    and *y* is the y-coordinate of the tip of the second pole.
    - *velocity_penalty*: A negative reward for penalising the agent if it moves too
    fast *0.001 * v<sub>1</sub><sup>2</sup> + 0.005 * v<sub>2</sub> <sup>2</sup>*
    The total reward returned is ***reward*** *=* *alive_bonus - distance_penalty - velocity_penalty*
    ### Starting State
    All observations start in state
    (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) with a uniform noise in the range
    of [-0.1, 0.1] added to the positional values (cart position and pole angles) and standard
    normal force with a standard deviation of 0.1 added to the velocity values for stochasticity.
    ### Episode Termination
    The episode terminates when any of the following happens:
    1. The episode duration reaches 1000 timesteps.
    2. Any of the state space values is no longer finite.
    3. The y_coordinate of the tip of the second pole *is less than or equal* to 1. The maximum standing height of the system is 1.196 m when all the parts are perpendicularly vertical on top of each other).
    ### Arguments
    No additional arguments are currently supported (in v2 and lower), but modifications can
    be made to the XML file in the assets folder (or by changing the path to a modified XML
    file in another folder).
    ```
    env = gym.make('InvertedDoublePendulum-v2')
    ```
    There is no v3 for InvertedPendulum, unlike the robot environments where a v3 and
    beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
    ### Version History
    * v4: all mujoco environments now use the mujoco bindings in mujoco>=2.1.3
    * v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
    * v2: All continuous control environments now use mujoco_py >= 1.50
    * v1: max_time_steps raised to 1000 for robot based tasks (including inverted pendulum)
    * v0: Initial versions release (1.0.0)
    """

    def __init__(self):
        # Load the double pendulum model; each env step advances 5 simulator frames.
        mujoco_env.MujocoEnv.__init__(self, "inverted_double_pendulum.xml", 5)
        utils.EzPickle.__init__(self)

    def step(self, action):
        """Run one control step; returns (observation, reward, done, info)."""
        self.do_simulation(action, self.frame_skip)
        ob = self._get_obs()
        # Cartesian position of site 0 (the tip of the second pole).
        x, _, y = self.data.site_xpos[0]
        dist_penalty = 0.01 * x**2 + (y - 2) ** 2
        # Angular velocities of the two hinge joints.
        v1, v2 = self.data.qvel[1:3]
        vel_penalty = 1e-3 * v1**2 + 5e-3 * v2**2
        alive_bonus = 10
        r = alive_bonus - dist_penalty - vel_penalty
        # Episode ends once the tip of the second pole falls to y <= 1.
        done = bool(y <= 1)
        return ob, r, done, {}

    def _get_obs(self):
        """Concatenate cart position, sin/cos of joint angles, and clipped
        velocities and constraint forces into one flat vector."""
        return np.concatenate(
            [
                self.data.qpos[:1],  # cart x pos
                np.sin(self.data.qpos[1:]),  # link angles
                np.cos(self.data.qpos[1:]),
                np.clip(self.data.qvel, -10, 10),
                np.clip(self.data.qfrc_constraint, -10, 10),
            ]
        ).ravel()

    def reset_model(self):
        # Uniform noise on positions, Gaussian noise on velocities.
        self.set_state(
            self.init_qpos
            + self.np_random.uniform(low=-0.1, high=0.1, size=self.model.nq),
            self.init_qvel + self.np_random.standard_normal(self.model.nv) * 0.1,
        )
        return self._get_obs()

    def viewer_setup(self):
        # Camera tracks body 0 at half the model extent.
        v = self.viewer
        v.cam.trackbodyid = 0
        v.cam.distance = self.model.stat.extent * 0.5
        v.cam.lookat[2] = 0.12250000000000005  # v.model.stat.center[2]

View File

@@ -5,85 +5,11 @@ from gym.envs.mujoco import mujoco_env
class InvertedPendulumEnv(mujoco_env.MujocoEnv, utils.EzPickle):
"""
### Description
This environment is the cartpole environment based on the work done by
Barto, Sutton, and Anderson in ["Neuronlike adaptive elements that can
solve difficult learning control problems"](https://ieeexplore.ieee.org/document/6313077),
just like in the classic environments but now powered by the Mujoco physics simulator -
allowing for more complex experiments (such as varying the effects of gravity).
This environment involves a cart that can moved linearly, with a pole fixed on it
at one end and having another end free. The cart can be pushed left or right, and the
goal is to balance the pole on the top of the cart by applying forces on the cart.
### Action Space
The agent take a 1-element vector for actions.
The action space is a continuous `(action)` in `[-3, 3]`, where `action` represents
the numerical force applied to the cart (with magnitude representing the amount of
force and sign representing the direction)
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|-----|---------------------------|-------------|-------------|----------------------------------|-------|-----------|
| 0 | Force applied on the cart | -3 | 3 | slider | slide | Force (N) |
### Observation Space
The state space consists of positional values of different body parts of
the pendulum system, followed by the velocities of those individual parts (their derivatives)
with all the positions ordered before all the velocities.
The observation is a `ndarray` with shape `(4,)` where the elements correspond to the following:
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
| 0 | position of the cart along the linear surface | -Inf | Inf | slider | slide | position (m) |
| 1 | vertical angle of the pole on the cart | -Inf | Inf | hinge | hinge | angle (rad) |
| 2 | linear velocity of the cart | -Inf | Inf | slider | slide | velocity (m/s) |
| 3 | angular velocity of the pole on the cart | -Inf | Inf | hinge | hinge | angular velocity (rad/s) |
### Rewards
The goal is to make the inverted pendulum stand upright (within a certain angle limit)
as long as possible - as such a reward of +1 is awarded for each timestep that
the pole is upright.
### Starting State
All observations start in state
(0.0, 0.0, 0.0, 0.0) with a uniform noise in the range
of [-0.01, 0.01] added to the values for stochasticity.
### Episode Termination
The episode terminates when any of the following happens:
1. The episode duration reaches 1000 timesteps.
2. Any of the state space values is no longer finite.
3. The absolute value of the vertical angle between the pole and the cart is greater than 0.2 radian.
### Arguments
No additional arguments are currently supported.
```
env = gym.make('InvertedPendulum-v2')
```
There is no v3 for InvertedPendulum, unlike the robot environments where a
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
### Version History
* v2: All continuous control environments now use mujoco_py >= 1.50
* v1: max_time_steps raised to 1000 for robot based tasks (including inverted pendulum)
* v0: Initial versions release (1.0.0)
"""
def __init__(self):
utils.EzPickle.__init__(self)
mujoco_env.MujocoEnv.__init__(self, "inverted_pendulum.xml", 2)
mujoco_env.MujocoEnv.__init__(
self, "inverted_pendulum.xml", 2, mujoco_bindings="mujoco_py"
)
def step(self, a):
reward = 1.0

View File

@@ -0,0 +1,116 @@
import numpy as np
from gym import utils
from gym.envs.mujoco import mujoco_env
class InvertedPendulumEnv(mujoco_env.MujocoEnv, utils.EzPickle):
    """
    ### Description
    This environment is the cartpole environment based on the work done by
    Barto, Sutton, and Anderson in ["Neuronlike adaptive elements that can
    solve difficult learning control problems"](https://ieeexplore.ieee.org/document/6313077),
    just like in the classic environments but now powered by the Mujoco physics simulator -
    allowing for more complex experiments (such as varying the effects of gravity).
    This environment involves a cart that can be moved linearly, with a pole fixed on it
    at one end and having another end free. The cart can be pushed left or right, and the
    goal is to balance the pole on the top of the cart by applying forces on the cart.
    ### Action Space
    The agent takes a 1-element vector for actions.
    The action space is a continuous `(action)` in `[-3, 3]`, where `action` represents
    the numerical force applied to the cart (with magnitude representing the amount of
    force and sign representing the direction)
    | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
    |-----|---------------------------|-------------|-------------|----------------------------------|-------|-----------|
    | 0 | Force applied on the cart | -3 | 3 | slider | slide | Force (N) |
    ### Observation Space
    The state space consists of positional values of different body parts of
    the pendulum system, followed by the velocities of those individual parts (their derivatives)
    with all the positions ordered before all the velocities.
    The observation is a `ndarray` with shape `(4,)` where the elements correspond to the following:
    | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
    |-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
    | 0 | position of the cart along the linear surface | -Inf | Inf | slider | slide | position (m) |
    | 1 | vertical angle of the pole on the cart | -Inf | Inf | hinge | hinge | angle (rad) |
    | 2 | linear velocity of the cart | -Inf | Inf | slider | slide | velocity (m/s) |
    | 3 | angular velocity of the pole on the cart | -Inf | Inf | hinge | hinge | angular velocity (rad/s) |
    ### Rewards
    The goal is to make the inverted pendulum stand upright (within a certain angle limit)
    as long as possible - as such a reward of +1 is awarded for each timestep that
    the pole is upright.
    ### Starting State
    All observations start in state
    (0.0, 0.0, 0.0, 0.0) with a uniform noise in the range
    of [-0.01, 0.01] added to the values for stochasticity.
    ### Episode Termination
    The episode terminates when any of the following happens:
    1. The episode duration reaches 1000 timesteps.
    2. Any of the state space values is no longer finite.
    3. The absolute value of the vertical angle between the pole and the cart is greater than 0.2 radian.
    ### Arguments
    No additional arguments are currently supported (in v2 and lower),
    but modifications can be made to the XML file in the assets folder
    (or by changing the path to a modified XML file in another folder).
    ```
    env = gym.make('InvertedPendulum-v2')
    ```
    There is no v3 for InvertedPendulum, unlike the robot environments where a
    v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
    ### Version History
    * v4: all mujoco environments now use the mujoco bindings in mujoco>=2.1.3
    * v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
    * v2: All continuous control environments now use mujoco_py >= 1.50
    * v1: max_time_steps raised to 1000 for robot based tasks (including inverted pendulum)
    * v0: Initial versions release (1.0.0)
    """

    def __init__(self):
        utils.EzPickle.__init__(self)
        # Load the inverted pendulum model; each env step advances 2 simulator frames.
        mujoco_env.MujocoEnv.__init__(self, "inverted_pendulum.xml", 2)

    def step(self, a):
        """Apply action ``a``; +1 reward per step while the pole stays upright."""
        reward = 1.0
        self.do_simulation(a, self.frame_skip)
        ob = self._get_obs()
        # Healthy while every observation is finite and the pole is within
        # 0.2 rad of vertical.
        notdone = np.isfinite(ob).all() and (np.abs(ob[1]) <= 0.2)
        done = not notdone
        return ob, reward, done, {}

    def reset_model(self):
        # Small uniform noise around the initial pose for stochasticity.
        qpos = self.init_qpos + self.np_random.uniform(
            size=self.model.nq, low=-0.01, high=0.01
        )
        qvel = self.init_qvel + self.np_random.uniform(
            size=self.model.nv, low=-0.01, high=0.01
        )
        self.set_state(qpos, qvel)
        return self._get_obs()

    def _get_obs(self):
        # Observation = flattened [qpos, qvel]: cart pos, pole angle, velocities.
        return np.concatenate([self.data.qpos, self.data.qvel]).ravel()

    def viewer_setup(self):
        # Free camera tracking body 0 at the full model extent.
        v = self.viewer
        v.cam.trackbodyid = 0
        v.cam.distance = self.model.stat.extent

View File

@@ -1,4 +1,3 @@
import os
from collections import OrderedDict
from os import path
from typing import Optional
@@ -6,18 +5,9 @@ from typing import Optional
import numpy as np
import gym
from gym import error, spaces
from gym import error, logger, spaces
try:
import mujoco_py
except ImportError as e:
raise error.DependencyNotInstalled(
"{}. (HINT: you need to install mujoco_py, and also perform the setup instructions here: https://github.com/openai/mujoco-py/.)".format(
e
)
)
DEFAULT_SIZE = 500
DEFAULT_SIZE = 480
def convert_observation_to_space(observation):
@@ -43,28 +33,64 @@ def convert_observation_to_space(observation):
class MujocoEnv(gym.Env):
"""Superclass for all MuJoCo environments."""
def __init__(self, model_path, frame_skip):
def __init__(self, model_path, frame_skip, mujoco_bindings="mujoco"):
if model_path.startswith("/"):
fullpath = model_path
else:
fullpath = os.path.join(os.path.dirname(__file__), "assets", model_path)
fullpath = path.join(path.dirname(__file__), "assets", model_path)
if not path.exists(fullpath):
raise OSError(f"File {fullpath} does not exist")
self.frame_skip = frame_skip
self.model = mujoco_py.load_model_from_path(fullpath)
self.sim = mujoco_py.MjSim(self.model)
if mujoco_bindings == "mujoco_py":
logger.warn(
"This version of the mujoco environments depends "
"on the mujoco-py bindings, which are no longer maintained "
"and may stop working. Please upgrade to the v4 versions of "
"the environments (which depend on the mujoco python bindings instead), unless "
"you are trying to precisely replicate previous works)."
)
try:
import mujoco_py
self._mujoco_bindings = mujoco_py
except ImportError as e:
raise error.DependencyNotInstalled(
"{}. (HINT: you need to install mujoco_py, and also perform the setup instructions here: https://github.com/openai/mujoco-py/.)".format(
e
)
)
self.model = self._mujoco_bindings.load_model_from_path(fullpath)
self.sim = self._mujoco_bindings.MjSim(self.model)
self.data = self.sim.data
self.viewer = None
elif mujoco_bindings == "mujoco":
try:
import mujoco
self._mujoco_bindings = mujoco
except ImportError as e:
raise error.DependencyNotInstalled(
f"{e}. (HINT: you need to install mujoco)"
)
self.model = self._mujoco_bindings.MjModel.from_xml_path(fullpath)
self.data = self._mujoco_bindings.MjData(self.model)
self.init_qpos = self.data.qpos.ravel().copy()
self.init_qvel = self.data.qvel.ravel().copy()
self._viewers = {}
self.frame_skip = frame_skip
self.viewer = None
self.metadata = {
"render_modes": ["human", "rgb_array", "depth_array"],
"render_fps": int(np.round(1.0 / self.dt)),
}
self.init_qpos = self.sim.data.qpos.ravel().copy()
self.init_qvel = self.sim.data.qvel.ravel().copy()
self._set_action_space()
action = self.action_space.sample()
@@ -111,7 +137,12 @@ class MujocoEnv(gym.Env):
options: Optional[dict] = None,
):
super().reset(seed=seed)
if self._mujoco_bindings.__name__ == "mujoco_py":
self.sim.reset()
else:
self._mujoco_bindings.mj_resetData(self.model, self.data)
ob = self.reset_model()
if not return_info:
return ob
@@ -120,12 +151,19 @@ class MujocoEnv(gym.Env):
def set_state(self, qpos, qvel):
assert qpos.shape == (self.model.nq,) and qvel.shape == (self.model.nv,)
old_state = self.sim.get_state()
new_state = mujoco_py.MjSimState(
old_state.time, qpos, qvel, old_state.act, old_state.udd_state
if self._mujoco_bindings.__name__ == "mujoco_py":
state = self.sim.get_state()
state = self._mujoco_bindings.MjSimState(
state.time, qpos, qvel, state.act, state.udd_state
)
self.sim.set_state(new_state)
self.sim.set_state(state)
self.sim.forward()
else:
self.data.qpos[:] = np.copy(qpos)
self.data.qvel[:] = np.copy(qvel)
if self.model.na == 0:
self.data.act[:] = None
self._mujoco_bindings.mj_forward(self.model, self.data)
@property
def dt(self):
@@ -135,9 +173,21 @@ class MujocoEnv(gym.Env):
if np.array(ctrl).shape != self.action_space.shape:
raise ValueError("Action dimension mismatch")
if self._mujoco_bindings.__name__ == "mujoco_py":
self.sim.data.ctrl[:] = ctrl
else:
self.data.ctrl[:] = ctrl
for _ in range(n_frames):
if self._mujoco_bindings.__name__ == "mujoco_py":
self.sim.step()
else:
self._mujoco_bindings.mj_step(self.model, self.data)
# As of MuJoCo 2.0, force-related quantities like cacc are not computed
# unless there's a force sensor in the model.
# See https://github.com/openai/gym/issues/1541
if self._mujoco_bindings.__name__ != "mujoco_py":
self._mujoco_bindings.mj_rnePostConstraint(self.model, self.data)
def render(
self,
@@ -158,19 +208,25 @@ class MujocoEnv(gym.Env):
if no_camera_specified:
camera_name = "track"
if camera_id is None and camera_name in self.model._camera_name2id:
if camera_id is None:
if self._mujoco_bindings.__name__ == "mujoco_py":
if camera_name in self.model._camera_name2id:
camera_id = self.model.camera_name2id(camera_name)
else:
camera_id = self._mujoco_bindings.mj_name2id(
self.model,
self._mujoco_bindings.mjtObj.mjOBJ_CAMERA,
camera_name,
)
self._get_viewer(mode).render(width, height, camera_id=camera_id)
if mode == "rgb_array":
# window size used for old mujoco-py:
data = self._get_viewer(mode).read_pixels(width, height, depth=False)
# original image is upside-down, so flip it
return data[::-1, :, :]
elif mode == "depth_array":
self._get_viewer(mode).render(width, height)
# window size used for old mujoco-py:
# Extract depth part of the read_pixels() tuple
data = self._get_viewer(mode).read_pixels(width, height, depth=True)[1]
# original image is upside-down, so flip it
@@ -180,24 +236,40 @@ class MujocoEnv(gym.Env):
def close(self):
if self.viewer is not None:
# self.viewer.finish()
self.viewer = None
self._viewers = {}
def _get_viewer(self, mode):
def _get_viewer(self, mode, width=DEFAULT_SIZE, height=DEFAULT_SIZE):
self.viewer = self._viewers.get(mode)
if self.viewer is None:
if mode == "human":
self.viewer = mujoco_py.MjViewer(self.sim)
if self._mujoco_bindings.__name__ == "mujoco_py":
self.viewer = self._mujoco_bindings.MjViewer(self.sim)
else:
from gym.envs.mujoco.mujoco_rendering import Viewer
self.viewer = Viewer(self.model, self.data)
elif mode == "rgb_array" or mode == "depth_array":
self.viewer = mujoco_py.MjRenderContextOffscreen(self.sim, -1)
if self._mujoco_bindings.__name__ == "mujoco_py":
self.viewer = self._mujoco_bindings.MjRenderContextOffscreen(
self.sim, -1
)
else:
from gym.envs.mujoco.mujoco_rendering import RenderContextOffscreen
self.viewer = RenderContextOffscreen(
width, height, self.model, self.data
)
self.viewer_setup()
self._viewers[mode] = self.viewer
return self.viewer
def get_body_com(self, body_name):
if self._mujoco_bindings.__name__ == "mujoco_py":
return self.data.get_body_xpos(body_name)
else:
return self.data.body(body_name).xpos
def state_vector(self):
return np.concatenate([self.sim.data.qpos.flat, self.sim.data.qvel.flat])
return np.concatenate([self.data.qpos.flat, self.data.qvel.flat])

View File

@@ -0,0 +1,554 @@
import collections
import os
import sys
import time
from threading import Lock
import glfw
import imageio
import mujoco
import numpy as np
def _import_egl(width, height):
    # EGL backend: GPU-accelerated rendering without a display server.
    from mujoco.egl import GLContext

    return GLContext(width, height)


def _import_glfw(width, height):
    # GLFW backend: windowed rendering (requires a display).
    from mujoco.glfw import GLContext

    return GLContext(width, height)


def _import_osmesa(width, height):
    # OSMesa backend: software (CPU) rendering fallback.
    from mujoco.osmesa import GLContext

    return GLContext(width, height)


# OpenGL context factories keyed by backend name; insertion order is the
# preference order in which backends are tried.
_ALL_RENDERERS = collections.OrderedDict(
    [
        ("glfw", _import_glfw),
        ("egl", _import_egl),
        ("osmesa", _import_osmesa),
    ]
)
class RenderContext:
"""Render context superclass for offscreen and window rendering."""
def __init__(self, model, data, offscreen=True):
    """Create MuJoCo visualization structures for this render context.

    Args:
        model: the mujoco.MjModel to render.
        data: the mujoco.MjData paired with ``model``.
        offscreen: if True render to the offscreen framebuffer, otherwise
            to the window framebuffer.
    """
    self.model = model
    self.data = data
    self.offscreen = offscreen
    max_geom = 1000  # capacity of the abstract scene (max visual geoms)

    # Ensure derived quantities (positions etc.) are up to date before
    # building the scene.
    mujoco.mj_forward(self.model, self.data)

    self.scn = mujoco.MjvScene(self.model, max_geom)
    self.cam = mujoco.MjvCamera()
    self.vopt = mujoco.MjvOption()
    self.pert = mujoco.MjvPerturb()
    self.con = mujoco.MjrContext(self.model, mujoco.mjtFontScale.mjFONTSCALE_150)

    self._markers = []      # marker params to inject into each rendered frame
    self._overlays = {}     # gridpos -> (text1, text2) text overlays

    self._init_camera()
    self._set_mujoco_buffers()
def _set_mujoco_buffers(self):
    """Bind the MuJoCo framebuffer that matches this context's mode."""
    if self.offscreen:
        target = mujoco.mjtFramebuffer.mjFB_OFFSCREEN
        failure = "Offscreen rendering not supported"
    else:
        target = mujoco.mjtFramebuffer.mjFB_WINDOW
        failure = "Window rendering not supported"
    mujoco.mjr_setBuffer(target, self.con)
    # mjr_setBuffer silently falls back when the requested buffer is
    # unavailable, so verify the binding actually took effect.
    if self.con.currentBuffer != target:
        raise RuntimeError(failure)
def update_offscreen_size(self, width, height):
    """Resize the offscreen framebuffer to (width, height) if it changed.

    NOTE(review): ``self.con`` is freed but not re-created before
    ``_set_mujoco_buffers`` uses it — verify against the mujoco API that
    rebinding a freed MjrContext is intended here.
    """
    if width != self.con.offWidth or height != self.con.offHeight:
        # Record the new sizes on the model so a rebuilt context picks them up.
        self.model.vis.global_.offwidth = width
        self.model.vis.global_.offheight = height
        self.con.free()
        self._set_mujoco_buffers()
def render(self, width, height, camera_id=None, segmentation=False):
rect = mujoco.MjrRect(left=0, bottom=0, width=width, height=height)
# Sometimes buffers are too small.
if width > self.con.offWidth or height > self.con.offHeight:
new_width = max(width, self.model.vis.global_.offwidth)
new_height = max(height, self.model.vis.global_.offheight)
self.update_offscreen_size(new_width, new_height)
if camera_id is not None:
if camera_id == -1:
self.cam.type = mujoco.mjtCamera.mjCAMERA_FREE
else:
self.cam.type = mujoco.mjtCamera.mjCAMERA_FIXED
self.cam.fixedcamid = camera_id
mujoco.mjv_updateScene(
self.model,
self.data,
self.vopt,
self.pert,
self.cam,
mujoco.mjtCatBit.mjCAT_ALL,
self.scn,
)
if segmentation:
self.scn.flags[mujoco.mjtRndFlag.mjRND_SEGMENT] = 1
self.scn.flags[mujoco.mjtRndFlag.mjRND_IDCOLOR] = 1
for marker_params in self._markers:
self._add_marker_to_scene(marker_params)
mujoco.mjr_render(rect, self.scn, self.con)
for gridpos, (text1, text2) in self._overlays.items():
mujoco.mjr_overlay(
mujoco.mjtFontScale.mjFONTSCALE_150,
gridpos,
rect,
text1.encode(),
text2.encode(),
self.con,
)
if segmentation:
self.scn.flags[mujoco.mjtRndFlag.mjRND_SEGMENT] = 0
self.scn.flags[mujoco.mjtRndFlag.mjRND_IDCOLOR] = 0
def read_pixels(self, width, height, depth=True, segmentation=False):
rect = mujoco.MjrRect(left=0, bottom=0, width=width, height=height)
rgb_arr = np.zeros(3 * rect.width * rect.height, dtype=np.uint8)
depth_arr = np.zeros(rect.width * rect.height, dtype=np.float32)
mujoco.mjr_readPixels(rgb_arr, depth_arr, rect, self.con)
rgb_img = rgb_arr.reshape(rect.height, rect.width, 3)
ret_img = rgb_img
if segmentation:
seg_img = (
rgb_img[:, :, 0]
+ rgb_img[:, :, 1] * (2**8)
+ rgb_img[:, :, 2] * (2**16)
)
seg_img[seg_img >= (self.scn.ngeom + 1)] = 0
seg_ids = np.full((self.scn.ngeom + 1, 2), fill_value=-1, dtype=np.int32)
for i in range(self.scn.ngeom):
geom = self.scn.geoms[i]
if geom.segid != -1:
seg_ids[geom.segid + 1, 0] = geom.objtype
seg_ids[geom.segid + 1, 1] = geom.objid
ret_img = seg_ids[seg_img]
if depth:
depth_img = depth_arr.reshape(rect.height, rect.width)
return (ret_img, depth_img)
else:
return ret_img
def _init_camera(self):
self.cam.type = mujoco.mjtCamera.mjCAMERA_FREE
self.cam.fixedcamid = -1
for i in range(3):
self.cam.lookat[i] = np.median(self.data.geom_xpos[:, i])
self.cam.distance = self.model.stat.extent
def add_overlay(self, gridpos: int, text1: str, text2: str):
"""Overlays text on the scene."""
if gridpos not in self._overlays:
self._overlays[gridpos] = ["", ""]
self._overlays[gridpos][0] += text1 + "\n"
self._overlays[gridpos][1] += text2 + "\n"
def add_marker(self, **marker_params):
self._markers.append(marker_params)
def _add_marker_to_scene(self, marker):
if self.scn.ngeom >= self.scn.maxgeom:
raise RuntimeError("Ran out of geoms. maxgeom: %d" % self.scn.maxgeom)
g = self.scn.geoms[self.scn.ngeom]
# default values.
g.dataid = -1
g.objtype = mujoco.mjtObj.mjOBJ_UNKNOWN
g.objid = -1
g.category = mujoco.mjtCatBit.mjCAT_DECOR
g.texid = -1
g.texuniform = 0
g.texrepeat[0] = 1
g.texrepeat[1] = 1
g.emission = 0
g.specular = 0.5
g.shininess = 0.5
g.reflectance = 0
g.type = mujoco.mjtGeom.mjGEOM_BOX
g.size[:] = np.ones(3) * 0.1
g.mat[:] = np.eye(3)
g.rgba[:] = np.ones(4)
for key, value in marker.items():
if isinstance(value, (int, float, mujoco._enums.mjtGeom)):
setattr(g, key, value)
elif isinstance(value, (tuple, list, np.ndarray)):
attr = getattr(g, key)
attr[:] = np.asarray(value).reshape(attr.shape)
elif isinstance(value, str):
assert key == "label", "Only label is a string in mjtGeom."
if value is None:
g.label[0] = 0
else:
g.label = value
elif hasattr(g, key):
raise ValueError(
"mjtGeom has attr {} but type {} is invalid".format(
key, type(value)
)
)
else:
raise ValueError("mjtGeom doesn't have field %s" % key)
self.scn.ngeom += 1
class RenderContextOffscreen(RenderContext):
    """Offscreen rendering class with opengl context."""

    def __init__(self, width, height, model, data):
        # The OpenGL context must exist and be current before the MuJoCo
        # buffers are initialized by the superclass.
        self._get_opengl_backend(width, height)
        self.opengl_context.make_current()

        super().__init__(model, data, offscreen=True)

    def _get_opengl_backend(self, width, height):
        """Create ``self.opengl_context`` using an available GL backend.

        Honors the MUJOCO_GL environment variable when set; otherwise tries
        the renderers in ``_ALL_RENDERERS`` order (glfw, egl, osmesa) and
        keeps the first one that imports successfully.

        Raises:
            RuntimeError: if MUJOCO_GL names an unknown backend, or no
                backend could be imported.
        """
        backend = os.environ.get("MUJOCO_GL")
        if backend is not None:
            try:
                self.opengl_context = _ALL_RENDERERS[backend](width, height)
            except KeyError:
                raise RuntimeError(
                    "Environment variable {} must be one of {!r}: got {!r}.".format(
                        "MUJOCO_GL", _ALL_RENDERERS.keys(), backend
                    )
                )
        else:
            for name in _ALL_RENDERERS:
                try:
                    self.opengl_context = _ALL_RENDERERS[name](width, height)
                    backend = name
                    break
                # Was a bare `except:`: that also swallowed SystemExit and
                # KeyboardInterrupt.  Import/initialization failures of a
                # backend are ordinary exceptions.
                except Exception:
                    pass
            if backend is None:
                raise RuntimeError(
                    "No OpenGL backend could be imported. Attempting to create a "
                    "rendering context will result in a RuntimeError."
                )
class Viewer(RenderContext):
    """Class for window rendering in all MuJoCo environments."""

    def __init__(self, model, data):
        self._gui_lock = Lock()
        self._button_left_pressed = False
        self._button_right_pressed = False
        self._last_mouse_x = 0
        self._last_mouse_y = 0
        self._paused = False
        self._transparent = False
        self._contacts = False
        self._render_every_frame = True
        self._image_idx = 0
        self._image_path = "/tmp/frame_%07d.png"
        self._time_per_render = 1 / 60.0
        self._run_speed = 1.0
        self._loop_count = 0
        self._advance_by_one_step = False
        self._hide_menu = False

        # glfw init
        glfw.init()
        width, height = glfw.get_video_mode(glfw.get_primary_monitor()).size
        self.window = glfw.create_window(width // 2, height // 2, "mujoco", None, None)
        glfw.make_context_current(self.window)
        glfw.swap_interval(1)

        framebuffer_width, framebuffer_height = glfw.get_framebuffer_size(self.window)
        window_width, _ = glfw.get_window_size(self.window)
        # On HiDPI displays the framebuffer is larger than the logical window;
        # cursor coordinates arrive in window units and must be scaled.
        self._scale = framebuffer_width * 1.0 / window_width

        # set callbacks
        glfw.set_cursor_pos_callback(self.window, self._cursor_pos_callback)
        glfw.set_mouse_button_callback(self.window, self._mouse_button_callback)
        glfw.set_scroll_callback(self.window, self._scroll_callback)
        glfw.set_key_callback(self.window, self._key_callback)

        # get viewport
        self.viewport = mujoco.MjrRect(0, 0, framebuffer_width, framebuffer_height)

        super().__init__(model, data, offscreen=False)

    def _key_callback(self, window, key, scancode, action, mods):
        # GLFW invokes key callbacks as (window, key, scancode, action, mods);
        # the original signature (self, key, action) raised a TypeError on the
        # first key event.
        if action != glfw.RELEASE:
            return
        # Switch cameras
        elif key == glfw.KEY_TAB:
            self.cam.fixedcamid += 1
            self.cam.type = mujoco.mjtCamera.mjCAMERA_FIXED
            if self.cam.fixedcamid >= self.model.ncam:
                self.cam.fixedcamid = -1
                self.cam.type = mujoco.mjtCamera.mjCAMERA_FREE
        # Pause simulation
        elif key == glfw.KEY_SPACE and self._paused is not None:
            self._paused = not self._paused
        # Advances simulation by one step.
        elif key == glfw.KEY_RIGHT and self._paused is not None:
            self._advance_by_one_step = True
            self._paused = True
        # Slows down simulation
        elif key == glfw.KEY_S:
            self._run_speed /= 2.0
        # Speeds up simulation
        elif key == glfw.KEY_F:
            self._run_speed *= 2.0
        # Turn off / turn on rendering every frame.
        elif key == glfw.KEY_D:
            self._render_every_frame = not self._render_every_frame
        # Capture screenshot
        elif key == glfw.KEY_T:
            img = np.zeros(
                (
                    glfw.get_framebuffer_size(self.window)[1],
                    glfw.get_framebuffer_size(self.window)[0],
                    3,
                ),
                dtype=np.uint8,
            )
            mujoco.mjr_readPixels(img, None, self.viewport, self.con)
            # mjr_readPixels returns rows bottom-up; flip for image files.
            imageio.imwrite(self._image_path % self._image_idx, np.flipud(img))
            self._image_idx += 1
        # Display contact forces
        elif key == glfw.KEY_C:
            self._contacts = not self._contacts
            self.vopt.flags[mujoco.mjtVisFlag.mjVIS_CONTACTPOINT] = self._contacts
            self.vopt.flags[mujoco.mjtVisFlag.mjVIS_CONTACTFORCE] = self._contacts
        # Display coordinate frames
        elif key == glfw.KEY_E:
            self.vopt.frame = 1 - self.vopt.frame
        # Hide overlay menu
        elif key == glfw.KEY_H:
            self._hide_menu = not self._hide_menu
        # Make transparent
        elif key == glfw.KEY_R:
            self._transparent = not self._transparent
            if self._transparent:
                self.model.geom_rgba[:, 3] /= 5.0
            else:
                self.model.geom_rgba[:, 3] *= 5.0
        # Geom group visibility
        elif key in (glfw.KEY_0, glfw.KEY_1, glfw.KEY_2, glfw.KEY_3, glfw.KEY_4):
            self.vopt.geomgroup[key - glfw.KEY_0] ^= 1
        # Quit
        if key == glfw.KEY_ESCAPE:
            print("Pressed ESC")
            print("Quitting.")
            glfw.terminate()
            sys.exit(0)

    def _cursor_pos_callback(self, window, xpos, ypos):
        """Rotate (left drag) or translate (right drag) the free camera."""
        if not (self._button_left_pressed or self._button_right_pressed):
            return

        mod_shift = (
            glfw.get_key(window, glfw.KEY_LEFT_SHIFT) == glfw.PRESS
            or glfw.get_key(window, glfw.KEY_RIGHT_SHIFT) == glfw.PRESS
        )
        if self._button_right_pressed:
            action = (
                mujoco.mjtMouse.mjMOUSE_MOVE_H
                if mod_shift
                else mujoco.mjtMouse.mjMOUSE_MOVE_V
            )
        elif self._button_left_pressed:
            action = (
                mujoco.mjtMouse.mjMOUSE_ROTATE_H
                if mod_shift
                else mujoco.mjtMouse.mjMOUSE_ROTATE_V
            )
        else:
            # Unreachable given the early return above; kept as a safe default.
            action = mujoco.mjtMouse.mjMOUSE_ZOOM

        dx = int(self._scale * xpos) - self._last_mouse_x
        dy = int(self._scale * ypos) - self._last_mouse_y
        width, height = glfw.get_framebuffer_size(window)

        with self._gui_lock:
            mujoco.mjv_moveCamera(
                self.model, action, dx / height, dy / height, self.scn, self.cam
            )

        self._last_mouse_x = int(self._scale * xpos)
        self._last_mouse_y = int(self._scale * ypos)

    def _mouse_button_callback(self, window, button, act, mods):
        """Track button state and the drag anchor position."""
        self._button_left_pressed = (
            glfw.get_mouse_button(window, glfw.MOUSE_BUTTON_LEFT) == glfw.PRESS
        )
        self._button_right_pressed = (
            glfw.get_mouse_button(window, glfw.MOUSE_BUTTON_RIGHT) == glfw.PRESS
        )

        x, y = glfw.get_cursor_pos(window)
        self._last_mouse_x = int(self._scale * x)
        self._last_mouse_y = int(self._scale * y)

    def _scroll_callback(self, window, x_offset, y_offset):
        """Zoom the camera with the scroll wheel."""
        with self._gui_lock:
            mujoco.mjv_moveCamera(
                self.model,
                mujoco.mjtMouse.mjMOUSE_ZOOM,
                0,
                -0.05 * y_offset,
                self.scn,
                self.cam,
            )

    def _create_overlay(self):
        """Fill the help/status overlay entries for this frame."""
        topleft = mujoco.mjtGridPos.mjGRID_TOPLEFT
        bottomleft = mujoco.mjtGridPos.mjGRID_BOTTOMLEFT

        if self._render_every_frame:
            self.add_overlay(topleft, "", "")
        else:
            self.add_overlay(
                topleft,
                "Run speed = %.3f x real time" % self._run_speed,
                "[S]lower, [F]aster",
            )
        self.add_overlay(
            topleft, "Ren[d]er every frame", "On" if self._render_every_frame else "Off"
        )
        self.add_overlay(
            topleft,
            "Switch camera (#cams = %d)" % (self.model.ncam + 1),
            "[Tab] (camera ID = %d)" % self.cam.fixedcamid,
        )
        self.add_overlay(topleft, "[C]ontact forces", "On" if self._contacts else "Off")
        self.add_overlay(topleft, "T[r]ansparent", "On" if self._transparent else "Off")
        if self._paused is not None:
            if not self._paused:
                self.add_overlay(topleft, "Stop", "[Space]")
            else:
                self.add_overlay(topleft, "Start", "[Space]")
                self.add_overlay(
                    topleft, "Advance simulation by one step", "[right arrow]"
                )
        self.add_overlay(
            topleft, "Referenc[e] frames", "On" if self.vopt.frame == 1 else "Off"
        )
        self.add_overlay(topleft, "[H]ide Menu", "")
        if self._image_idx > 0:
            fname = self._image_path % (self._image_idx - 1)
            self.add_overlay(topleft, "Cap[t]ure frame", "Saved as %s" % fname)
        else:
            self.add_overlay(topleft, "Cap[t]ure frame", "")
        self.add_overlay(topleft, "Toggle geomgroup visibility", "0-4")

        self.add_overlay(bottomleft, "FPS", "%d%s" % (1 / self._time_per_render, ""))
        self.add_overlay(
            bottomleft, "Solver iterations", str(self.data.solver_iter + 1)
        )
        self.add_overlay(
            bottomleft, "Step", str(round(self.data.time / self.model.opt.timestep))
        )
        self.add_overlay(bottomleft, "timestep", "%.5f" % self.model.opt.timestep)

    def render(self):
        """Render frames, honoring pause, single-step, and run-speed state."""

        # mjv_updateScene, mjr_render, mjr_overlay
        def update():
            # fill overlay items
            self._create_overlay()

            render_start = time.time()
            if self.window is None:
                return
            elif glfw.window_should_close(self.window):
                glfw.terminate()
                sys.exit(0)
            self.viewport.width, self.viewport.height = glfw.get_framebuffer_size(
                self.window
            )
            with self._gui_lock:
                # update scene
                mujoco.mjv_updateScene(
                    self.model,
                    self.data,
                    self.vopt,
                    mujoco.MjvPerturb(),
                    self.cam,
                    mujoco.mjtCatBit.mjCAT_ALL.value,
                    self.scn,
                )
                # marker items
                for marker in self._markers:
                    self._add_marker_to_scene(marker)
                # render
                mujoco.mjr_render(self.viewport, self.scn, self.con)
                # overlay items
                if not self._hide_menu:
                    for gridpos, [t1, t2] in self._overlays.items():
                        mujoco.mjr_overlay(
                            mujoco.mjtFontScale.mjFONTSCALE_150,
                            gridpos,
                            self.viewport,
                            t1,
                            t2,
                            self.con,
                        )
                glfw.swap_buffers(self.window)
            glfw.poll_events()
            # Exponential moving average of the frame time, used for the FPS
            # overlay and to pace the simulation against real time.
            self._time_per_render = 0.9 * self._time_per_render + 0.1 * (
                time.time() - render_start
            )

            # clear overlay
            self._overlays.clear()

        if self._paused:
            while self._paused:
                update()
                if self._advance_by_one_step:
                    self._advance_by_one_step = False
                    break
        else:
            # Run enough update() iterations to keep simulated time in step
            # with wall-clock time at the requested run speed.
            self._loop_count += self.model.opt.timestep / (
                self._time_per_render * self._run_speed
            )
            if self._render_every_frame:
                self._loop_count = 1
            while self._loop_count > 0:
                update()
                self._loop_count -= 1

        # clear markers
        self._markers[:] = []

    def close(self):
        # NOTE(review): exits the whole process, not just the viewer window.
        glfw.terminate()
        sys.exit(0)

View File

@@ -5,131 +5,11 @@ from gym.envs.mujoco import mujoco_env
class PusherEnv(mujoco_env.MujocoEnv, utils.EzPickle):
"""
### Description
"Pusher" is a multi-jointed robot arm which is very similar to that of a human.
The goal is to move a target cylinder (called *object*) to a goal position using the robot's end effector (called *fingertip*).
The robot consists of shoulder, elbow, forearm, and wrist joints.
### Action Space
The action space is a `Box(-2, 2, (7,), float32)`. An action `(a, b)` represents the torques applied at the hinge joints.
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|-----|--------------------------------------------------------------------|-------------|-------------|----------------------------------|-------|--------------|
| 0 | Rotation of the panning the shoulder | -2 | 2 | r_shoulder_pan_joint | hinge | torque (N m) |
| 1 | Rotation of the shoulder lifting joint | -2 | 2 | r_shoulder_lift_joint | hinge | torque (N m) |
| 2 | Rotation of the shoulder rolling joint | -2 | 2 | r_upper_arm_roll_joint | hinge | torque (N m) |
| 3 | Rotation of hinge joint that flexed the elbow | -2 | 2 | r_elbow_flex_joint | hinge | torque (N m) |
| 4 | Rotation of hinge that rolls the forearm | -2 | 2 | r_forearm_roll_joint | hinge | torque (N m) |
| 5 | Rotation of flexing the wrist | -2 | 2 | r_wrist_flex_joint | hinge | torque (N m) |
| 6 | Rotation of rolling the wrist | -2 | 2 | r_wrist_roll_joint | hinge | torque (N m) |
### Observation Space
Observations consist of
- Angle of rotational joints on the pusher
- Angular velocities of rotational joints on the pusher
- The coordinates of the fingertip of the pusher
- The coordinates of the object to be moved
- The coordinates of the goal position
The observation is a `ndarray` with shape `(23,)` where the elements correspond to the table below.
An analogy can be drawn to a human arm in order to help understand the state space, with the words flex and roll meaning the
same as human joints.
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
| 0 | Rotation of the panning the shoulder | -Inf | Inf | r_shoulder_pan_joint | hinge | angle (rad) |
| 1 | Rotation of the shoulder lifting joint | -Inf | Inf | r_shoulder_lift_joint | hinge | angle (rad) |
| 2 | Rotation of the shoulder rolling joint | -Inf | Inf | r_upper_arm_roll_joint | hinge | angle (rad) |
| 3 | Rotation of hinge joint that flexed the elbow | -Inf | Inf | r_elbow_flex_joint | hinge | angle (rad) |
| 4 | Rotation of hinge that rolls the forearm | -Inf | Inf | r_forearm_roll_joint | hinge | angle (rad) |
| 5 | Rotation of flexing the wrist | -Inf | Inf | r_wrist_flex_joint | hinge | angle (rad) |
| 6 | Rotation of rolling the wrist | -Inf | Inf | r_wrist_roll_joint | hinge | angle (rad) |
| 7 | Rotational velocity of the panning the shoulder | -Inf | Inf | r_shoulder_pan_joint | hinge | angular velocity (rad/s) |
| 8 | Rotational velocity of the shoulder lifting joint | -Inf | Inf | r_shoulder_lift_joint | hinge | angular velocity (rad/s) |
| 9 | Rotational velocity of the shoulder rolling joint | -Inf | Inf | r_upper_arm_roll_joint | hinge | angular velocity (rad/s) |
| 10 | Rotational velocity of hinge joint that flexed the elbow | -Inf | Inf | r_elbow_flex_joint | hinge | angular velocity (rad/s) |
| 11 | Rotational velocity of hinge that rolls the forearm | -Inf | Inf | r_forearm_roll_joint | hinge | angular velocity (rad/s) |
| 12 | Rotational velocity of flexing the wrist | -Inf | Inf | r_wrist_flex_joint | hinge | angular velocity (rad/s) |
| 13 | Rotational velocity of rolling the wrist | -Inf | Inf | r_wrist_roll_joint | hinge | angular velocity (rad/s) |
| 14 | x-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) |
| 15 | y-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) |
| 16 | z-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) |
| 17 | x-coordinate of the object to be moved | -Inf | Inf | object (obj_slidex) | slide | position (m) |
| 18 | y-coordinate of the object to be moved | -Inf | Inf | object (obj_slidey) | slide | position (m) |
| 19 | z-coordinate of the object to be moved | -Inf | Inf | object | cylinder | position (m) |
| 20 | x-coordinate of the goal position of the object | -Inf | Inf | goal (goal_slidex) | slide | position (m) |
| 21 | y-coordinate of the goal position of the object | -Inf | Inf | goal (goal_slidey) | slide | position (m) |
| 22 | z-coordinate of the goal position of the object | -Inf | Inf | goal | sphere | position (m) |
### Rewards
The reward consists of two parts:
- *reward_near *: This reward is a measure of how far the *fingertip*
of the pusher (the unattached end) is from the object, with a more negative
value assigned for when the pusher's *fingertip* is further away from the
target. It is calculated as the negative vector norm of (position of
the fingertip - position of target), or *-norm("fingertip" - "target")*.
- *reward_dist *: This reward is a measure of how far the object is from
the target goal position, with a more negative value assigned for object is
further away from the target. It is calculated as the negative vector norm of
(position of the object - position of goal), or *-norm("object" - "target")*.
- *reward_control*: A negative reward for penalising the pusher if
it takes actions that are too large. It is measured as the negative squared
Euclidean norm of the action, i.e. as *- sum(action<sup>2</sup>)*.
The total reward returned is ***reward*** *=* *reward_dist + 0.1 * reward_ctrl + 0.5 * reward_near*
Unlike other environments, Pusher does not allow you to specify weights for the individual reward terms.
However, `info` does contain the keys *reward_dist* and *reward_ctrl*. Thus, if you'd like to weight the terms,
you should create a wrapper that computes the weighted reward from `info`.
### Starting State
All pusher (not including object and goal) states start in
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0). A uniform noise in the range
[-0.005, 0.005] is added to the velocity attributes only. The velocities of
the object and goal are permanently set to 0. The object's x-position is selected uniformly
between [-0.3, 0] while the y-position is selected uniformly between [-0.2, 0.2], and this
process is repeated until the vector norm between the object's (x,y) position and origin is not greater
than 0.17. The goal always has the same position of (0.45, -0.05, -0.323).
The default framerate is 5 with each frame lasting for 0.01, giving rise to a *dt = 5 * 0.01 = 0.05*
### Episode Termination
The episode terminates when any of the following happens:
1. The episode duration reaches a 100 timesteps.
2. Any of the state space values is no longer finite.
### Arguments
No additional arguments are currently supported (in v2 and lower),
but modifications can be made to the XML file in the assets folder
(or by changing the path to a modified XML file in another folder)..
```
env = gym.make('Pusher-v2')
```
There is no v3 for Pusher, unlike the robot environments where a v3 and
beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
### Version History
* v2: All continuous control environments now use mujoco_py >= 1.50
* v1: max_time_steps raised to 1000 for robot based tasks (not including reacher, which has a max_time_steps of 50). Added reward_threshold to environments.
* v0: Initial versions release (1.0.0)
"""
def __init__(self):
utils.EzPickle.__init__(self)
mujoco_env.MujocoEnv.__init__(self, "pusher.xml", 5)
mujoco_env.MujocoEnv.__init__(
self, "pusher.xml", 5, mujoco_bindings="mujoco_py"
)
def step(self, a):
vec_1 = self.get_body_com("object") - self.get_body_com("tips_arm")

View File

@@ -0,0 +1,185 @@
import numpy as np
from gym import utils
from gym.envs.mujoco import mujoco_env
class PusherEnv(mujoco_env.MujocoEnv, utils.EzPickle):
    """
    ### Description
    "Pusher" is a multi-jointed robot arm which is very similar to that of a human.
    The goal is to move a target cylinder (called *object*) to a goal position using the robot's end effector (called *fingertip*).
    The robot consists of shoulder, elbow, forearm, and wrist joints.
    ### Action Space
    The action space is a `Box(-2, 2, (7,), float32)`. An action `(a, b)` represents the torques applied at the hinge joints.
    | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
    |-----|--------------------------------------------------------------------|-------------|-------------|----------------------------------|-------|--------------|
    | 0 | Rotation of the panning the shoulder | -2 | 2 | r_shoulder_pan_joint | hinge | torque (N m) |
    | 1 | Rotation of the shoulder lifting joint | -2 | 2 | r_shoulder_lift_joint | hinge | torque (N m) |
    | 2 | Rotation of the shoulder rolling joint | -2 | 2 | r_upper_arm_roll_joint | hinge | torque (N m) |
    | 3 | Rotation of hinge joint that flexed the elbow | -2 | 2 | r_elbow_flex_joint | hinge | torque (N m) |
    | 4 | Rotation of hinge that rolls the forearm | -2 | 2 | r_forearm_roll_joint | hinge | torque (N m) |
    | 5 | Rotation of flexing the wrist | -2 | 2 | r_wrist_flex_joint | hinge | torque (N m) |
    | 6 | Rotation of rolling the wrist | -2 | 2 | r_wrist_roll_joint | hinge | torque (N m) |
    ### Observation Space
    Observations consist of
    - Angle of rotational joints on the pusher
    - Angular velocities of rotational joints on the pusher
    - The coordinates of the fingertip of the pusher
    - The coordinates of the object to be moved
    - The coordinates of the goal position
    The observation is a `ndarray` with shape `(23,)` where the elements correspond to the table below.
    An analogy can be drawn to a human arm in order to help understand the state space, with the words flex and roll meaning the
    same as human joints.
    | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
    |-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
    | 0 | Rotation of the panning the shoulder | -Inf | Inf | r_shoulder_pan_joint | hinge | angle (rad) |
    | 1 | Rotation of the shoulder lifting joint | -Inf | Inf | r_shoulder_lift_joint | hinge | angle (rad) |
    | 2 | Rotation of the shoulder rolling joint | -Inf | Inf | r_upper_arm_roll_joint | hinge | angle (rad) |
    | 3 | Rotation of hinge joint that flexed the elbow | -Inf | Inf | r_elbow_flex_joint | hinge | angle (rad) |
    | 4 | Rotation of hinge that rolls the forearm | -Inf | Inf | r_forearm_roll_joint | hinge | angle (rad) |
    | 5 | Rotation of flexing the wrist | -Inf | Inf | r_wrist_flex_joint | hinge | angle (rad) |
    | 6 | Rotation of rolling the wrist | -Inf | Inf | r_wrist_roll_joint | hinge | angle (rad) |
    | 7 | Rotational velocity of the panning the shoulder | -Inf | Inf | r_shoulder_pan_joint | hinge | angular velocity (rad/s) |
    | 8 | Rotational velocity of the shoulder lifting joint | -Inf | Inf | r_shoulder_lift_joint | hinge | angular velocity (rad/s) |
    | 9 | Rotational velocity of the shoulder rolling joint | -Inf | Inf | r_upper_arm_roll_joint | hinge | angular velocity (rad/s) |
    | 10 | Rotational velocity of hinge joint that flexed the elbow | -Inf | Inf | r_elbow_flex_joint | hinge | angular velocity (rad/s) |
    | 11 | Rotational velocity of hinge that rolls the forearm | -Inf | Inf | r_forearm_roll_joint | hinge | angular velocity (rad/s) |
    | 12 | Rotational velocity of flexing the wrist | -Inf | Inf | r_wrist_flex_joint | hinge | angular velocity (rad/s) |
    | 13 | Rotational velocity of rolling the wrist | -Inf | Inf | r_wrist_roll_joint | hinge | angular velocity (rad/s) |
    | 14 | x-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) |
    | 15 | y-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) |
    | 16 | z-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) |
    | 17 | x-coordinate of the object to be moved | -Inf | Inf | object (obj_slidex) | slide | position (m) |
    | 18 | y-coordinate of the object to be moved | -Inf | Inf | object (obj_slidey) | slide | position (m) |
    | 19 | z-coordinate of the object to be moved | -Inf | Inf | object | cylinder | position (m) |
    | 20 | x-coordinate of the goal position of the object | -Inf | Inf | goal (goal_slidex) | slide | position (m) |
    | 21 | y-coordinate of the goal position of the object | -Inf | Inf | goal (goal_slidey) | slide | position (m) |
    | 22 | z-coordinate of the goal position of the object | -Inf | Inf | goal | sphere | position (m) |
    ### Rewards
    The reward consists of three parts:
    - *reward_near *: This reward is a measure of how far the *fingertip*
    of the pusher (the unattached end) is from the object, with a more negative
    value assigned for when the pusher's *fingertip* is further away from the
    target. It is calculated as the negative vector norm of (position of
    the fingertip - position of target), or *-norm("fingertip" - "target")*.
    - *reward_dist *: This reward is a measure of how far the object is from
    the target goal position, with a more negative value assigned for object is
    further away from the target. It is calculated as the negative vector norm of
    (position of the object - position of goal), or *-norm("object" - "target")*.
    - *reward_control*: A negative reward for penalising the pusher if
    it takes actions that are too large. It is measured as the negative squared
    Euclidean norm of the action, i.e. as *- sum(action<sup>2</sup>)*.
    The total reward returned is ***reward*** *=* *reward_dist + 0.1 * reward_ctrl + 0.5 * reward_near*
    Unlike other environments, Pusher does not allow you to specify weights for the individual reward terms.
    However, `info` does contain the keys *reward_dist* and *reward_ctrl*. Thus, if you'd like to weight the terms,
    you should create a wrapper that computes the weighted reward from `info`.
    ### Starting State
    All pusher (not including object and goal) states start in
    (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0). A uniform noise in the range
    [-0.005, 0.005] is added to the velocity attributes only. The velocities of
    the object and goal are permanently set to 0. The object's x-position is selected uniformly
    between [-0.3, 0] while the y-position is selected uniformly between [-0.2, 0.2], and this
    process is repeated until the vector norm between the object's (x,y) position and origin is not greater
    than 0.17. The goal always has the same position of (0.45, -0.05, -0.323).
    The default framerate is 5 with each frame lasting for 0.01, giving rise to a *dt = 5 * 0.01 = 0.05*
    ### Episode Termination
    The episode terminates when any of the following happens:
    1. The episode duration reaches 100 timesteps.
    2. Any of the state space values is no longer finite.
    ### Arguments
    No additional arguments are currently supported (in v2 and lower),
    but modifications can be made to the XML file in the assets folder
    (or by changing the path to a modified XML file in another folder)..
    ```
    env = gym.make('Pusher-v2')
    ```
    There is no v3 for Pusher, unlike the robot environments where a v3 and
    beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
    ### Version History
    * v4: all mujoco environments now use the mujoco bindings in mujoco>=2.1.3
    * v2: All continuous control environments now use mujoco_py >= 1.50
    * v1: max_time_steps raised to 1000 for robot based tasks (not including reacher, which has a max_time_steps of 50). Added reward_threshold to environments.
    * v0: Initial versions release (1.0.0)
    """

    def __init__(self):
        utils.EzPickle.__init__(self)
        mujoco_env.MujocoEnv.__init__(self, "pusher.xml", 5)

    def step(self, a):
        # NOTE(review): rewards are computed from the state *before*
        # do_simulation advances the physics (preserved from the original).
        vec_1 = self.get_body_com("object") - self.get_body_com("tips_arm")
        vec_2 = self.get_body_com("object") - self.get_body_com("goal")

        reward_near = -np.linalg.norm(vec_1)
        reward_dist = -np.linalg.norm(vec_2)
        reward_ctrl = -np.square(a).sum()
        reward = reward_dist + 0.1 * reward_ctrl + 0.5 * reward_near

        self.do_simulation(a, self.frame_skip)
        ob = self._get_obs()
        done = False
        return ob, reward, done, dict(reward_dist=reward_dist, reward_ctrl=reward_ctrl)

    def viewer_setup(self):
        self.viewer.cam.trackbodyid = -1
        self.viewer.cam.distance = 4.0

    def reset_model(self):
        # Copy so that in-place writes below do not mutate self.init_qpos
        # (the original aliased it and clobbered its last four entries).
        qpos = self.init_qpos.copy()

        self.goal_pos = np.asarray([0, 0])
        # Rejection-sample the cylinder position until it is far enough
        # from the goal.
        while True:
            self.cylinder_pos = np.concatenate(
                [
                    self.np_random.uniform(low=-0.3, high=0, size=1),
                    self.np_random.uniform(low=-0.2, high=0.2, size=1),
                ]
            )
            if np.linalg.norm(self.cylinder_pos - self.goal_pos) > 0.17:
                break

        qpos[-4:-2] = self.cylinder_pos
        qpos[-2:] = self.goal_pos
        qvel = self.init_qvel + self.np_random.uniform(
            low=-0.005, high=0.005, size=self.model.nv
        )
        # Object and goal joints start at rest.
        qvel[-4:] = 0
        self.set_state(qpos, qvel)
        return self._get_obs()

    def _get_obs(self):
        # Joint angles, joint velocities, then fingertip/object/goal positions.
        return np.concatenate(
            [
                self.data.qpos.flat[:7],
                self.data.qvel.flat[:7],
                self.get_body_com("tips_arm"),
                self.get_body_com("object"),
                self.get_body_com("goal"),
            ]
        )

View File

@@ -5,121 +5,11 @@ from gym.envs.mujoco import mujoco_env
class ReacherEnv(mujoco_env.MujocoEnv, utils.EzPickle):
"""
### Description
"Reacher" is a two-jointed robot arm. The goal is to move the robot's end effector (called *fingertip*) close to a
target that is spawned at a random position.
### Action Space
The action space is a `Box(-1, 1, (2,), float32)`. An action `(a, b)` represents the torques applied at the hinge joints.
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|-----|---------------------------------------------------------------------------------|-------------|-------------|--------------------------|-------|------|
| 0 | Torque applied at the first hinge (connecting the link to the point of fixture) | -1 | 1 | joint0 | hinge | torque (N m) |
| 1 | Torque applied at the second hinge (connecting the two links) | -1 | 1 | joint1 | hinge | torque (N m) |
### Observation Space
Observations consist of
- The cosine of the angles of the two arms
- The sine of the angles of the two arms
- The coordinates of the target
- The angular velocities of the arms
- The vector between the target and the reacher's fingertip (3 dimensional with the last element being 0)
The observation is a `ndarray` with shape `(11,)` where the elements correspond to the following:
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
| 0 | cosine of the angle of the first arm | -Inf | Inf | cos(joint0) | hinge | unitless |
| 1 | cosine of the angle of the second arm | -Inf | Inf | cos(joint1) | hinge | unitless |
| 2 | sine of the angle of the first arm | -Inf | Inf | cos(joint0) | hinge | unitless |
| 3 | sine of the angle of the second arm | -Inf | Inf | cos(joint1) | hinge | unitless |
| 4 | x-coordinate of the target | -Inf | Inf | target_x | slide | position (m) |
| 5 | y-coordinate of the target | -Inf | Inf | target_y | slide | position (m) |
| 6 | angular velocity of the first arm | -Inf | Inf | joint0 | hinge | angular velocity (rad/s) |
| 7 | angular velocity of the second arm | -Inf | Inf | joint1 | hinge | angular velocity (rad/s) |
| 8 | x-value of position_fingertip - position_target | -Inf | Inf | NA | slide | position (m) |
| 9 | y-value of position_fingertip - position_target | -Inf | Inf | NA | slide | position (m) |
| 10 | z-value of position_fingertip - position_target (0 since reacher is 2d and z is same for both) | -Inf | Inf | NA | slide | position (m) |
Most Gym environments just return the positions and velocity of the
joints in the `.xml` file as the state of the environment. However, in
reacher the state is created by combining only certain elements of the
position and velocity, and performing some function transformations on them.
If one is to read the `.xml` for reacher then they will find 4 joints:
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|-----|-----------------------------|----------|----------|----------------------------------|-------|--------------------|
| 0 | angle of the first arm | -Inf | Inf | joint0 | hinge | angle (rad) |
| 1 | angle of the second arm | -Inf | Inf | joint1 | hinge | angle (rad) |
| 2 | x-coordinate of the target | -Inf | Inf | target_x | slide | position (m) |
| 3 | y-coordinate of the target | -Inf | Inf | target_y | slide | position (m) |
### Rewards
The reward consists of two parts:
- *reward_distance*: This reward is a measure of how far the *fingertip*
of the reacher (the unattached end) is from the target, with a more negative
value assigned for when the reacher's *fingertip* is further away from the
target. It is calculated as the negative vector norm of (position of
the fingertip - position of target), or *-norm("fingertip" - "target")*.
- *reward_control*: A negative reward for penalising the walker if
it takes actions that are too large. It is measured as the negative squared
Euclidean norm of the action, i.e. as *- sum(action<sup>2</sup>)*.
The total reward returned is ***reward*** *=* *reward_distance + reward_control*
Unlike other environments, Reacher does not allow you to specify weights for the individual reward terms.
However, `info` does contain the keys *reward_dist* and *reward_ctrl*. Thus, if you'd like to weight the terms,
you should create a wrapper that computes the weighted reward from `info`.
### Starting State
All observations start in state
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
with a noise added for stochasticity. A uniform noise in the range
[-0.1, 0.1] is added to the positional attributes, while the target position
is selected uniformly at random in a disk of radius 0.2 around the origin.
Independent, uniform noise in the
range of [-0.005, 0.005] is added to the velocities, and the last
element ("fingertip" - "target") is calculated at the end once everything
is set. The default setting has a framerate of 2 and a *dt = 2 * 0.01 = 0.02*
### Episode Termination
The episode terminates when any of the following happens:
1. The episode duration reaches a 50 timesteps (with a new random target popping up if the reacher's fingertip reaches it before 50 timesteps)
2. Any of the state space values is no longer finite.
### Arguments
No additional arguments are currently supported (in v2 and lower),
but modifications can be made to the XML file in the assets folder
(or by changing the path to a modified XML file in another folder).
```
env = gym.make('Reacher-v2')
```
There is no v3 for Reacher, unlike the robot environments where a v3 and
beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
### Version History
* v2: All continuous control environments now use mujoco_py >= 1.50
* v1: max_time_steps raised to 1000 for robot based tasks (not including reacher, which has a max_time_steps of 50). Added reward_threshold to environments.
* v0: Initial versions release (1.0.0)
"""
def __init__(self):
utils.EzPickle.__init__(self)
mujoco_env.MujocoEnv.__init__(self, "reacher.xml", 2)
mujoco_env.MujocoEnv.__init__(
self, "reacher.xml", 2, mujoco_bindings="mujoco_py"
)
def step(self, a):
vec = self.get_body_com("fingertip") - self.get_body_com("target")

View File

@@ -0,0 +1,165 @@
import numpy as np
from gym import utils
from gym.envs.mujoco import mujoco_env
class ReacherEnv(mujoco_env.MujocoEnv, utils.EzPickle):
    """
    ### Description
    "Reacher" is a two-jointed robot arm. The goal is to move the robot's end effector (called *fingertip*) close to a
    target that is spawned at a random position.
    ### Action Space
    The action space is a `Box(-1, 1, (2,), float32)`. An action `(a, b)` represents the torques applied at the hinge joints.
    | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
    |-----|---------------------------------------------------------------------------------|-------------|-------------|--------------------------|-------|------|
    | 0 | Torque applied at the first hinge (connecting the link to the point of fixture) | -1 | 1 | joint0 | hinge | torque (N m) |
    | 1 | Torque applied at the second hinge (connecting the two links) | -1 | 1 | joint1 | hinge | torque (N m) |
    ### Observation Space
    Observations consist of
    - The cosine of the angles of the two arms
    - The sine of the angles of the two arms
    - The coordinates of the target
    - The angular velocities of the arms
    - The vector between the target and the reacher's fingertip (3 dimensional with the last element being 0)
    The observation is a `ndarray` with shape `(11,)` where the elements correspond to the following:
    | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
    |-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
    | 0 | cosine of the angle of the first arm | -Inf | Inf | cos(joint0) | hinge | unitless |
    | 1 | cosine of the angle of the second arm | -Inf | Inf | cos(joint1) | hinge | unitless |
    | 2 | sine of the angle of the first arm | -Inf | Inf | sin(joint0) | hinge | unitless |
    | 3 | sine of the angle of the second arm | -Inf | Inf | sin(joint1) | hinge | unitless |
    | 4 | x-coordinate of the target | -Inf | Inf | target_x | slide | position (m) |
    | 5 | y-coordinate of the target | -Inf | Inf | target_y | slide | position (m) |
    | 6 | angular velocity of the first arm | -Inf | Inf | joint0 | hinge | angular velocity (rad/s) |
    | 7 | angular velocity of the second arm | -Inf | Inf | joint1 | hinge | angular velocity (rad/s) |
    | 8 | x-value of position_fingertip - position_target | -Inf | Inf | NA | slide | position (m) |
    | 9 | y-value of position_fingertip - position_target | -Inf | Inf | NA | slide | position (m) |
    | 10 | z-value of position_fingertip - position_target (0 since reacher is 2d and z is same for both) | -Inf | Inf | NA | slide | position (m) |
    Most Gym environments just return the positions and velocity of the
    joints in the `.xml` file as the state of the environment. However, in
    reacher the state is created by combining only certain elements of the
    position and velocity, and performing some function transformations on them.
    If one is to read the `.xml` for reacher then they will find 4 joints:
    | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
    |-----|-----------------------------|----------|----------|----------------------------------|-------|--------------------|
    | 0 | angle of the first arm | -Inf | Inf | joint0 | hinge | angle (rad) |
    | 1 | angle of the second arm | -Inf | Inf | joint1 | hinge | angle (rad) |
    | 2 | x-coordinate of the target | -Inf | Inf | target_x | slide | position (m) |
    | 3 | y-coordinate of the target | -Inf | Inf | target_y | slide | position (m) |
    ### Rewards
    The reward consists of two parts:
    - *reward_distance*: This reward is a measure of how far the *fingertip*
    of the reacher (the unattached end) is from the target, with a more negative
    value assigned for when the reacher's *fingertip* is further away from the
    target. It is calculated as the negative vector norm of (position of
    the fingertip - position of target), or *-norm("fingertip" - "target")*.
    - *reward_control*: A negative reward for penalising the walker if
    it takes actions that are too large. It is measured as the negative squared
    Euclidean norm of the action, i.e. as *- sum(action<sup>2</sup>)*.
    The total reward returned is ***reward*** *=* *reward_distance + reward_control*
    Unlike other environments, Reacher does not allow you to specify weights for the individual reward terms.
    However, `info` does contain the keys *reward_dist* and *reward_ctrl*. Thus, if you'd like to weight the terms,
    you should create a wrapper that computes the weighted reward from `info`.
    ### Starting State
    All observations start in state
    (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
    with a noise added for stochasticity. A uniform noise in the range
    [-0.1, 0.1] is added to the positional attributes, while the target position
    is selected uniformly at random in a disk of radius 0.2 around the origin.
    Independent, uniform noise in the
    range of [-0.005, 0.005] is added to the velocities, and the last
    element ("fingertip" - "target") is calculated at the end once everything
    is set. The default setting has a framerate of 2 and a *dt = 2 * 0.01 = 0.02*
    ### Episode Termination
    The episode terminates when any of the following happens:
    1. The episode duration reaches 50 timesteps (with a new random target popping up if the reacher's fingertip reaches it before 50 timesteps)
    2. Any of the state space values is no longer finite.
    ### Arguments
    No additional arguments are currently supported (in v2 and lower),
    but modifications can be made to the XML file in the assets folder
    (or by changing the path to a modified XML file in another folder).
    ```
    env = gym.make('Reacher-v2')
    ```
    There is no v3 for Reacher, unlike the robot environments where a v3 and
    beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
    ### Version History
    * v4: all mujoco environments now use the mujoco bindings in mujoco>=2.1.3
    * v2: All continuous control environments now use mujoco_py >= 1.50
    * v1: max_time_steps raised to 1000 for robot based tasks (not including reacher, which has a max_time_steps of 50). Added reward_threshold to environments.
    * v0: Initial versions release (1.0.0)
    """

    def __init__(self):
        # EzPickle records the constructor arguments so the env can be pickled.
        utils.EzPickle.__init__(self)
        # frame_skip=2 -> dt = 2 * 0.01 = 0.02 seconds per step().
        mujoco_env.MujocoEnv.__init__(self, "reacher.xml", 2)

    def step(self, a):
        """Apply torques `a`, advance the simulation and return (ob, reward, done, info)."""
        vec = self.get_body_com("fingertip") - self.get_body_com("target")
        reward_dist = -np.linalg.norm(vec)  # negative fingertip-to-target distance
        reward_ctrl = -np.square(a).sum()  # quadratic penalty on actuation
        reward = reward_dist + reward_ctrl
        # NOTE(review): the reward is computed from the state *before* the
        # simulation step below, so it lags the returned observation by one
        # step. This matches the historical gym implementation.
        self.do_simulation(a, self.frame_skip)
        ob = self._get_obs()
        done = False  # termination comes from the TimeLimit wrapper (50 steps)
        return ob, reward, done, dict(reward_dist=reward_dist, reward_ctrl=reward_ctrl)

    def viewer_setup(self):
        # Keep the camera fixed on body 0 (the arm's base).
        self.viewer.cam.trackbodyid = 0

    def reset_model(self):
        """Randomize the arm pose and goal position; return the first observation."""
        # Uniform noise in [-0.1, 0.1] on every joint angle. The addition
        # creates a new array, so init_qpos itself is not modified.
        qpos = (
            self.np_random.uniform(low=-0.1, high=0.1, size=self.model.nq)
            + self.init_qpos
        )
        # Rejection-sample the goal inside a disk of radius 0.2 around the origin.
        while True:
            self.goal = self.np_random.uniform(low=-0.2, high=0.2, size=2)
            if np.linalg.norm(self.goal) < 0.2:
                break
        # The last two qpos entries are the goal's (target_x, target_y) slide joints.
        qpos[-2:] = self.goal
        qvel = self.init_qvel + self.np_random.uniform(
            low=-0.005, high=0.005, size=self.model.nv
        )
        qvel[-2:] = 0  # the goal is stationary
        self.set_state(qpos, qvel)
        return self._get_obs()

    def _get_obs(self):
        """Build the 11-dimensional observation described in the class docstring."""
        theta = self.data.qpos.flat[:2]  # the two arm joint angles
        return np.concatenate(
            [
                np.cos(theta),
                np.sin(theta),
                self.data.qpos.flat[2:],  # target x/y position
                self.data.qvel.flat[:2],  # arm angular velocities
                self.get_body_com("fingertip") - self.get_body_com("target"),
            ]
        )

View File

@@ -6,7 +6,9 @@ from gym.envs.mujoco import mujoco_env
class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle):
def __init__(self):
mujoco_env.MujocoEnv.__init__(self, "swimmer.xml", 4)
mujoco_env.MujocoEnv.__init__(
self, "swimmer.xml", 4, mujoco_bindings="mujoco_py"
)
utils.EzPickle.__init__(self)
def step(self, a):

View File

@@ -9,119 +9,6 @@ DEFAULT_CAMERA_CONFIG = {}
class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle):
"""
### Description
This environment corresponds to the Swimmer environment described in Rémi Coulom's PhD thesis
["Reinforcement Learning Using Neural Networks, with Applications to Motor Control"](https://tel.archives-ouvertes.fr/tel-00003985/document).
The environment aims to increase the number of independent state and control
variables as compared to the classic control environments. The swimmers
consist of three or more segments ('***links***') and one less articulation
joints ('***rotors***') - one rotor joint connecting exactly two links to
form a linear chain. The swimmer is suspended in a two dimensional pool and
always starts in the same position (subject to some deviation drawn from an
uniform distribution), and the goal is to move as fast as possible towards
the right by applying torque on the rotors and using the fluids friction.
### Notes
The problem parameters are:
Problem parameters:
* *n*: number of body parts
* *m<sub>i</sub>*: mass of part *i* (*i* ∈ {1...n})
* *l<sub>i</sub>*: length of part *i* (*i* ∈ {1...n})
* *k*: viscous-friction coefficient
While the default environment has *n* = 3, *l<sub>i</sub>* = 0.1,
and *k* = 0.1. It is possible to pass a custom MuJoCo XML file during construction to increase the
number of links, or to tweak any of the parameters.
### Action Space
The action space is a `Box(-1, 1, (2,), float32)`. An action represents the torques applied between *links*
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|-----|------------------------------------|-------------|-------------|----------------------------------|-------|--------------|
| 0 | Torque applied on the first rotor | -1 | 1 | rot2 | hinge | torque (N m) |
| 1 | Torque applied on the second rotor | -1 | 1 | rot3 | hinge | torque (N m) |
### Observation Space
By default, observations consists of:
* θ<sub>i</sub>: angle of part *i* with respect to the *x* axis
* θ<sub>i</sub>': its derivative with respect to time (angular velocity)
In the default case, observations do not include the x- and y-coordinates of the front tip. These may
be included by passing `exclude_current_positions_from_observation=False` during construction.
Then, the observation space will have 10 dimensions where the first two dimensions
represent the x- and y-coordinates of the front tip.
Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates
will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively.
By default, the observation is a `ndarray` with shape `(8,)` where the elements correspond to the following:
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
|-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
| 0 | angle of the front tip | -Inf | Inf | rot | hinge | angle (rad) |
| 1 | angle of the second rotor | -Inf | Inf | rot2 | hinge | angle (rad) |
| 2 | angle of the second rotor | -Inf | Inf | rot3 | hinge | angle (rad) |
| 3 | velocity of the tip along the x-axis | -Inf | Inf | slider1 | slide | velocity (m/s) |
| 4 | velocity of the tip along the y-axis | -Inf | Inf | slider2 | slide | velocity (m/s) |
| 5 | angular velocity of front tip | -Inf | Inf | rot | hinge | angular velocity (rad/s) |
| 6 | angular velocity of second rotor | -Inf | Inf | rot2 | hinge | angular velocity (rad/s) |
| 7 | angular velocity of third rotor | -Inf | Inf | rot3 | hinge | angular velocity (rad/s) |
### Rewards
The reward consists of two parts:
- *forward_reward*: A reward of moving forward which is measured
as *`forward_reward_weight` * (x-coordinate before action - x-coordinate after action)/dt*. *dt* is
the time between actions and is dependent on the frame_skip parameter
(default is 4), where the frametime is 0.01 - making the
default *dt = 4 * 0.01 = 0.04*. This reward would be positive if the swimmer
swims right as desired.
- *ctrl_cost*: A cost for penalising the swimmer if it takes
actions that are too large. It is measured as *`ctrl_cost_weight` *
sum(action<sup>2</sup>)* where *`ctrl_cost_weight`* is a parameter set for the
control and has a default value of 1e-4
The total reward returned is ***reward*** *=* *forward_reward - ctrl_cost* and `info` will also contain the individual reward terms
### Starting State
All observations start in state (0,0,0,0,0,0,0,0) with a Uniform noise in the range of [-`reset_noise_scale`, `reset_noise_scale`] is added to the initial state for stochasticity.
### Episode Termination
The episode terminates when the episode length is greater than 1000.
### Arguments
No additional arguments are currently supported in v2 and lower.
```
gym.make('Swimmer-v2')
```
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
```
env = gym.make('Swimmer-v3', ctrl_cost_weight=0.1, ....)
```
| Parameter | Type | Default |Description |
|-------------------------|------------|--------------|-------------------------------|
| `xml_file` | **str** | `"swimmer.xml"`| Path to a MuJoCo model |
| `forward_reward_weight` | **float** | `1.0` | Weight for *forward_reward* term (see section on reward) |
| `ctrl_cost_weight` | **float** | `1e-4` | Weight for *ctrl_cost* term (see section on reward) |
| `reset_noise_scale` | **float** | `0.1` | Scale of random perturbations of initial position and velocity (see section on Starting State) |
| `exclude_current_positions_from_observation`| **bool** | `True`| Whether or not to omit the x- and y-coordinates from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies |
### Version History
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
* v2: All continuous control environments now use mujoco_py >= 1.50
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
* v0: Initial versions release (1.0.0)
"""
def __init__(
self,
xml_file="swimmer.xml",
@@ -141,7 +28,7 @@ class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle):
exclude_current_positions_from_observation
)
mujoco_env.MujocoEnv.__init__(self, xml_file, 4)
mujoco_env.MujocoEnv.__init__(self, xml_file, 4, mujoco_bindings="mujoco_py")
def control_cost(self, action):
control_cost = self._ctrl_cost_weight * np.sum(np.square(action))

View File

@@ -0,0 +1,217 @@
# Original author credit for this environment port.
__credits__ = ["Rushiv Arora"]
import numpy as np
from gym import utils
from gym.envs.mujoco import mujoco_env
# Camera attributes applied in viewer_setup(); empty means the mujoco
# viewer's default camera settings are kept unchanged.
DEFAULT_CAMERA_CONFIG = {}
class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle):
    """
    ### Description
    This environment corresponds to the Swimmer environment described in Rémi Coulom's PhD thesis
    ["Reinforcement Learning Using Neural Networks, with Applications to Motor Control"](https://tel.archives-ouvertes.fr/tel-00003985/document).
    The environment aims to increase the number of independent state and control
    variables as compared to the classic control environments. The swimmers
    consist of three or more segments ('***links***') and one less articulation
    joints ('***rotors***') - one rotor joint connecting exactly two links to
    form a linear chain. The swimmer is suspended in a two dimensional pool and
    always starts in the same position (subject to some deviation drawn from an
    uniform distribution), and the goal is to move as fast as possible towards
    the right by applying torque on the rotors and using the fluids friction.
    ### Notes
    The problem parameters are:
    Problem parameters:
    * *n*: number of body parts
    * *m<sub>i</sub>*: mass of part *i* (*i* ∈ {1...n})
    * *l<sub>i</sub>*: length of part *i* (*i* ∈ {1...n})
    * *k*: viscous-friction coefficient
    While the default environment has *n* = 3, *l<sub>i</sub>* = 0.1,
    and *k* = 0.1. It is possible to tweak the MuJoCo XML files to increase the
    number of links, or to tweak any of the parameters.
    ### Action Space
    The agent take a 2-element vector for actions.
    The action space is a continuous `(action, action)` in `[-1, 1]`, where
    `action` represents the numerical torques applied between *links*
    | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
    |-----|------------------------------------|-------------|-------------|----------------------------------|-------|--------------|
    | 0 | Torque applied on the first rotor | -1 | 1 | rot2 | hinge | torque (N m) |
    | 1 | Torque applied on the second rotor | -1 | 1 | rot3 | hinge | torque (N m) |
    ### Observation Space
    The state space consists of:
    * A<sub>0</sub>: position of first point
    * θ<sub>i</sub>: angle of part *i* with respect to the *x* axis
    * A<sub>0</sub>, θ<sub>i</sub>: their derivatives with respect to time (velocity and angular velocity)
    The observation is a `ndarray` with shape `(10,)` where the elements correspond to the following:
    | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
    |-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
    | 0 | x-coordinate of the front tip | -Inf | Inf | slider1 | slide | position (m) |
    | 1 | y-coordinate of the front tip | -Inf | Inf | slider2 | slide | position (m) |
    | 2 | angle of the front tip | -Inf | Inf | rot | hinge | angle (rad) |
    | 3 | angle of the second rotor | -Inf | Inf | rot2 | hinge | angle (rad) |
    | 4 | angle of the third rotor | -Inf | Inf | rot3 | hinge | angle (rad) |
    | 5 | velocity of the tip along the x-axis | -Inf | Inf | slider1 | slide | velocity (m/s) |
    | 6 | velocity of the tip along the y-axis | -Inf | Inf | slider2 | slide | velocity (m/s) |
    | 7 | angular velocity of front tip | -Inf | Inf | rot | hinge | angular velocity (rad/s) |
    | 8 | angular velocity of second rotor | -Inf | Inf | rot2 | hinge | angular velocity (rad/s) |
    | 9 | angular velocity of third rotor | -Inf | Inf | rot3 | hinge | angular velocity (rad/s) |
    **Note:**
    In practice (and Gym implementation), the first two positional elements are
    omitted from the state space since the reward function is calculated based
    on those values. Therefore, observation space has shape `(8,)` and looks like:
    | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
    |-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
    | 0 | angle of the front tip | -Inf | Inf | rot | hinge | angle (rad) |
    | 1 | angle of the second rotor | -Inf | Inf | rot2 | hinge | angle (rad) |
    | 2 | angle of the third rotor | -Inf | Inf | rot3 | hinge | angle (rad) |
    | 3 | velocity of the tip along the x-axis | -Inf | Inf | slider1 | slide | velocity (m/s) |
    | 4 | velocity of the tip along the y-axis | -Inf | Inf | slider2 | slide | velocity (m/s) |
    | 5 | angular velocity of front tip | -Inf | Inf | rot | hinge | angular velocity (rad/s) |
    | 6 | angular velocity of second rotor | -Inf | Inf | rot2 | hinge | angular velocity (rad/s) |
    | 7 | angular velocity of third rotor | -Inf | Inf | rot3 | hinge | angular velocity (rad/s) |
    ### Rewards
    The reward consists of two parts:
    - *reward_front*: A reward of moving forward which is measured
    as *(x-coordinate before action - x-coordinate after action)/dt*. *dt* is
    the time between actions and is dependent on the frame_skip parameter
    (default is 4), where the *dt* for one frame is 0.01 - making the
    default *dt = 4 * 0.01 = 0.04*. This reward would be positive if the swimmer
    swims right as desired.
    - *reward_control*: A negative reward for penalising the swimmer if it takes
    actions that are too large. It is measured as *-coefficient x
    sum(action<sup>2</sup>)* where *coefficient* is a parameter set for the
    control and has a default value of 0.0001
    The total reward returned is ***reward*** *=* *reward_front + reward_control*
    ### Starting State
    All observations start in state (0,0,0,0,0,0,0,0) with a Uniform noise in the range of [-0.1, 0.1] is added to the initial state for stochasticity.
    ### Episode Termination
    The episode terminates when the episode length is greater than 1000.
    ### Arguments
    No additional arguments are currently supported (in v2 and lower), but
    modifications can be made to the XML file in the assets folder
    (or by changing the path to a modified XML file in another folder).
    ```
    gym.make('Swimmer-v2')
    ```
    v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
    ```
    env = gym.make('Swimmer-v3', ctrl_cost_weight=0.1, ....)
    ```
    ### Version History
    * v4: all mujoco environments now use the mujoco bindings in mujoco>=2.1.3
    * v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
    * v2: All continuous control environments now use mujoco_py >= 1.50
    * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
    * v0: Initial versions release (1.0.0)
    """

    def __init__(
        self,
        xml_file="swimmer.xml",
        forward_reward_weight=1.0,
        ctrl_cost_weight=1e-4,
        reset_noise_scale=0.1,
        exclude_current_positions_from_observation=True,
    ):
        # EzPickle captures every constructor argument (locals() includes
        # `self` and all kwargs) so the env can be pickled/unpickled.
        utils.EzPickle.__init__(**locals())
        self._forward_reward_weight = forward_reward_weight
        self._ctrl_cost_weight = ctrl_cost_weight
        self._reset_noise_scale = reset_noise_scale
        self._exclude_current_positions_from_observation = (
            exclude_current_positions_from_observation
        )
        # frame_skip=4 -> dt = 4 * 0.01 = 0.04 seconds per step().
        mujoco_env.MujocoEnv.__init__(self, xml_file, 4)

    def control_cost(self, action):
        # Quadratic actuation penalty, scaled by ctrl_cost_weight.
        control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
        return control_cost

    def step(self, action):
        """Apply `action`, advance the simulation and return (ob, reward, done, info)."""
        xy_position_before = self.data.qpos[0:2].copy()
        self.do_simulation(action, self.frame_skip)
        xy_position_after = self.data.qpos[0:2].copy()
        # Finite-difference velocity of the front tip over this step.
        xy_velocity = (xy_position_after - xy_position_before) / self.dt
        x_velocity, y_velocity = xy_velocity
        forward_reward = self._forward_reward_weight * x_velocity
        ctrl_cost = self.control_cost(action)
        observation = self._get_obs()
        reward = forward_reward - ctrl_cost
        done = False  # termination comes from the TimeLimit wrapper
        info = {
            "reward_fwd": forward_reward,
            "reward_ctrl": -ctrl_cost,
            "x_position": xy_position_after[0],
            "y_position": xy_position_after[1],
            "distance_from_origin": np.linalg.norm(xy_position_after, ord=2),
            "x_velocity": x_velocity,
            "y_velocity": y_velocity,
            "forward_reward": forward_reward,
        }
        return observation, reward, done, info

    def _get_obs(self):
        """Concatenate qpos and qvel, optionally dropping the tip's x/y position."""
        position = self.data.qpos.flat.copy()
        velocity = self.data.qvel.flat.copy()
        if self._exclude_current_positions_from_observation:
            position = position[2:]  # drop the x/y coordinates of the front tip
        observation = np.concatenate([position, velocity]).ravel()
        return observation

    def reset_model(self):
        """Reset to the initial pose plus uniform noise; return the first observation."""
        noise_low = -self._reset_noise_scale
        noise_high = self._reset_noise_scale
        # Addition creates new arrays; init_qpos/init_qvel are not modified.
        qpos = self.init_qpos + self.np_random.uniform(
            low=noise_low, high=noise_high, size=self.model.nq
        )
        qvel = self.init_qvel + self.np_random.uniform(
            low=noise_low, high=noise_high, size=self.model.nv
        )
        self.set_state(qpos, qvel)
        observation = self._get_obs()
        return observation

    def viewer_setup(self):
        # Apply each configured camera attribute; array-valued settings are
        # copied element-wise into the existing mujoco camera arrays.
        for key, value in DEFAULT_CAMERA_CONFIG.items():
            if isinstance(value, np.ndarray):
                getattr(self.viewer.cam, key)[:] = value
            else:
                setattr(self.viewer.cam, key, value)

View File

@@ -6,7 +6,9 @@ from gym.envs.mujoco import mujoco_env
class Walker2dEnv(mujoco_env.MujocoEnv, utils.EzPickle):
def __init__(self):
mujoco_env.MujocoEnv.__init__(self, "walker2d.xml", 4)
mujoco_env.MujocoEnv.__init__(
self, "walker2d.xml", 4, mujoco_bindings="mujoco_py"
)
utils.EzPickle.__init__(self)
def step(self, a):

View File

@@ -12,137 +12,6 @@ DEFAULT_CAMERA_CONFIG = {
class Walker2dEnv(mujoco_env.MujocoEnv, utils.EzPickle):
"""
### Description
This environment builds on the hopper environment based on the work done by Erez, Tassa, and Todorov
in ["Infinite Horizon Model Predictive Control for Nonlinear Periodic Tasks"](http://www.roboticsproceedings.org/rss07/p10.pdf)
by adding another set of legs making it possible for the robot to walk forward instead of
hop. Like other Mujoco environments, this environment aims to increase the number of independent state
and control variables as compared to the classic control environments. The walker is a
two-dimensional two-legged figure that consist of four main body parts - a single torso at the top
(with the two legs splitting after the torso), two thighs in the middle below the torso, two legs
in the bottom below the thighs, and two feet attached to the legs on which the entire body rests.
The goal is to coordinate both sets of feet, legs, and thighs to move in the forward (right)
direction by applying torques on the six hinges connecting the six body parts.
### Action Space
The action space is a `Box(-1, 1, (6,), float32)`. An action represents the torques applied at the hinge joints.
| Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
|-----|----------------------------------------|-------------|-------------|----------------------------------|-------|--------------|
| 0 | Torque applied on the thigh rotor | -1 | 1 | thigh_joint | hinge | torque (N m) |
| 1 | Torque applied on the leg rotor | -1 | 1 | leg_joint | hinge | torque (N m) |
| 2 | Torque applied on the foot rotor | -1 | 1 | foot_joint | hinge | torque (N m) |
| 3 | Torque applied on the left thigh rotor | -1 | 1 | thigh_left_joint | hinge | torque (N m) |
| 4 | Torque applied on the left leg rotor | -1 | 1 | leg_left_joint | hinge | torque (N m) |
| 5 | Torque applied on the left foot rotor | -1 | 1 | foot_left_joint | hinge | torque (N m) |
### Observation Space
Observations consist of positional values of different body parts of the walker,
followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities.
By default, observations do not include the x-coordinate of the top. It may
be included by passing `exclude_current_positions_from_observation=False` during construction.
In that case, the observation space will have 18 dimensions where the first dimension
represent the x-coordinates of the top of the walker.
Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x-coordinate
of the top will be returned in `info` with key `"x_position"`.
By default, observation is a `ndarray` with shape `(17,)` where the elements correspond to the following:
| Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
|-----|--------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
| 0 | z-coordinate of the top (height of hopper) | -Inf | Inf | rootz (torso) | slide | position (m) |
| 1 | angle of the top | -Inf | Inf | rooty (torso) | hinge | angle (rad) |
| 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) |
| 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) |
| 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) |
| 5 | angle of the left thigh joint | -Inf | Inf | thigh_left_joint | hinge | angle (rad) |
| 6 | angle of the left leg joint | -Inf | Inf | leg_left_joint | hinge | angle (rad) |
| 7 | angle of the left foot joint | -Inf | Inf | foot_left_joint | hinge | angle (rad) |
| 8 | velocity of the x-coordinate of the top | -Inf | Inf | rootx | slide | velocity (m/s) |
| 9 | velocity of the z-coordinate (height) of the top | -Inf | Inf | rootz | slide | velocity (m/s) |
| 10 | angular velocity of the angle of the top | -Inf | Inf | rooty | hinge | angular velocity (rad/s) |
| 11 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) |
| 12 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) |
| 13 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) |
| 14 | angular velocity of the thigh hinge | -Inf | Inf | thigh_left_joint | hinge | angular velocity (rad/s) |
| 15 | angular velocity of the leg hinge | -Inf | Inf | leg_left_joint | hinge | angular velocity (rad/s) |
| 16 | angular velocity of the foot hinge | -Inf | Inf | foot_left_joint | hinge | angular velocity (rad/s) |
### Rewards
The reward consists of three parts:
- *healthy_reward*: Every timestep that the walker is alive, it receives a fixed reward of value `healthy_reward`,
- *forward_reward*: A reward of walking forward which is measured as
*`forward_reward_weight` * (x-coordinate before action - x-coordinate after action)/dt*.
*dt* is the time between actions and is dependent on the frame_skip parameter
(default is 4), where the frametime is 0.002 - making the default
*dt = 4 * 0.002 = 0.008*. This reward would be positive if the walker walks forward (right) as desired.
- *ctrl_cost*: A cost for penalising the walker if it
takes actions that are too large. It is measured as
*`ctrl_cost_weight` * sum(action<sup>2</sup>)* where *`ctrl_cost_weight`* is
a parameter set for the control and has a default value of 0.001
The total reward returned is ***reward*** *=* *healthy_reward bonus + forward_reward - ctrl_cost* and `info` will also contain the individual reward terms
### Starting State
All observations start in state
(0.0, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
with a uniform noise in the range of [-`reset_noise_scale`, `reset_noise_scale`] added to the values for stochasticity.
### Episode Termination
The walker is said to be unhealthy if any of the following happens:
1. Any of the state space values is no longer finite
2. The height of the walker is ***not*** in the closed interval specified by `healthy_z_range`
3. The absolute value of the angle (`observation[1]` if `exclude_current_positions_from_observation=False`, else `observation[2]`) is ***not*** in the closed interval specified by `healthy_angle_range`
If `terminate_when_unhealthy=True` is passed during construction (which is the default),
the episode terminates when any of the following happens:
1. The episode duration reaches a 1000 timesteps
2. The walker is unhealthy
If `terminate_when_unhealthy=False` is passed, the episode is terminated only when 1000 timesteps are exceeded.
### Arguments
No additional arguments are currently supported in v2 and lower.
```
env = gym.make('Walker2d-v2')
```
v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
```
env = gym.make('Walker2d-v3', ctrl_cost_weight=0.1, ....)
```
| Parameter | Type | Default |Description |
|-------------------------|------------|--------------|-------------------------------|
| `xml_file` | **str** | `"walker2d.xml"` | Path to a MuJoCo model |
| `forward_reward_weight` | **float** | `1.0` | Weight for *forward_reward* term (see section on reward) |
| `ctrl_cost_weight` | **float** | `1e-3` | Weight for *ctr_cost* term (see section on reward) |
| `healthy_reward` | **float** | `1.0` | Constant reward given if the ant is "healthy" after timestep |
| `terminate_when_unhealthy` | **bool**| `True` | If true, issue a done signal if the z-coordinate of the walker is no longer healthy |
| `healthy_z_range` | **tuple** | `(0.8, 2)` | The z-coordinate of the top of the walker must be in this range to be considered healthy |
| `healthy_angle_range` | **tuple** | `(-1, 1)` | The angle must be in this range to be considered healthy|
| `reset_noise_scale` | **float** | `5e-3` | Scale of random perturbations of initial position and velocity (see section on Starting State) |
| `exclude_current_positions_from_observation`| **bool** | `True`| Whether or not to omit the x-coordinate from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies |
### Version History
* v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
* v2: All continuous control environments now use mujoco_py >= 1.50
* v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
* v0: Initial versions release (1.0.0)
"""
def __init__(
self,
xml_file="walker2d.xml",
@@ -172,7 +41,7 @@ class Walker2dEnv(mujoco_env.MujocoEnv, utils.EzPickle):
exclude_current_positions_from_observation
)
mujoco_env.MujocoEnv.__init__(self, xml_file, 4)
mujoco_env.MujocoEnv.__init__(self, xml_file, 4, mujoco_bindings="mujoco_py")
@property
def healthy_reward(self):

View File

@@ -0,0 +1,276 @@
import numpy as np
from gym import utils
from gym.envs.mujoco import mujoco_env
# Default free-camera parameters consumed by `viewer_setup`: track body id 2
# (the torso) from 4 m away, looking at a point 1.15 m above the ground.
# NOTE: the original file assigned this identical dict twice in a row; the
# redundant duplicate definition has been removed.
DEFAULT_CAMERA_CONFIG = {
    "trackbodyid": 2,
    "distance": 4.0,
    "lookat": np.array((0.0, 0.0, 1.15)),
    "elevation": -20.0,
}
class Walker2dEnv(mujoco_env.MujocoEnv, utils.EzPickle):
    """
    ### Description
    This environment builds on the hopper environment based on the work done by Erez, Tassa, and Todorov
    in ["Infinite Horizon Model Predictive Control for Nonlinear Periodic Tasks"](http://www.roboticsproceedings.org/rss07/p10.pdf)
    by adding another set of legs making it possible for the robot to walk forward instead of
    hop. Like other Mujoco environments, this environment aims to increase the number of independent state
    and control variables as compared to the classic control environments. The walker is a
    two-dimensional two-legged figure that consist of four main body parts - a single torso at the top
    (with the two legs splitting after the torso), two thighs in the middle below the torso, two legs
    in the bottom below the thighs, and two feet attached to the legs on which the entire body rests.
    The goal is to coordinate both sets of feet, legs, and thighs to move in the forward (right)
    direction by applying torques on the six hinges connecting the six body parts.
    ### Action Space
    The agent take a 6-element vector for actions.
    The action space is a continuous `(action, action, action, action, action, action)`
    all in `[-1, 1]`, where `action` represents the numerical torques applied at the hinge joints.
    | Num | Action                                 | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit         |
    |-----|----------------------------------------|-------------|-------------|----------------------------------|-------|--------------|
    | 0   | Torque applied on the thigh rotor      | -1          | 1           | thigh_joint                      | hinge | torque (N m) |
    | 1   | Torque applied on the leg rotor        | -1          | 1           | leg_joint                        | hinge | torque (N m) |
    | 2   | Torque applied on the foot rotor       | -1          | 1           | foot_joint                       | hinge | torque (N m) |
    | 3   | Torque applied on the left thigh rotor | -1          | 1           | thigh_left_joint                 | hinge | torque (N m) |
    | 4   | Torque applied on the left leg rotor   | -1          | 1           | leg_left_joint                   | hinge | torque (N m) |
    | 5   | Torque applied on the left foot rotor  | -1          | 1           | foot_left_joint                  | hinge | torque (N m) |
    ### Observation Space
    The state space consists of positional values of different body parts of the walker,
    followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities.
    The observation is a `ndarray` with shape `(17,)` where the elements correspond to the following:
    | Num | Observation                                            | Min  | Max | Name (in corresponding XML file) | Joint | Unit |
    |-----|--------------------------------------------------------|------|-----|----------------------------------|-------|------|
    | 0   | x-coordinate of the top                                | -Inf | Inf | rootx (torso)                    | slide | position (m) |
    | 1   | z-coordinate of the top (height of hopper)             | -Inf | Inf | rootz (torso)                    | slide | position (m) |
    | 2   | angle of the top                                       | -Inf | Inf | rooty (torso)                    | hinge | angle (rad) |
    | 3   | angle of the thigh joint                               | -Inf | Inf | thigh_joint                      | hinge | angle (rad) |
    | 4   | angle of the leg joint                                 | -Inf | Inf | leg_joint                        | hinge | angle (rad) |
    | 5   | angle of the foot joint                                | -Inf | Inf | foot_joint                       | hinge | angle (rad) |
    | 6   | angle of the left thigh joint                          | -Inf | Inf | thigh_left_joint                 | hinge | angle (rad) |
    | 7   | angle of the left leg joint                            | -Inf | Inf | leg_left_joint                   | hinge | angle (rad) |
    | 8   | angle of the left foot joint                           | -Inf | Inf | foot_left_joint                  | hinge | angle (rad) |
    | 9   | velocity of the x-coordinate of the top                | -Inf | Inf | rootx                            | slide | velocity (m/s) |
    | 10  | velocity of the z-coordinate (height) of the top       | -Inf | Inf | rootz                            | slide | velocity (m/s) |
    | 11  | angular velocity of the angle of the top               | -Inf | Inf | rooty                            | hinge | angular velocity (rad/s) |
    | 12  | angular velocity of the thigh hinge                    | -Inf | Inf | thigh_joint                      | hinge | angular velocity (rad/s) |
    | 13  | angular velocity of the leg hinge                      | -Inf | Inf | leg_joint                        | hinge | angular velocity (rad/s) |
    | 14  | angular velocity of the foot hinge                     | -Inf | Inf | foot_joint                       | hinge | angular velocity (rad/s) |
    | 15  | angular velocity of the thigh hinge                    | -Inf | Inf | thigh_left_joint                 | hinge | angular velocity (rad/s) |
    | 16  | angular velocity of the leg hinge                      | -Inf | Inf | leg_left_joint                   | hinge | angular velocity (rad/s) |
    | 17  | angular velocity of the foot hinge                     | -Inf | Inf | foot_left_joint                  | hinge | angular velocity (rad/s) |
    **Note:**
    In practice (and Gym implementation), the first positional element is omitted from the
    state space since the reward function is calculated based on that value. This value is
    hidden from the algorithm, which in turn has to develop an abstract understanding of it
    from the observed rewards. Therefore, observation space has shape `(17,)`
    instead of `(18,)` and looks like:
    | Num | Observation                                            | Min  | Max | Name (in corresponding XML file) | Joint | Unit |
    |-----|--------------------------------------------------------|------|-----|----------------------------------|-------|------|
    | 0   | z-coordinate of the top (height of hopper)             | -Inf | Inf | rootz (torso)                    | slide | position (m) |
    | 1   | angle of the top                                       | -Inf | Inf | rooty (torso)                    | hinge | angle (rad) |
    | 2   | angle of the thigh joint                               | -Inf | Inf | thigh_joint                      | hinge | angle (rad) |
    | 3   | angle of the leg joint                                 | -Inf | Inf | leg_joint                        | hinge | angle (rad) |
    | 4   | angle of the foot joint                                | -Inf | Inf | foot_joint                       | hinge | angle (rad) |
    | 5   | angle of the left thigh joint                          | -Inf | Inf | thigh_left_joint                 | hinge | angle (rad) |
    | 6   | angle of the left leg joint                            | -Inf | Inf | leg_left_joint                   | hinge | angle (rad) |
    | 7   | angle of the left foot joint                           | -Inf | Inf | foot_left_joint                  | hinge | angle (rad) |
    | 8   | velocity of the x-coordinate of the top                | -Inf | Inf | rootx                            | slide | velocity (m/s) |
    | 9   | velocity of the z-coordinate (height) of the top       | -Inf | Inf | rootz                            | slide | velocity (m/s) |
    | 10  | angular velocity of the angle of the top               | -Inf | Inf | rooty                            | hinge | angular velocity (rad/s) |
    | 11  | angular velocity of the thigh hinge                    | -Inf | Inf | thigh_joint                      | hinge | angular velocity (rad/s) |
    | 12  | angular velocity of the leg hinge                      | -Inf | Inf | leg_joint                        | hinge | angular velocity (rad/s) |
    | 13  | angular velocity of the foot hinge                     | -Inf | Inf | foot_joint                       | hinge | angular velocity (rad/s) |
    | 14  | angular velocity of the thigh hinge                    | -Inf | Inf | thigh_left_joint                 | hinge | angular velocity (rad/s) |
    | 15  | angular velocity of the leg hinge                      | -Inf | Inf | leg_left_joint                   | hinge | angular velocity (rad/s) |
    | 16  | angular velocity of the foot hinge                     | -Inf | Inf | foot_left_joint                  | hinge | angular velocity (rad/s) |
    ### Rewards
    The reward consists of three parts:
    - *alive bonus*: Every timestep that the walker is alive, it gets a reward of 1,
    - *reward_forward*: A reward of walking forward which is measured as
    *(x-coordinate before action - x-coordinate after action)/dt*.
    *dt* is the time between actions and is dependent on the frame_skip parameter
    (default is 4), where the *dt* for one frame is 0.002 - making the default
    *dt = 4 * 0.002 = 0.008*. This reward would be positive if the walker walks forward (right) as desired.
    - *reward_control*: A negative reward for penalising the walker if it
    takes actions that are too large. It is measured as
    *-coefficient **x** sum(action<sup>2</sup>)* where *coefficient* is
    a parameter set for the control and has a default value of 0.001
    The total reward returned is ***reward*** *=* *alive bonus + reward_forward + reward_control*
    ### Starting State
    All observations start in state
    (0.0, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
    with a uniform noise in the range of [-0.005, 0.005] added to the values for stochasticity.
    ### Episode Termination
    The episode terminates when any of the following happens:
    1. The episode duration reaches a 1000 timesteps
    2. Any of the state space values is no longer finite
    3. The height of the walker (index 1) is ***not*** in the range `[0.8, 2]`
    4. The absolute value of the angle (index 2) is ***not*** in the range `[-1, 1]`
    ### Arguments
    No additional arguments are currently supported (in v2 and lower),
    but modifications can be made to the XML file in the assets folder
    (or by changing the path to a modified XML file in another folder).
    ```
    env = gym.make('Walker2d-v2')
    ```
    v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
    ```
    env = gym.make('Walker2d-v3', ctrl_cost_weight=0.1, ....)
    ```
    ### Version History
    * v4: all mujoco environments now use the mujoco bindings in mujoco>=2.1.3
    * v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
    * v2: All continuous control environments now use mujoco_py >= 1.50
    * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
    * v0: Initial versions release (1.0.0)
    """

    def __init__(
        self,
        xml_file="walker2d.xml",
        forward_reward_weight=1.0,
        ctrl_cost_weight=1e-3,
        healthy_reward=1.0,
        terminate_when_unhealthy=True,
        healthy_z_range=(0.8, 2.0),
        healthy_angle_range=(-1.0, 1.0),
        reset_noise_scale=5e-3,
        exclude_current_positions_from_observation=True,
    ):
        # EzPickle captures the constructor arguments so the env can be
        # pickled/unpickled; `locals()` here contains exactly those kwargs.
        utils.EzPickle.__init__(**locals())

        self._forward_reward_weight = forward_reward_weight
        self._ctrl_cost_weight = ctrl_cost_weight
        self._healthy_reward = healthy_reward
        self._terminate_when_unhealthy = terminate_when_unhealthy
        self._healthy_z_range = healthy_z_range
        self._healthy_angle_range = healthy_angle_range
        self._reset_noise_scale = reset_noise_scale
        self._exclude_current_positions_from_observation = (
            exclude_current_positions_from_observation
        )

        # frame_skip=4: each `step` advances the simulation 4 frames.
        mujoco_env.MujocoEnv.__init__(self, xml_file, 4)

    @property
    def healthy_reward(self):
        """Alive bonus: paid while healthy, or unconditionally when
        `terminate_when_unhealthy` is False (the `or` keeps the bonus on)."""
        return (
            float(self.is_healthy or self._terminate_when_unhealthy)
            * self._healthy_reward
        )

    def control_cost(self, action):
        """Quadratic action penalty: ctrl_cost_weight * sum(action**2)."""
        control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
        return control_cost

    @property
    def is_healthy(self):
        """True while torso height and torso angle are strictly inside
        `healthy_z_range` and `healthy_angle_range` respectively."""
        z, angle = self.data.qpos[1:3]

        min_z, max_z = self._healthy_z_range
        min_angle, max_angle = self._healthy_angle_range

        healthy_z = min_z < z < max_z
        healthy_angle = min_angle < angle < max_angle
        is_healthy = healthy_z and healthy_angle

        return is_healthy

    @property
    def done(self):
        """Terminate on unhealthiness only when configured to do so."""
        done = not self.is_healthy if self._terminate_when_unhealthy else False
        return done

    def _get_obs(self):
        """Observation: qpos (minus x when excluded) followed by qvel,
        with velocities clipped to [-10, 10]."""
        position = self.data.qpos.flat.copy()
        velocity = np.clip(self.data.qvel.flat.copy(), -10, 10)

        if self._exclude_current_positions_from_observation:
            position = position[1:]

        observation = np.concatenate((position, velocity)).ravel()
        return observation

    def step(self, action):
        """Advance the simulation and return (obs, reward, done, info).

        reward = forward_reward + healthy_reward - ctrl_cost, where the
        forward reward is the weighted x-velocity over this step.
        """
        x_position_before = self.data.qpos[0]
        self.do_simulation(action, self.frame_skip)
        x_position_after = self.data.qpos[0]
        x_velocity = (x_position_after - x_position_before) / self.dt

        ctrl_cost = self.control_cost(action)

        forward_reward = self._forward_reward_weight * x_velocity
        healthy_reward = self.healthy_reward

        rewards = forward_reward + healthy_reward
        costs = ctrl_cost

        observation = self._get_obs()
        reward = rewards - costs
        done = self.done
        info = {
            "x_position": x_position_after,
            "x_velocity": x_velocity,
        }

        return observation, reward, done, info

    def reset_model(self):
        """Reset to the initial pose plus uniform noise in
        [-reset_noise_scale, reset_noise_scale] on every coordinate."""
        noise_low = -self._reset_noise_scale
        noise_high = self._reset_noise_scale

        qpos = self.init_qpos + self.np_random.uniform(
            low=noise_low, high=noise_high, size=self.model.nq
        )
        qvel = self.init_qvel + self.np_random.uniform(
            low=noise_low, high=noise_high, size=self.model.nv
        )

        self.set_state(qpos, qvel)

        observation = self._get_obs()
        return observation

    def viewer_setup(self):
        """Apply DEFAULT_CAMERA_CONFIG to the viewer camera; arrays are
        written in place, scalars are assigned."""
        for key, value in DEFAULT_CAMERA_CONFIG.items():
            if isinstance(value, np.ndarray):
                getattr(self.viewer.cam, key)[:] = value
            else:
                setattr(self.viewer.cam, key, value)

View File

@@ -6,15 +6,14 @@ RUN apt-get -y update && apt-get install -y unzip libglu1-mesa-dev libgl1-mesa-d
# Download mujoco
RUN mkdir /root/.mujoco && \
cd /root/.mujoco && \
curl -O https://www.roboti.us/download/mjpro150_linux.zip && \
unzip mjpro150_linux.zip && \
echo DUMMY_KEY > /root/.mujoco/mjkey.txt
wget https://github.com/deepmind/mujoco/releases/download/2.1.0/mujoco210-linux-x86_64.tar.gz &&\
tar -xf mujoco210-linux-x86_64.tar.gz
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/root/.mujoco/mjpro150/bin
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/root/.mujoco/mujoco210/bin
COPY . /usr/local/gym/
WORKDIR /usr/local/gym/
RUN pip install .[nomujoco] && pip install -r test_requirements.txt
RUN pip install .[noatari] && pip install -r test_requirements.txt
ENTRYPOINT ["/usr/local/gym/bin/docker_entrypoint"]

View File

@@ -1,7 +1,6 @@
ale-py~=0.7
opencv-python>=3.0
box2d-py==2.3.5
mujoco_py>=1.50, <2.0
scipy>=1.4.1
numpy>=1.18.0
pyglet>=1.4.0

View File

@@ -15,19 +15,25 @@ extras = {
"accept-rom-license": ["autorom[accept-rom-license]~=0.4.2"],
"box2d": ["box2d-py==2.3.5", "pygame==2.1.0"],
"classic_control": ["pygame==2.1.0"],
"mujoco": ["mujoco_py>=1.50, <2.0"],
"mujoco_py": ["mujoco_py<2.2,>=2.1"],
"mujoco": ["mujoco==2.1.5", "imageio>=2.14.1"],
"toy_text": ["pygame==2.1.0", "scipy>=1.4.1"],
"other": ["lz4>=3.1.0", "opencv-python>=3.0", "matplotlib>=3.0"],
}
# Meta dependency groups.
nomujoco_blacklist = {"mujoco", "accept-rom-license", "atari"}
nomujoco_blacklist = {"mujoco_py", "mujoco", "accept-rom-license", "atari"}
nomujoco_groups = set(extras.keys()) - nomujoco_blacklist
extras["nomujoco"] = list(
itertools.chain.from_iterable(map(lambda group: extras[group], nomujoco_groups))
)
noatari_blacklist = {"accept-rom-license", "atari"}
noatari_groups = set(extras.keys()) - noatari_blacklist
extras["noatari"] = list(
itertools.chain.from_iterable(map(lambda group: extras[group], noatari_groups))
)
all_blacklist = {"accept-rom-license"}
all_groups = set(extras.keys()) - all_blacklist

View File

@@ -1,19 +1,14 @@
import os
from gym import envs, logger
SKIP_MUJOCO_WARNING_MESSAGE = (
"Cannot run mujoco test (either license key not found or mujoco not"
"installed properly)."
SKIP_MUJOCO_V3_WARNING_MESSAGE = (
"Cannot run mujoco test because mujoco-py is not installed"
)
skip_mujoco = not (os.environ.get("MUJOCO_KEY"))
if not skip_mujoco:
skip_mujoco_v3 = False
try:
import mujoco_py # noqa:F401
except ImportError:
skip_mujoco = True
skip_mujoco_v3 = True
def should_skip_env_spec_for_tests(spec):
@@ -21,7 +16,7 @@ def should_skip_env_spec_for_tests(spec):
# troublesome to run frequently
ep = spec.entry_point
# Skip mujoco tests for pull request CI
if skip_mujoco and ep.startswith("gym.envs.mujoco"):
if skip_mujoco_v3 and ep.startswith("gym.envs.mujoco"):
return True
try:
import gym.envs.atari # noqa:F401
@@ -48,8 +43,19 @@ def should_skip_env_spec_for_tests(spec):
return False
def skip_mujoco_py_env_for_test(spec):
    """Return True for MuJoCo env specs that still depend on mujoco-py.

    Versions below 4 of the gym.envs.mujoco environments use the legacy
    mujoco-py bindings and should be skipped by tests that only have the
    new `mujoco` bindings available.
    """
    is_mujoco_env = spec.entry_point.startswith("gym.envs.mujoco")
    return is_mujoco_env and spec.version < 4
spec_list = [
spec
for spec in sorted(envs.registry.values(), key=lambda x: x.id)
if spec.entry_point is not None and not should_skip_env_spec_for_tests(spec)
]
spec_list_no_mujoco_py = [
spec for spec in spec_list if not skip_mujoco_py_env_for_test(spec)
]

View File

@@ -8,17 +8,19 @@ from gym.envs.registration import EnvSpec
from gym.spaces.box import Box
from gym.spaces.discrete import Discrete
from gym.spaces.space import Space
from tests.envs.spec_list import SKIP_MUJOCO_WARNING_MESSAGE, skip_mujoco, spec_list
from tests.envs.spec_list import (
SKIP_MUJOCO_V3_WARNING_MESSAGE,
skip_mujoco_v3,
spec_list,
)
ENVIRONMENT_IDS = ("HalfCheetah-v2",)
def make_envs_by_action_space_type(spec_list: List[EnvSpec], action_space: Space):
"""Make environments of specific action_space type.
This function returns a filtered list of environment from the
spec_list that matches the action_space type.
Args:
spec_list (list): list of registered environments' specification
action_space (gym.spaces.Space): action_space type
@@ -31,7 +33,7 @@ def make_envs_by_action_space_type(spec_list: List[EnvSpec], action_space: Space
return filtered_envs
@pytest.mark.skipif(skip_mujoco, reason=SKIP_MUJOCO_WARNING_MESSAGE)
@pytest.mark.skipif(skip_mujoco_v3, reason=SKIP_MUJOCO_V3_WARNING_MESSAGE)
@pytest.mark.parametrize("environment_id", ENVIRONMENT_IDS)
def test_serialize_deserialize(environment_id):
env = envs.make(environment_id)
@@ -47,10 +49,8 @@ def test_serialize_deserialize(environment_id):
@pytest.mark.parametrize("env", make_envs_by_action_space_type(spec_list, Discrete))
def test_discrete_actions_out_of_bound(env):
"""Test out of bound actions in Discrete action_space.
In discrete action_space environments, `out-of-bound`
actions are not allowed and should raise an exception.
Args:
env (gym.Env): the gym environment
"""
@@ -69,11 +69,9 @@ def test_discrete_actions_out_of_bound(env):
)
def test_box_actions_out_of_bound(env, seed):
"""Test out of bound actions in Box action_space.
Environments with Box actions spaces perform clipping inside `step`.
The expected behaviour is that an action `out-of-bound` has the same effect
of an action with value exactly at the upper (or lower) bound.
Args:
env (gym.Env): the gym environment
seed (int): seed value for determinism

View File

@@ -4,7 +4,7 @@ import pytest
from gym import envs
from gym.spaces import Box
from gym.utils.env_checker import check_env
from tests.envs.spec_list import spec_list
from tests.envs.spec_list import spec_list, spec_list_no_mujoco_py
# This runs a smoketest on each official registered env. We may want
@@ -13,7 +13,9 @@ from tests.envs.spec_list import spec_list
@pytest.mark.filterwarnings(
"ignore:.*We recommend you to use a symmetric and normalized Box action space.*"
)
@pytest.mark.parametrize("spec", spec_list, ids=[spec.id for spec in spec_list])
@pytest.mark.parametrize(
"spec", spec_list_no_mujoco_py, ids=[spec.id for spec in spec_list_no_mujoco_py]
)
def test_env(spec):
# Capture warnings
with pytest.warns(None) as warnings:
@@ -47,12 +49,14 @@ def test_env(spec):
assert (
observation.dtype == ob_space.dtype
), f"Step observation dtype: {ob.dtype}, expected: {ob_space.dtype}"
for mode in env.metadata.get("render_modes", []):
if not (mode == "human" and spec.entry_point.startswith("gym.envs.mujoco")):
env.render(mode=mode)
# Make sure we can render the environment after close.
for mode in env.metadata.get("render_modes", []):
if not (mode == "human" and spec.entry_point.startswith("gym.envs.mujoco")):
env.render(mode=mode)
env.close()

View File

@@ -3,7 +3,7 @@ import unittest
import numpy as np
from gym import envs
from tests.envs.spec_list import SKIP_MUJOCO_WARNING_MESSAGE, skip_mujoco
from tests.envs.spec_list import SKIP_MUJOCO_V3_WARNING_MESSAGE, skip_mujoco_v3
def verify_environments_match(
@@ -31,7 +31,7 @@ def verify_environments_match(
np.testing.assert_allclose(old_info[key], new_info[key], atol=eps)
@unittest.skipIf(skip_mujoco, SKIP_MUJOCO_WARNING_MESSAGE)
@unittest.skipIf(skip_mujoco_v3, SKIP_MUJOCO_V3_WARNING_MESSAGE)
class Mujocov2Tov3ConversionTest(unittest.TestCase):
def test_environments_match(self):
test_cases = (