diff --git a/gym/envs/mujoco/ant_v3.py b/gym/envs/mujoco/ant_v3.py
index 73336e8db..4e20dc3ca 100644
--- a/gym/envs/mujoco/ant_v3.py
+++ b/gym/envs/mujoco/ant_v3.py
@@ -12,10 +12,10 @@ class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
    """
    ### Description
-    This environment is based on the environment iintroduced by Schulman,
+    This environment is based on the environment introduced by Schulman,
    Moritz, Levine, Jordan and Abbeel in ["High-Dimensional Continuous
    Control Using Generalized Advantage Estimation"](https://arxiv.org/abs/1506.02438).
-    The ant is a 3D roboot consisting of one torso (free rotational body) with
+    The ant is a 3D robot consisting of one torso (free rotational body) with
    four legs attached to it with each leg having two links. The goal is to
    coordinate the four legs to move in the forward (right) direction by applying
    torques on the eight hinges connecting the two links of each leg and the torso
@@ -41,14 +41,14 @@ class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):

    ### Observation Space

-    The state space consists of positional values of different body parts of the hopper,
+    The state space consists of positional values of different body parts of the ant,
    followed by the velocities of those individual parts (their derivatives) with all
    the positions ordered before all the velocities.

    The observation is a `ndarray` with shape `(111,)` where the elements correspond to the following:

-    | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
-    |-----|---------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
+    | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
+    |-----|-------------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
    | 0 | x-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) |
    | 1 | y-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) |
    | 2 | z-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) |
@@ -137,7 +137,7 @@ class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):

    No additional arguments are currently supported (in v2 and lower), but modifications
    can be made to the XML file in the assets folder (or by changing the path to a modified
-    XML file in another folder)..
+    XML file in another folder).

    ```
    env = gym.make('Ant-v2')
    ```
@@ -146,7 +146,7 @@ class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
    v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.

    ```
-    env = gym.make('Ant-v3', ctrl_cost_weight=0.1, ....)
+    env = gym.make('Ant-v3', ctrl_cost_weight=0.1, ...)
    ```

    ### Version History
diff --git a/gym/envs/mujoco/half_cheetah_v3.py b/gym/envs/mujoco/half_cheetah_v3.py
index 99663a94a..56e148756 100644
--- a/gym/envs/mujoco/half_cheetah_v3.py
+++ b/gym/envs/mujoco/half_cheetah_v3.py
@@ -101,7 +101,7 @@ class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle):
    (default is 5), where the *dt* for one frame is 0.01 - making the
    default *dt = 5*0.01 = 0.05*. This reward would be positive if the cheetah
    runs forward (right) desired.
-    - *reward_control*: A negative reward for penalising the swimmer if it takes
+    - *reward_control*: A negative reward for penalising the cheetah if it takes
    actions that are too large. It is measured as *-coefficient x sum(action2)*
    where *coefficient* is a parameter set for the control and has a default value of 0.1
diff --git a/gym/envs/mujoco/hopper_v3.py b/gym/envs/mujoco/hopper_v3.py
index a9312ec51..b9b6211e0 100644
--- a/gym/envs/mujoco/hopper_v3.py
+++ b/gym/envs/mujoco/hopper_v3.py
@@ -18,8 +18,7 @@ class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle):
    ### Description

    This environment is based on the work done by Erez, Tassa, and Todorov in
-    ["Infinite Horizon Model Predictive Control for Nonlinear Periodic Tasks"]
-    (http://www.roboticsproceedings.org/rss07/p10.pdf). The environment aims to
+    ["Infinite Horizon Model Predictive Control for Nonlinear Periodic Tasks"](http://www.roboticsproceedings.org/rss07/p10.pdf). The environment aims to
    increase the number of independent state and control variables as compared to
    the classic control environments. The hopper is a two-dimensional
    one-legged figure that consist of four main body parts - the torso at the
@@ -71,19 +70,20 @@ class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle):
    on that value. This value is hidden from the algorithm, which in turn has
    to develop an abstract understanding of it from the observed rewards. Therefore,
    observation space has shape `(11,)` instead of `(12,)` and looks like:
+
    | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
    |-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
-    | 0 | z-coordinate of the top (height of hopper) | -Inf | Inf | rootz | slide | position (m) |
-    | 1 | angle of the top | -Inf | Inf | rooty | hinge | angle (rad) |
-    | 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) |
-    | 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) |
-    | 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) |
-    | 5 | velocity of the x-coordinate of the top | -Inf | Inf | rootx | slide | velocity (m/s) |
-    | 6 | velocity of the z-coordinate (height) of the top | -Inf | Inf | rootz | slide | velocity (m/s) |
-    | 7 | angular velocity of the angle of the top | -Inf | Inf | rooty | hinge | angular velocity (rad/s) |
-    | 8 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) |
-    | 9 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) |
-    | 10 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) |
+    | 0 | z-coordinate of the top (height of hopper) | -Inf | Inf | rootz | slide | position (m) |
+    | 1 | angle of the top | -Inf | Inf | rooty | hinge | angle (rad) |
+    | 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) |
+    | 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) |
+    | 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) |
+    | 5 | velocity of the x-coordinate of the top | -Inf | Inf | rootx | slide | velocity (m/s) |
+    | 6 | velocity of the z-coordinate (height) of the top | -Inf | Inf | rootz | slide | velocity (m/s) |
+    | 7 | angular velocity of the angle of the top | -Inf | Inf | rooty | hinge | angular velocity (rad/s) |
+    | 8 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) |
+    | 9 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) |
+    | 10 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) |

    ### Rewards
    The reward consists of three parts:
@@ -103,8 +103,8 @@ class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle):

    ### Starting State
    All observations start in state
-    (0.0, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) with a uniform nois
-    e in the range of [-0.005, 0.005] added to the values for stochasticity.
+    (0.0, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) with a uniform noise
+    in the range of [-0.005, 0.005] added to the values for stochasticity.

    ### Episode Termination
    The episode terminates when any of the following happens:
diff --git a/gym/envs/mujoco/humanoid_v3.py b/gym/envs/mujoco/humanoid_v3.py
index 0ba35caf2..3b2131f5b 100644
--- a/gym/envs/mujoco/humanoid_v3.py
+++ b/gym/envs/mujoco/humanoid_v3.py
@@ -144,7 +144,7 @@ class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle):

    ### Rewards
    The reward consists of three parts:
-    - *alive_bonus*: Every timestep that the ant is alive, it gets a reward of 5.
+    - *alive_bonus*: Every timestep that the humanoid is alive, it gets a reward of 5.
    - *lin_vel_cost*: A reward of walking forward which is measured as *1.25 *
    (average center of mass before action - average center of mass after action)/dt*.
    *dt* is the time between actions and is dependent on the frame_skip parameter
diff --git a/gym/envs/mujoco/humanoidstandup.py b/gym/envs/mujoco/humanoidstandup.py
index b410f975d..820e40e16 100644
--- a/gym/envs/mujoco/humanoidstandup.py
+++ b/gym/envs/mujoco/humanoidstandup.py
@@ -8,12 +8,12 @@ class HumanoidStandupEnv(mujoco_env.MujocoEnv, utils.EzPickle):
    ### Description

    This environment is based on the environment introduced by Tassa, Erez and Todorov
-    in ["Synthesis and stabilization of complex behaviors through online trajectory optimization"]
-    (https://ieeexplore.ieee.org/document/6386025). The 3D bipedal robot is designed to simulate
-    a human. It has a torso (abdomen) with a pair of legs and arms. The legs each consist of two
-    links, and so the arms (representing the knees and elbows respectively). The environment starts
-    with the humanoid layiing on the ground, and then the goal of the environment is to make the
-    humanoid standup and then keep it standing by applying torques on the various hinges.
+    in ["Synthesis and stabilization of complex behaviors through online trajectory optimization"](https://ieeexplore.ieee.org/document/6386025).
+    The 3D bipedal robot is designed to simulate a human. It has a torso (abdomen) with a
+    pair of legs and arms. The legs each consist of two links, and so do the arms (representing the
+    knees and elbows respectively). The environment starts with the humanoid lying on the ground,
+    and then the goal of the environment is to make the humanoid stand up and then keep it standing
+    by applying torques on the various hinges.

    ### Action Space
    The agent take a 17-element vector for actions.
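The Ant hunks above mention the `gym.make` keyword arguments that the v3 environments accept (`xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc.). A minimal usage sketch, assuming the classic Gym API of this version (`reset()` returning only the observation and `step()` returning a 4-tuple); the kwarg values are illustrative only:

```
import gym

# ctrl_cost_weight and reset_noise_scale are among the kwargs the Ant-v3
# docstring lists; the values here are placeholders, not recommendations.
env = gym.make("Ant-v3", ctrl_cost_weight=0.1, reset_noise_scale=0.1)

obs = env.reset()
print(obs.shape)  # (111,): positions, then velocities, then contact forces

# One random step; classic Gym returns (observation, reward, done, info).
obs, reward, done, info = env.step(env.action_space.sample())
env.close()
```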
diff --git a/gym/envs/mujoco/inverted_double_pendulum.py b/gym/envs/mujoco/inverted_double_pendulum.py
index 710b7e2a3..9fbd65404 100644
--- a/gym/envs/mujoco/inverted_double_pendulum.py
+++ b/gym/envs/mujoco/inverted_double_pendulum.py
@@ -29,7 +29,7 @@ class InvertedDoublePendulumEnv(mujoco_env.MujocoEnv, utils.EzPickle):

    ### Observation Space

-    The state space consists of positional values of different body parts of the hopper,
+    The state space consists of positional values of different body parts of the pendulum system,
    followed by the velocities of those individual parts (their derivatives) with all
    the positions ordered before all the velocities.
diff --git a/gym/envs/mujoco/inverted_pendulum.py b/gym/envs/mujoco/inverted_pendulum.py
index e8b823df5..696bedc27 100644
--- a/gym/envs/mujoco/inverted_pendulum.py
+++ b/gym/envs/mujoco/inverted_pendulum.py
@@ -30,7 +30,7 @@ class InvertedPendulumEnv(mujoco_env.MujocoEnv, utils.EzPickle):

    ### Observation Space
    The state space consists of positional values of different body parts of
-    the hopper, followed by the velocities of those individual parts (their derivatives)
+    the pendulum system, followed by the velocities of those individual parts (their derivatives)
    with all the positions ordered before all the velocities.

    The observation is a `ndarray` with shape `(4,)` where the elements correspond to the following:
diff --git a/gym/envs/mujoco/reacher.py b/gym/envs/mujoco/reacher.py
index cb3c29d83..bfda039a9 100644
--- a/gym/envs/mujoco/reacher.py
+++ b/gym/envs/mujoco/reacher.py
@@ -46,6 +46,7 @@ class ReacherEnv(mujoco_env.MujocoEnv, utils.EzPickle):
    reacher the state is created by combining only certain elements of the position
    and velocity, and performing some function transformations on them. If one
    is to read the `.xml` for reacher then they will find 4 joints:
+
    | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
    |-----|-----------------------------|----------|----------|----------------------------------|-------|--------------------|
    | 0 | angle of the first arm | -Inf | Inf | joint0 | hinge | angle (rad |
diff --git a/gym/envs/mujoco/swimmer_v3.py b/gym/envs/mujoco/swimmer_v3.py
index 43ad6b630..b64606244 100644
--- a/gym/envs/mujoco/swimmer_v3.py
+++ b/gym/envs/mujoco/swimmer_v3.py
@@ -72,6 +72,7 @@ class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle):
    In practice (and Gym implementation), the first two positional elements are
    omitted from the state space since the reward function is calculated based
    on those values. Therefore, observation space has shape `(8,)` and looks like:
+
    | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit |
    |-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------|
    | 0 | angle of the front tip | -Inf | Inf | rot | hinge | angle (rad) |
@@ -89,7 +90,7 @@ class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle):
    as *(x-coordinate before action - x-coordinate after action)/dt*. *dt* is
    the time between actions and is dependeent on the frame_skip parameter
    (default is 4), where the *dt* for one frame is 0.01 - making the
-    default *dt = 4*0.01 = 0.04*. This reward would be positive if the swimmer
+    default *dt = 4 * 0.01 = 0.04*. This reward would be positive if the swimmer
    swims right as desired.
    - *reward_control*: A negative reward for penalising the swimmer if it takes
    actions that are too large. It is measured as *-coefficient x
diff --git a/gym/envs/mujoco/walker2d_v3.py b/gym/envs/mujoco/walker2d_v3.py
index faebde0ae..a69926bae 100644
--- a/gym/envs/mujoco/walker2d_v3.py
+++ b/gym/envs/mujoco/walker2d_v3.py
@@ -76,6 +76,7 @@ class Walker2dEnv(mujoco_env.MujocoEnv, utils.EzPickle):
    hidden from the algorithm, which in turn has to develop an abstract understanding of it from the
    observed rewards. Therefore, observation space has shape `(17,)`
    instead of `(18,)` and looks like:
+
    | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
    |-----|--------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
    | 0 | z-coordinate of the top (height of hopper) | -Inf | Inf | rootz (torso) | slide | position (m) |
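Several of the reward sections touched by this patch describe the same quadratic control penalty, *-coefficient x sum(action2)* with a default coefficient of 0.1. A minimal NumPy sketch of that term; the helper name `control_cost` is illustrative and not part of the Gym API:

```
import numpy as np

def control_cost(action, coefficient=0.1):
    # -coefficient * sum(action^2), as described in the reward sections above;
    # individual environments may weight this term differently.
    return -coefficient * np.sum(np.square(action))

print(control_cost(np.array([0.5, -0.5])))  # -0.1 * 0.5 = -0.05
```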