2016-04-27 08:00:58 -07:00
"""
Classic cart - pole system implemented by Rich Sutton et al .
2017-06-14 16:27:42 -04:00
Copied from http : / / incompleteideas . net / sutton / book / code / pole . c
permalink : https : / / perma . cc / C9ZM - 652 R
2016-04-27 08:00:58 -07:00
"""
import math
2022-02-10 18:24:41 +01:00
from typing import Optional , Union
2021-12-08 22:14:15 +01:00
2022-02-11 23:48:42 +08:00
import numpy as np
import pygame
from pygame import gfxdraw
2016-04-27 08:00:58 -07:00
import gym
Cleanup, removal of unmaintained code (#836)
* add dtype to Box
* remove board_game, debugging, safety, parameter_tuning environments
* massive set of breaking changes
- remove python logging module
- _step, _reset, _seed, _close => non underscored method
- remove benchmark and scoring folder
* Improve render("human"), now resizable, closable window.
* get rid of default step and reset in wrappers, so it doesn’t silently fail for people with underscore methods
* CubeCrash unit test environment
* followup fixes
* MemorizeDigits unit test envrionment
* refactored spaces a bit
fixed indentation
disabled test_env_semantics
* fix unit tests
* fixes
* CubeCrash, MemorizeDigits tested
* gym backwards compatibility patch
* gym backwards compatibility, followup fixes
* changelist, add spaces to main namespaces
* undo_logger_setup for backwards compat
* remove configuration.py
2018-01-25 18:20:14 -08:00
from gym import spaces , logger
2016-05-29 09:07:09 -07:00
from gym . utils import seeding
2016-04-27 08:00:58 -07:00
2020-05-08 22:03:48 +02:00
2022-02-10 18:24:41 +01:00
class CartPoleEnv ( gym . Env [ np . ndarray , Union [ int , np . ndarray ] ] ) :
2018-08-24 19:30:17 -04:00
"""
2022-01-26 16:02:42 -05:00
### Description
This environment corresponds to the version of the cart - pole problem
described by Barto, Sutton, and Anderson in ["Neuronlike Adaptive Elements That Can Solve Difficult Learning Control Problems"](https://ieeexplore.ieee.org/document/6313077).
A pole is attached by an un - actuated joint to a cart , which moves along a
frictionless track . The pendulum starts upright , and the goal is to prevent
it from falling over by increasing and reducing the cart ' s velocity.
### Action Space
The agent takes a 1-element vector for actions.
The action space is ` ( action ) ` in ` [ 0 , 1 ] ` , where ` action ` is used to push
the cart with a fixed amount of force :
| Num | Action |
| - - - - - | - - - - - - - - - - - - - - - - - - - - - - - - |
| 0 | Push cart to the left |
| 1 | Push cart to the right |
Note : The amount the velocity is reduced or increased is not fixed as it depends on the angle the pole is pointing .
This is because the center of gravity of the pole increases the amount of energy needed to move the cart underneath it.
### Observation Space
The observation is a ` ndarray ` with shape ` ( 4 , ) ` where the elements correspond to the following :
| Num | Observation | Min | Max |
| - - - - - | - - - - - - - - - - - - - - - - - - - - - - - | - - - - - - - - - - - - - - - - - - - - - - | - - - - - - - - - - - - - - - - - - - - |
| 0 | Cart Position | - 4.8 * | 4.8 * |
| 1 | Cart Velocity | - Inf | Inf |
| 2 | Pole Angle | ~ - 0.418 rad ( - 24 ° ) * * | ~ 0.418 rad ( 24 ° ) * * |
| 3 | Pole Angular Velocity | - Inf | Inf |
* * Note : * * above denotes the ranges of possible observations for each element , but in two cases this range exceeds the
range of possible values in an un - terminated episode :
- ` * ` : the cart x - position can be observed between ` ( - 4.8 , 4.8 ) ` , but an episode terminates if the cart leaves the
` ( - 2.4 , 2.4 ) ` range .
- ` * * ` : Similarly , the pole angle can be observed between ` ( - .418 , .418 ) ` radians or precisely * * ± 24 ° * * , but an episode is
terminated if the pole angle is outside the ` ( - .2095 , .2095 ) ` range or precisely * * ± 12 ° * *
### Rewards
Reward is 1 for every step taken , including the termination step . The threshold is 475 for v1 .
### Starting State
All observations are assigned a uniform random value between ( - 0.05 , 0.05 )
### Episode Termination
The episode terminates if one of the following occurs:
1. Pole Angle is more than ± 12 °
2. Cart Position is more than ± 2.4 ( center of the cart reaches the edge of the display )
3. Episode length is greater than 500 ( 200 for v0 )
### Arguments
No additional arguments are currently supported .
2018-08-24 19:30:17 -04:00
"""
2020-05-08 22:03:48 +02:00
2021-07-29 02:26:34 +02:00
metadata = { " render.modes " : [ " human " , " rgb_array " ] , " video.frames_per_second " : 50 }
2016-04-27 08:00:58 -07:00
def __init__(self):
    """Initialise physical constants, failure thresholds, spaces and state.

    No simulation state exists until ``reset`` is called: ``self.state``
    starts out as ``None``.
    """
    # --- Physical model -------------------------------------------------
    self.gravity = 9.8
    self.masscart = 1.0
    self.masspole = 0.1
    self.length = 0.5  # actually half the pole's length
    self.force_mag = 10.0
    self.tau = 0.02  # seconds between state updates
    # Derived quantities reused by the dynamics in step().
    self.total_mass = self.masspole + self.masscart
    self.polemass_length = self.masspole * self.length
    # NOTE(review): literal kept verbatim (padding spaces included) because
    # step() compares against exactly this value.
    self.kinematics_integrator = " euler "

    # --- Episode failure limits -----------------------------------------
    # Angle at which to fail the episode (12 degrees expressed in radians).
    self.theta_threshold_radians = 12 * 2 * math.pi / 360
    self.x_threshold = 2.4

    # --- Spaces -----------------------------------------------------------
    # Position and angle bounds are twice the failure thresholds so that a
    # failing observation is still inside the observation space; the two
    # velocities are bounded only by the float32 range.
    float32_max = np.finfo(np.float32).max
    obs_bound = np.array(
        [
            self.x_threshold * 2,
            float32_max,
            self.theta_threshold_radians * 2,
            float32_max,
        ],
        dtype=np.float32,
    )
    self.action_space = spaces.Discrete(2)
    self.observation_space = spaces.Box(-obs_bound, obs_bound, dtype=np.float32)

    # --- Rendering and bookkeeping ----------------------------------------
    self.screen = None  # created lazily by render()
    self.isopen = True
    self.state = None
    # None until the episode has ended; afterwards counts extra step() calls.
    self.steps_beyond_done = None
Cleanup, removal of unmaintained code (#836)
* add dtype to Box
* remove board_game, debugging, safety, parameter_tuning environments
* massive set of breaking changes
- remove python logging module
- _step, _reset, _seed, _close => non underscored method
- remove benchmark and scoring folder
* Improve render("human"), now resizable, closable window.
* get rid of default step and reset in wrappers, so it doesn’t silently fail for people with underscore methods
* CubeCrash unit test environment
* followup fixes
* MemorizeDigits unit test envrionment
* refactored spaces a bit
fixed indentation
disabled test_env_semantics
* fix unit tests
* fixes
* CubeCrash, MemorizeDigits tested
* gym backwards compatibility patch
* gym backwards compatibility, followup fixes
* changelist, add spaces to main namespaces
* undo_logger_setup for backwards compat
* remove configuration.py
2018-01-25 18:20:14 -08:00
def step(self, action):
    """Advance the cart-pole simulation by one time step of ``self.tau``.

    Args:
        action: A member of ``self.action_space``. ``1`` pushes the cart
            with ``+self.force_mag``; any other accepted action pushes it
            with ``-self.force_mag``.

    Returns:
        A 4-tuple ``(observation, reward, done, info)``: the new state as a
        float32 array, the reward (1.0, or 0.0 for steps taken after the
        episode has already ended), the termination flag as a ``bool``, and
        an empty dict.

    Raises:
        AssertionError: If the action space rejects ``action`` or if
            ``self.state`` is still ``None``.
    """
    invalid_action_msg = f" {action!r} ( {type(action)} ) invalid "
    assert self.action_space.contains(action), invalid_action_msg
    assert self.state is not None, " Call reset before using step method. "

    x, x_dot, theta, theta_dot = self.state
    if action == 1:
        force = self.force_mag
    else:
        force = -self.force_mag
    cos_theta = math.cos(theta)
    sin_theta = math.sin(theta)

    # For the interested reader:
    # https://coneural.org/florian/papers/05_cart_pole.pdf
    temp = (
        force + self.polemass_length * theta_dot ** 2 * sin_theta
    ) / self.total_mass
    theta_acc = (self.gravity * sin_theta - cos_theta * temp) / (
        self.length * (4.0 / 3.0 - self.masspole * cos_theta ** 2 / self.total_mass)
    )
    x_acc = temp - self.polemass_length * theta_acc * cos_theta / self.total_mass

    dt = self.tau
    # The literal must match the value assigned in __init__ exactly.
    if self.kinematics_integrator == " euler ":
        # Explicit Euler: positions advance using the *old* velocities.
        x, theta = x + dt * x_dot, theta + dt * theta_dot
        x_dot, theta_dot = x_dot + dt * x_acc, theta_dot + dt * theta_acc
    else:  # semi-implicit euler
        # Velocities are updated first and then drive the position update.
        x_dot, theta_dot = x_dot + dt * x_acc, theta_dot + dt * theta_acc
        x, theta = x + dt * x_dot, theta + dt * theta_dot

    self.state = (x, x_dot, theta, theta_dot)

    x_limit = self.x_threshold
    theta_limit = self.theta_threshold_radians
    done = bool(
        x < -x_limit or x > x_limit or theta < -theta_limit or theta > theta_limit
    )

    # Every step earns 1.0, including the one on which the pole falls; only
    # steps taken after termination was already reported earn nothing.
    reward = 1.0
    if done:
        if self.steps_beyond_done is None:
            # Pole just fell!
            self.steps_beyond_done = 0
        else:
            if self.steps_beyond_done == 0:
                logger.warn(
                    " You are calling ' step() ' even though this "
                    " environment has already returned done = True. You "
                    " should always call ' reset() ' once you receive ' done = "
                    " True ' -- any further steps are undefined behavior. "
                )
            self.steps_beyond_done += 1
            reward = 0.0

    return np.array(self.state, dtype=np.float32), reward, done, {}
2016-04-27 08:00:58 -07:00
2022-02-06 17:28:27 -06:00
def reset(
    self,
    *,
    seed: Optional[int] = None,
    return_info: bool = False,
    options: Optional[dict] = None,
):
    """Start a new episode from a small random state.

    Args:
        seed: Forwarded to the parent class's ``reset``.
        return_info: When true, an empty info dict is returned alongside
            the observation.
        options: Accepted for interface compatibility; not read here.

    Returns:
        The initial observation as a float32 array, or the tuple
        ``(observation, {})`` when ``return_info`` is true.
    """
    super().reset(seed=seed)
    # All four state components are drawn uniformly from [-0.05, 0.05).
    self.state = self.np_random.uniform(low=-0.05, high=0.05, size=(4,))
    self.steps_beyond_done = None

    observation = np.array(self.state, dtype=np.float32)
    if return_info:
        return observation, {}
    return observation
2016-04-27 08:00:58 -07:00
2021-07-29 02:26:34 +02:00
def render(self, mode=" human "):
    """Draw the current cart-pole state with pygame.

    Args:
        mode: Rendering mode. In the human mode the display is flipped so
            the frame becomes visible; in the rgb_array mode the frame is
            returned as an array instead.

    Returns:
        ``None`` if there is no state to draw yet (``reset`` has not been
        called). In the rgb_array mode, the screen pixels as an array with
        the first two axes swapped (``axes=(1, 0, 2)``). Otherwise
        ``self.isopen``.
    """
    if self.state is None:
        # Nothing to draw before the first reset(). Bail out before touching
        # pygame; indexing the missing state would otherwise raise a
        # TypeError after a window had already been opened.
        return None

    screen_width = 600
    screen_height = 400

    world_width = self.x_threshold * 2
    scale = screen_width / world_width  # pixels per world unit
    polewidth = 10.0
    polelen = scale * (2 * self.length)  # self.length is half the pole
    cartwidth = 50.0
    cartheight = 30.0

    x = self.state

    if self.screen is None:
        # Lazily create the display on first use.
        pygame.init()
        self.screen = pygame.display.set_mode((screen_width, screen_height))

    # A fresh white canvas is drawn for every frame.
    self.surf = pygame.Surface((screen_width, screen_height))
    self.surf.fill((255, 255, 255))

    # Cart: a rectangle centred on (cartx, carty).
    left, right = -cartwidth / 2, cartwidth / 2
    top, bottom = cartheight / 2, -cartheight / 2
    axleoffset = cartheight / 4.0
    cartx = x[0] * scale + screen_width / 2.0  # MIDDLE OF CART
    carty = 100  # TOP OF CART
    cart_coords = [
        (corner_x + cartx, corner_y + carty)
        for corner_x, corner_y in (
            (left, bottom),
            (left, top),
            (right, top),
            (right, bottom),
        )
    ]
    gfxdraw.aapolygon(self.surf, cart_coords, (0, 0, 0))
    gfxdraw.filled_polygon(self.surf, cart_coords, (0, 0, 0))

    # Pole: a rectangle rotated by the pole angle around the axle.
    left, right = -polewidth / 2, polewidth / 2
    top, bottom = polelen - polewidth / 2, -polewidth / 2
    pole_coords = []
    for corner in ((left, bottom), (left, top), (right, top), (right, bottom)):
        rotated = pygame.math.Vector2(corner).rotate_rad(-x[2])
        pole_coords.append(
            (rotated[0] + cartx, rotated[1] + carty + axleoffset)
        )
    gfxdraw.aapolygon(self.surf, pole_coords, (202, 152, 101))
    gfxdraw.filled_polygon(self.surf, pole_coords, (202, 152, 101))

    # Axle: a small disc where the pole is attached to the cart.
    axle_x = int(cartx)
    axle_y = int(carty + axleoffset)
    axle_radius = int(polewidth / 2)
    axle_color = (129, 132, 203)
    gfxdraw.aacircle(self.surf, axle_x, axle_y, axle_radius, axle_color)
    gfxdraw.filled_circle(self.surf, axle_x, axle_y, axle_radius, axle_color)

    # Track line.
    gfxdraw.hline(self.surf, 0, screen_width, carty, (0, 0, 0))

    # The scene was laid out with y growing upwards; flip it vertically
    # before copying it onto the screen.
    self.surf = pygame.transform.flip(self.surf, False, True)
    self.screen.blit(self.surf, (0, 0))
    if mode == " human ":
        pygame.display.flip()

    if mode == " rgb_array ":
        return np.transpose(
            np.array(pygame.surfarray.pixels3d(self.screen)), axes=(1, 0, 2)
        )
    return self.isopen
Cleanup, removal of unmaintained code (#836)
* add dtype to Box
* remove board_game, debugging, safety, parameter_tuning environments
* massive set of breaking changes
- remove python logging module
- _step, _reset, _seed, _close => non underscored method
- remove benchmark and scoring folder
* Improve render("human"), now resizable, closable window.
* get rid of default step and reset in wrappers, so it doesn’t silently fail for people with underscore methods
* CubeCrash unit test environment
* followup fixes
* MemorizeDigits unit test envrionment
* refactored spaces a bit
fixed indentation
disabled test_env_semantics
* fix unit tests
* fixes
* CubeCrash, MemorizeDigits tested
* gym backwards compatibility patch
* gym backwards compatibility, followup fixes
* changelist, add spaces to main namespaces
* undo_logger_setup for backwards compat
* remove configuration.py
2018-01-25 18:20:14 -08:00
def close(self):
    """Shut pygame down if render() ever created a screen.

    Does nothing when no screen was created; otherwise quits pygame and
    marks the environment's window as no longer open.
    """
    if self.screen is None:
        return
    pygame.quit()
    self.isopen = False