Clean some docstrings (#1854)

* add type of argument

* fix typos

* split lines for formatting

* reformat string, add ellipsis, remove r string

* make docstring stylistically consistent

* make docstrings a little more elaborate

* reduce by 1 space

* make line wrap 120

* remove unnecessary line

* add returns to docstring

* add docstring, make code more pep8 and delete some unused print functions

* more pep8

* file docstring instead of comments

* delete unused variables, add file docstring and add some pep8 spring cleaning

* add file docstring, fix typos and add some pep8 corrections

Co-authored-by: Dan <daniel.timbrell@ing.com>
This commit is contained in:
Dan Timbrell
2020-04-24 23:10:27 +02:00
committed by GitHub
parent f2c9793eb7
commit 3bd5ef71c2
7 changed files with 360 additions and 291 deletions

View File

@@ -11,12 +11,20 @@ def cem(f, th_mean, batch_size, n_iter, elite_frac, initial_std=1.0):
""" """
Generic implementation of the cross-entropy method for maximizing a black-box function Generic implementation of the cross-entropy method for maximizing a black-box function
f: a function mapping from vector -> scalar Args:
th_mean: initial mean over input distribution f: a function mapping from vector -> scalar
batch_size: number of samples of theta to evaluate per batch th_mean (np.array): initial mean over input distribution
n_iter: number of batches batch_size (int): number of samples of theta to evaluate per batch
elite_frac: each batch, select this fraction of the top-performing samples n_iter (int): number of batches
initial_std: initial standard deviation over parameter vectors elite_frac (float): each batch, select this fraction of the top-performing samples
initial_std (float): initial standard deviation over parameter vectors
returns:
A generator of dicts. Subsequent dicts correspond to iterations of CEM algorithm.
The dicts contain the following values:
'ys' : numpy array with values of function evaluated at current population
'ys_mean': mean value of function over current population
'theta_mean': mean value of the parameter vector over current population
""" """
n_elite = int(np.round(batch_size*elite_frac)) n_elite = int(np.round(batch_size*elite_frac))
th_std = np.ones_like(th_mean) * initial_std th_std = np.ones_like(th_mean) * initial_std
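For orientation, a minimal sketch consistent with the docstring above; the name cem_sketch and the quadratic objective are illustrative only, not part of this commit.

import numpy as np

def cem_sketch(f, th_mean, batch_size, n_iter, elite_frac, initial_std=1.0):
    # Sample a population around the current mean, keep the top elite_frac,
    # refit mean/std to the elite, and yield the dict described in the docstring.
    n_elite = int(np.round(batch_size * elite_frac))
    th_std = np.ones_like(th_mean) * initial_std
    for _ in range(n_iter):
        ths = th_mean + th_std[None, :] * np.random.randn(batch_size, th_mean.size)
        ys = np.array([f(th) for th in ths])
        elite_ths = ths[ys.argsort()[::-1][:n_elite]]
        th_mean, th_std = elite_ths.mean(axis=0), elite_ths.std(axis=0)
        yield {'ys': ys, 'ys_mean': ys.mean(), 'theta_mean': th_mean}

# Example: maximize a simple concave function of a 3-vector.
for it in cem_sketch(lambda th: -np.sum(th ** 2), np.zeros(3), batch_size=50, n_iter=5, elite_frac=0.2):
    print("%.3f" % it['ys_mean'], it['theta_mean'])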

View File

@@ -6,7 +6,7 @@ env_closer = closer.Closer()
 class Env(object):
-    r"""The main OpenAI Gym class. It encapsulates an environment with
+    """The main OpenAI Gym class. It encapsulates an environment with
     arbitrary behind-the-scenes dynamics. An environment can be
     partially or fully observed.
@@ -26,9 +26,7 @@ class Env(object):
     Note: a default reward range set to [-inf,+inf] already exists. Set it if you want a narrower range.
-    The methods are accessed publicly as "step", "reset", etc.. The
-    non-underscored versions are wrapper methods to which we may add
-    functionality over time.
+    The methods are accessed publicly as "step", "reset", etc...
     """
     # Set this in SOME subclasses
     metadata = {'render.modes': []}
@@ -174,9 +172,9 @@ class GoalEnv(Env):
     def compute_reward(self, achieved_goal, desired_goal, info):
         """Compute the step reward. This externalizes the reward function and makes
-        it dependent on an a desired goal and the one that was achieved. If you wish to include
+        it dependent on a desired goal and the one that was achieved. If you wish to include
         additional rewards that are independent of the goal, you can include the necessary values
-        to derive it in info and compute it accordingly.
+        to derive it in 'info' and compute it accordingly.
         Args:
             achieved_goal (object): the goal that was achieved during execution
@@ -194,7 +192,7 @@ class GoalEnv(Env):
 class Wrapper(Env):
-    r"""Wraps the environment to allow a modular transformation.
+    """Wraps the environment to allow a modular transformation.
     This class is the base class for all wrappers. The subclass could override
     some methods to change the behavior of the original environment without touching the
@@ -290,4 +288,4 @@ class ActionWrapper(Wrapper):
         raise NotImplementedError
     def reverse_action(self, action):
         raise NotImplementedError
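As a hedged illustration of the compute_reward contract described above (the class name and the 0.05 tolerance are invented for the example; a real implementation would subclass gym.GoalEnv):

import numpy as np

class ReachGoalSketch:
    # Reward depends only on the two goals and on values passed through `info`,
    # so it can be recomputed after the fact (e.g. for hindsight relabelling).
    def compute_reward(self, achieved_goal, desired_goal, info):
        distance = np.linalg.norm(np.asarray(achieved_goal) - np.asarray(desired_goal))
        return 0.0 if distance < 0.05 else -1.0

print(ReachGoalSketch().compute_reward([0.0, 0.0], [0.01, 0.02], info={}))  # 0.0, goal reached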

View File

@@ -1,34 +1,36 @@
"""
Top-down car dynamics simulation.
Some ideas are taken from this great tutorial http://www.iforce2d.net/b2dtut/top-down-car by Chris Campbell.
This simulation is a bit more detailed, with wheels rotation.
Created by Oleg Klimov. Licensed on the same terms as the rest of OpenAI Gym.
"""
import numpy as np import numpy as np
import math import math
import Box2D import Box2D
from Box2D.b2 import (edgeShape, circleShape, fixtureDef, polygonShape, revoluteJointDef, contactListener, shape) from Box2D.b2 import (edgeShape, circleShape, fixtureDef, polygonShape, revoluteJointDef, contactListener, shape)
# Top-down car dynamics simulation.
#
# Some ideas are taken from this great tutorial http://www.iforce2d.net/b2dtut/top-down-car by Chris Campbell.
# This simulation is a bit more detailed, with wheels rotation.
#
# Created by Oleg Klimov. Licensed on the same terms as the rest of OpenAI Gym.
SIZE = 0.02 SIZE = 0.02
ENGINE_POWER = 100000000*SIZE*SIZE ENGINE_POWER = 100000000*SIZE*SIZE
WHEEL_MOMENT_OF_INERTIA = 4000*SIZE*SIZE WHEEL_MOMENT_OF_INERTIA = 4000*SIZE*SIZE
FRICTION_LIMIT = 1000000*SIZE*SIZE # friction ~= mass ~= size^2 (calculated implicitly using density) FRICTION_LIMIT = 1000000*SIZE*SIZE # friction ~= mass ~= size^2 (calculated implicitly using density)
WHEEL_R = 27 WHEEL_R = 27
WHEEL_W = 14 WHEEL_W = 14
WHEELPOS = [ WHEELPOS = [
(-55,+80), (+55,+80), (-55, +80), (+55, +80),
(-55,-82), (+55,-82) (-55, -82), (+55, -82)
] ]
HULL_POLY1 =[ HULL_POLY1 = [
(-60,+130), (+60,+130), (-60, +130), (+60, +130),
(+60,+110), (-60,+110) (+60, +110), (-60, +110)
] ]
HULL_POLY2 =[ HULL_POLY2 = [
(-15,+120), (+15,+120), (-15, +120), (+15, +120),
(+20, +20), (-20, 20) (+20, +20), (-20, 20)
] ]
HULL_POLY3 =[ HULL_POLY3 = [
(+25, +20), (+25, +20),
(+50, -10), (+50, -10),
(+50, -40), (+50, -40),
@@ -38,41 +40,42 @@ HULL_POLY3 =[
(-50, -10), (-50, -10),
(-25, +20) (-25, +20)
] ]
HULL_POLY4 =[ HULL_POLY4 = [
(-50,-120), (+50,-120), (-50, -120), (+50, -120),
(+50,-90), (-50,-90) (+50, -90), (-50, -90)
] ]
WHEEL_COLOR = (0.0,0.0,0.0) WHEEL_COLOR = (0.0, 0.0, 0.0)
WHEEL_WHITE = (0.3,0.3,0.3) WHEEL_WHITE = (0.3, 0.3, 0.3)
MUD_COLOR = (0.4,0.4,0.0) MUD_COLOR = (0.4, 0.4, 0.0)
class Car: class Car:
def __init__(self, world, init_angle, init_x, init_y): def __init__(self, world, init_angle, init_x, init_y):
self.world = world self.world = world
self.hull = self.world.CreateDynamicBody( self.hull = self.world.CreateDynamicBody(
position = (init_x, init_y), position=(init_x, init_y),
angle = init_angle, angle=init_angle,
fixtures = [ fixtures=[
fixtureDef(shape = polygonShape(vertices=[ (x*SIZE,y*SIZE) for x,y in HULL_POLY1 ]), density=1.0), fixtureDef(shape=polygonShape(vertices=[(x*SIZE, y*SIZE) for x, y in HULL_POLY1]), density=1.0),
fixtureDef(shape = polygonShape(vertices=[ (x*SIZE,y*SIZE) for x,y in HULL_POLY2 ]), density=1.0), fixtureDef(shape=polygonShape(vertices=[(x*SIZE, y*SIZE) for x, y in HULL_POLY2]), density=1.0),
fixtureDef(shape = polygonShape(vertices=[ (x*SIZE,y*SIZE) for x,y in HULL_POLY3 ]), density=1.0), fixtureDef(shape=polygonShape(vertices=[(x*SIZE, y*SIZE) for x, y in HULL_POLY3]), density=1.0),
fixtureDef(shape = polygonShape(vertices=[ (x*SIZE,y*SIZE) for x,y in HULL_POLY4 ]), density=1.0) fixtureDef(shape=polygonShape(vertices=[(x*SIZE, y*SIZE) for x, y in HULL_POLY4]), density=1.0)
] ]
) )
self.hull.color = (0.8,0.0,0.0) self.hull.color = (0.8, 0.0, 0.0)
self.wheels = [] self.wheels = []
self.fuel_spent = 0.0 self.fuel_spent = 0.0
WHEEL_POLY = [ WHEEL_POLY = [
(-WHEEL_W,+WHEEL_R), (+WHEEL_W,+WHEEL_R), (-WHEEL_W, +WHEEL_R), (+WHEEL_W, +WHEEL_R),
(+WHEEL_W,-WHEEL_R), (-WHEEL_W,-WHEEL_R) (+WHEEL_W, -WHEEL_R), (-WHEEL_W, -WHEEL_R)
] ]
for wx,wy in WHEELPOS: for wx, wy in WHEELPOS:
front_k = 1.0 if wy > 0 else 1.0 front_k = 1.0 if wy > 0 else 1.0
w = self.world.CreateDynamicBody( w = self.world.CreateDynamicBody(
position = (init_x+wx*SIZE, init_y+wy*SIZE), position=(init_x+wx*SIZE, init_y+wy*SIZE),
angle = init_angle, angle=init_angle,
fixtures = fixtureDef( fixtures=fixtureDef(
shape=polygonShape(vertices=[ (x*front_k*SIZE,y*front_k*SIZE) for x,y in WHEEL_POLY ]), shape=polygonShape(vertices=[(x*front_k*SIZE,y*front_k*SIZE) for x, y in WHEEL_POLY]),
density=0.1, density=0.1,
categoryBits=0x0020, categoryBits=0x0020,
maskBits=0x001, maskBits=0x001,
@@ -80,7 +83,7 @@ class Car:
) )
w.wheel_rad = front_k*WHEEL_R*SIZE w.wheel_rad = front_k*WHEEL_R*SIZE
w.color = WHEEL_COLOR w.color = WHEEL_COLOR
w.gas = 0.0 w.gas = 0.0
w.brake = 0.0 w.brake = 0.0
w.steer = 0.0 w.steer = 0.0
w.phase = 0.0 # wheel angle w.phase = 0.0 # wheel angle
@@ -90,24 +93,28 @@ class Car:
rjd = revoluteJointDef( rjd = revoluteJointDef(
bodyA=self.hull, bodyA=self.hull,
bodyB=w, bodyB=w,
localAnchorA=(wx*SIZE,wy*SIZE), localAnchorA=(wx*SIZE, wy*SIZE),
localAnchorB=(0,0), localAnchorB=(0,0),
enableMotor=True, enableMotor=True,
enableLimit=True, enableLimit=True,
maxMotorTorque=180*900*SIZE*SIZE, maxMotorTorque=180*900*SIZE*SIZE,
motorSpeed = 0, motorSpeed=0,
lowerAngle = -0.4, lowerAngle=-0.4,
upperAngle = +0.4, upperAngle=+0.4,
) )
w.joint = self.world.CreateJoint(rjd) w.joint = self.world.CreateJoint(rjd)
w.tiles = set() w.tiles = set()
w.userData = w w.userData = w
self.wheels.append(w) self.wheels.append(w)
self.drawlist = self.wheels + [self.hull] self.drawlist = self.wheels + [self.hull]
self.particles = [] self.particles = []
     def gas(self, gas):
-        'control: rear wheel drive'
+        """control: rear wheel drive
+
+        Args:
+            gas (float): How much gas gets applied. Gets clipped between 0 and 1.
+        """
         gas = np.clip(gas, 0, 1)
         for w in self.wheels[2:4]:
             diff = gas - w.gas
@@ -115,12 +122,18 @@ class Car:
             w.gas += diff
     def brake(self, b):
-        'control: brake b=0..1, more than 0.9 blocks wheels to zero rotation'
+        """control: brake
+
+        Args:
+            b (0..1): Degree to which the brakes are applied. More than 0.9 blocks the wheels to zero rotation"""
         for w in self.wheels:
             w.brake = b
     def steer(self, s):
-        'control: steer s=-1..1, it takes time to rotate steering wheel from side to side, s is target position'
+        """control: steer
+
+        Args:
+            s (-1..1): target position, it takes time to rotate steering wheel from side-to-side"""
         self.wheels[0].steer = s
         self.wheels[1].steer = s
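A small usage sketch for these control methods, assuming Box2D is installed; the standalone b2World below stands in for the world that CarRacing normally creates:

import Box2D
from gym.envs.box2d.car_dynamics import Car

world = Box2D.b2World((0, 0))                      # zero-gravity, top-down world
car = Car(world, init_angle=0.0, init_x=0.0, init_y=0.0)
car.gas(0.6)      # clipped to [0, 1]; only the two rear wheels are driven
car.brake(0.0)    # values above 0.9 lock the wheels
car.steer(-0.3)   # target steering position in [-1, 1]
car.step(1.0/50)  # advance the wheel/friction simulation by one 50 FPS frame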
@@ -148,7 +161,9 @@ class Car:
# WHEEL_MOMENT_OF_INERTIA*np.square(w.omega)/2 = E -- energy # WHEEL_MOMENT_OF_INERTIA*np.square(w.omega)/2 = E -- energy
# WHEEL_MOMENT_OF_INERTIA*w.omega * domega/dt = dE/dt = W -- power # WHEEL_MOMENT_OF_INERTIA*w.omega * domega/dt = dE/dt = W -- power
# domega = dt*W/WHEEL_MOMENT_OF_INERTIA/w.omega # domega = dt*W/WHEEL_MOMENT_OF_INERTIA/w.omega
w.omega += dt*ENGINE_POWER*w.gas/WHEEL_MOMENT_OF_INERTIA/(abs(w.omega)+5.0) # small coef not to divide by zero
# add small coef not to divide by zero
w.omega += dt*ENGINE_POWER*w.gas/WHEEL_MOMENT_OF_INERTIA/(abs(w.omega)+5.0)
self.fuel_spent += dt*ENGINE_POWER*w.gas self.fuel_spent += dt*ENGINE_POWER*w.gas
if w.brake >= 0.9: if w.brake >= 0.9:
@@ -167,13 +182,15 @@ class Car:
# Physically correct is to always apply friction_limit until speed is equal. # Physically correct is to always apply friction_limit until speed is equal.
# But dt is finite, that will lead to oscillations if difference is already near zero. # But dt is finite, that will lead to oscillations if difference is already near zero.
f_force *= 205000*SIZE*SIZE # Random coefficient to cut oscillations in few steps (have no effect on friction_limit)
# Random coefficient to cut oscillations in few steps (have no effect on friction_limit)
f_force *= 205000*SIZE*SIZE
p_force *= 205000*SIZE*SIZE p_force *= 205000*SIZE*SIZE
force = np.sqrt(np.square(f_force) + np.square(p_force)) force = np.sqrt(np.square(f_force) + np.square(p_force))
# Skid trace # Skid trace
if abs(force) > 2.0*friction_limit: if abs(force) > 2.0*friction_limit:
if w.skid_particle and w.skid_particle.grass==grass and len(w.skid_particle.poly) < 30: if w.skid_particle and w.skid_particle.grass == grass and len(w.skid_particle.poly) < 30:
w.skid_particle.poly.append( (w.position[0], w.position[1]) ) w.skid_particle.poly.append( (w.position[0], w.position[1]) )
elif w.skid_start is None: elif w.skid_start is None:
w.skid_start = w.position w.skid_start = w.position
@@ -213,9 +230,9 @@ class Car:
s2 = math.sin(a2) s2 = math.sin(a2)
c1 = math.cos(a1) c1 = math.cos(a1)
c2 = math.cos(a2) c2 = math.cos(a2)
if s1>0 and s2>0: continue if s1 > 0 and s2 > 0: continue
if s1>0: c1 = np.sign(c1) if s1 > 0: c1 = np.sign(c1)
if s2>0: c2 = np.sign(c2) if s2 > 0: c2 = np.sign(c2)
white_poly = [ white_poly = [
(-WHEEL_W*SIZE, +WHEEL_R*c1*SIZE), (+WHEEL_W*SIZE, +WHEEL_R*c1*SIZE), (-WHEEL_W*SIZE, +WHEEL_R*c1*SIZE), (+WHEEL_W*SIZE, +WHEEL_R*c1*SIZE),
(+WHEEL_W*SIZE, +WHEEL_R*c2*SIZE), (-WHEEL_W*SIZE, +WHEEL_R*c2*SIZE) (+WHEEL_W*SIZE, +WHEEL_R*c2*SIZE), (-WHEEL_W*SIZE, +WHEEL_R*c2*SIZE)
@@ -228,7 +245,7 @@ class Car:
p = Particle() p = Particle()
p.color = WHEEL_COLOR if not grass else MUD_COLOR p.color = WHEEL_COLOR if not grass else MUD_COLOR
p.ttl = 1 p.ttl = 1
p.poly = [(point1[0],point1[1]), (point2[0],point2[1])] p.poly = [(point1[0], point1[1]), (point2[0], point2[1])]
p.grass = grass p.grass = grass
self.particles.append(p) self.particles.append(p)
while len(self.particles) > 30: while len(self.particles) > 30:

View File

@@ -1,3 +1,33 @@
"""
Easiest continuous control task to learn from pixels, a top-down racing environment.
Discrete control is reasonable in this environment as well, on/off discretization is
fine.
State consists of STATE_W x STATE_H pixels.
The reward is -0.1 every frame and +1000/N for every track tile visited, where N is
the total number of tiles visited in the track. For example, if you have finished in 732 frames,
your reward is 1000 - 0.1*732 = 926.8 points.
The game is solved when the agent consistently gets 900+ points. The generated track is random every episode.
The episode finishes when all the tiles are visited. The car also can go outside of the PLAYFIELD - that
is far off the track, then it will get -100 and die.
Some indicators are shown at the bottom of the window along with the state RGB buffer. From
left to right: the true speed, four ABS sensors, the steering wheel position and gyroscope.
To play yourself (it's rather fast for humans), type:
python gym/envs/box2d/car_racing.py
Remember it's a powerful rear-wheel drive car - don't press the accelerator and turn at the
same time.
Created by Oleg Klimov. Licensed on the same terms as the rest of OpenAI Gym.
"""
import sys, math import sys, math
import numpy as np import numpy as np
@@ -12,33 +42,6 @@ from gym.utils import colorize, seeding, EzPickle
import pyglet import pyglet
from pyglet import gl from pyglet import gl
# Easiest continuous control task to learn from pixels, a top-down racing environment.
# Discrete control is reasonable in this environment as well, on/off discretization is
# fine.
#
# State consists of STATE_W x STATE_H pixels.
#
# Reward is -0.1 every frame and +1000/N for every track tile visited, where N is
# the total number of tiles visited in the track. For example, if you have finished in 732 frames,
# your reward is 1000 - 0.1*732 = 926.8 points.
#
# Game is solved when agent consistently gets 900+ points. Track generated is random every episode.
#
# Episode finishes when all tiles are visited. Car also can go outside of PLAYFIELD, that
# is far off the track, then it will get -100 and die.
#
# Some indicators shown at the bottom of the window and the state RGB buffer. From
# left to right: true speed, four ABS sensors, steering wheel position and gyroscope.
#
# To play yourself (it's rather fast for humans), type:
#
# python gym/envs/box2d/car_racing.py
#
# Remember it's powerful rear-wheel drive car, don't press accelerator and turn at the
# same time.
#
# Created by Oleg Klimov. Licensed on the same terms as the rest of OpenAI Gym.
STATE_W = 96 # less than Atari 160x192 STATE_W = 96 # less than Atari 160x192
STATE_H = 96 STATE_H = 96
VIDEO_W = 600 VIDEO_W = 600
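To make the reward description above concrete, the worked numbers from the docstring:

frames = 732                       # frames used while visiting all N track tiles
tile_bonus = 1000.0                # +1000/N per tile sums to +1000 once every tile is visited
print(tile_bonus - 0.1 * frames)   # 926.8, matching the docstring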
@@ -46,12 +49,12 @@ VIDEO_H = 400
WINDOW_W = 1000 WINDOW_W = 1000
WINDOW_H = 800 WINDOW_H = 800
SCALE = 6.0 # Track scale SCALE = 6.0 # Track scale
TRACK_RAD = 900/SCALE # Track is heavily morphed circle with this radius TRACK_RAD = 900/SCALE # Track is heavily morphed circle with this radius
PLAYFIELD = 2000/SCALE # Game over boundary PLAYFIELD = 2000/SCALE # Game over boundary
FPS = 50 # Frames per second FPS = 50 # Frames per second
ZOOM = 2.7 # Camera zoom ZOOM = 2.7 # Camera zoom
ZOOM_FOLLOW = True # Set to False for fixed view (don't use zoom) ZOOM_FOLLOW = True # Set to False for fixed view (don't use zoom)
TRACK_DETAIL_STEP = 21/SCALE TRACK_DETAIL_STEP = 21/SCALE
@@ -62,14 +65,18 @@ BORDER_MIN_COUNT = 4
ROAD_COLOR = [0.4, 0.4, 0.4] ROAD_COLOR = [0.4, 0.4, 0.4]
class FrictionDetector(contactListener): class FrictionDetector(contactListener):
def __init__(self, env): def __init__(self, env):
contactListener.__init__(self) contactListener.__init__(self)
self.env = env self.env = env
def BeginContact(self, contact): def BeginContact(self, contact):
self._contact(contact, True) self._contact(contact, True)
def EndContact(self, contact): def EndContact(self, contact):
self._contact(contact, False) self._contact(contact, False)
def _contact(self, contact, begin): def _contact(self, contact, begin):
tile = None tile = None
obj = None obj = None
@@ -77,10 +84,10 @@ class FrictionDetector(contactListener):
u2 = contact.fixtureB.body.userData u2 = contact.fixtureB.body.userData
if u1 and "road_friction" in u1.__dict__: if u1 and "road_friction" in u1.__dict__:
tile = u1 tile = u1
obj = u2 obj = u2
if u2 and "road_friction" in u2.__dict__: if u2 and "road_friction" in u2.__dict__:
tile = u2 tile = u2
obj = u1 obj = u1
if not tile: if not tile:
return return
@@ -91,14 +98,12 @@ class FrictionDetector(contactListener):
return return
if begin: if begin:
obj.tiles.add(tile) obj.tiles.add(tile)
# print tile.road_friction, "ADD", len(obj.tiles)
if not tile.road_visited: if not tile.road_visited:
tile.road_visited = True tile.road_visited = True
self.env.reward += 1000.0/len(self.env.track) self.env.reward += 1000.0/len(self.env.track)
self.env.tile_visited_count += 1 self.env.tile_visited_count += 1
else: else:
obj.tiles.remove(tile) obj.tiles.remove(tile)
# print tile.road_friction, "DEL", len(obj.tiles) -- should delete to zero when on grass (this works)
class CarRacing(gym.Env, EzPickle): class CarRacing(gym.Env, EzPickle):
metadata = { metadata = {
@@ -120,10 +125,12 @@ class CarRacing(gym.Env, EzPickle):
self.prev_reward = 0.0 self.prev_reward = 0.0
self.verbose = verbose self.verbose = verbose
self.fd_tile = fixtureDef( self.fd_tile = fixtureDef(
shape = polygonShape(vertices= shape=polygonShape(vertices=[(0, 0), (1, 0), (1, -1), (0, -1)]))
[(0, 0),(1, 0),(1, -1),(0, -1)]))
self.action_space = spaces.Box(np.array([-1, 0, 0]),
np.array([+1, +1, +1]),
dtype=np.float32) # steer, gas, brake
self.action_space = spaces.Box( np.array([-1,0,0]), np.array([+1,+1,+1]), dtype=np.float32) # steer, gas, brake
self.observation_space = spaces.Box(low=0, high=255, shape=(STATE_H, STATE_W, 3), dtype=np.uint8) self.observation_space = spaces.Box(low=0, high=255, shape=(STATE_H, STATE_W, 3), dtype=np.uint8)
def seed(self, seed=None): def seed(self, seed=None):
@@ -146,19 +153,14 @@ class CarRacing(gym.Env, EzPickle):
for c in range(CHECKPOINTS): for c in range(CHECKPOINTS):
alpha = 2*math.pi*c/CHECKPOINTS + self.np_random.uniform(0, 2*math.pi*1/CHECKPOINTS) alpha = 2*math.pi*c/CHECKPOINTS + self.np_random.uniform(0, 2*math.pi*1/CHECKPOINTS)
rad = self.np_random.uniform(TRACK_RAD/3, TRACK_RAD) rad = self.np_random.uniform(TRACK_RAD/3, TRACK_RAD)
if c==0: if c == 0:
alpha = 0 alpha = 0
rad = 1.5*TRACK_RAD rad = 1.5*TRACK_RAD
if c==CHECKPOINTS-1: if c == CHECKPOINTS-1:
alpha = 2*math.pi*c/CHECKPOINTS alpha = 2*math.pi*c/CHECKPOINTS
self.start_alpha = 2*math.pi*(-0.5)/CHECKPOINTS self.start_alpha = 2*math.pi*(-0.5)/CHECKPOINTS
rad = 1.5*TRACK_RAD rad = 1.5*TRACK_RAD
checkpoints.append( (alpha, rad*math.cos(alpha), rad*math.sin(alpha)) ) checkpoints.append((alpha, rad*math.cos(alpha), rad*math.sin(alpha)))
# print "\n".join(str(h) for h in checkpoints)
# self.road_poly = [ ( # uncomment this to see checkpoints
# [ (tx,ty) for a,tx,ty in checkpoints ],
# (0.7,0.7,0.9) ) ]
self.road = [] self.road = []
# Go from one checkpoint to another to create track # Go from one checkpoint to another to create track
@@ -197,43 +199,42 @@ class CarRacing(gym.Env, EzPickle):
dest_dx = dest_x - x # vector towards destination dest_dx = dest_x - x # vector towards destination
dest_dy = dest_y - y dest_dy = dest_y - y
proj = r1x*dest_dx + r1y*dest_dy # destination vector projected on rad proj = r1x*dest_dx + r1y*dest_dy # destination vector projected on rad
while beta - alpha > 1.5*math.pi: while beta - alpha > 1.5*math.pi:
beta -= 2*math.pi beta -= 2*math.pi
while beta - alpha < -1.5*math.pi: while beta - alpha < -1.5*math.pi:
beta += 2*math.pi beta += 2*math.pi
prev_beta = beta prev_beta = beta
proj *= SCALE proj *= SCALE
if proj > 0.3: if proj > 0.3:
beta -= min(TRACK_TURN_RATE, abs(0.001*proj)) beta -= min(TRACK_TURN_RATE, abs(0.001*proj))
if proj < -0.3: if proj < -0.3:
beta += min(TRACK_TURN_RATE, abs(0.001*proj)) beta += min(TRACK_TURN_RATE, abs(0.001*proj))
x += p1x*TRACK_DETAIL_STEP x += p1x*TRACK_DETAIL_STEP
y += p1y*TRACK_DETAIL_STEP y += p1y*TRACK_DETAIL_STEP
track.append( (alpha,prev_beta*0.5 + beta*0.5,x,y) ) track.append((alpha,prev_beta*0.5 + beta*0.5,x,y))
if laps > 4: if laps > 4:
break break
no_freeze -= 1 no_freeze -= 1
if no_freeze==0: if no_freeze == 0:
break break
# print "\n".join([str(t) for t in enumerate(track)])
# Find closed loop range i1..i2, first loop should be ignored, second is OK # Find closed loop range i1..i2, first loop should be ignored, second is OK
i1, i2 = -1, -1 i1, i2 = -1, -1
i = len(track) i = len(track)
while True: while True:
i -= 1 i -= 1
if i==0: if i == 0:
return False # Failed return False # Failed
pass_through_start = track[i][0] > self.start_alpha and track[i-1][0] <= self.start_alpha pass_through_start = track[i][0] > self.start_alpha and track[i-1][0] <= self.start_alpha
if pass_through_start and i2==-1: if pass_through_start and i2 == -1:
i2 = i i2 = i
elif pass_through_start and i1==-1: elif pass_through_start and i1 == -1:
i1 = i i1 = i
break break
if self.verbose == 1: if self.verbose == 1:
print("Track generation: %i..%i -> %i-tiles track" % (i1, i2, i2-i1)) print("Track generation: %i..%i -> %i-tiles track" % (i1, i2, i2-i1))
assert i1!=-1 assert i1 != -1
assert i2!=-1 assert i2 != -1
track = track[i1:i2-1] track = track[i1:i2-1]
@@ -242,8 +243,8 @@ class CarRacing(gym.Env, EzPickle):
first_perp_y = math.sin(first_beta) first_perp_y = math.sin(first_beta)
# Length of perpendicular jump to put together head and tail # Length of perpendicular jump to put together head and tail
well_glued_together = np.sqrt( well_glued_together = np.sqrt(
np.square( first_perp_x*(track[0][2] - track[-1][2]) ) + np.square(first_perp_x*(track[0][2] - track[-1][2])) +
np.square( first_perp_y*(track[0][3] - track[-1][3]) )) np.square(first_perp_y*(track[0][3] - track[-1][3])))
if well_glued_together > TRACK_DETAIL_STEP: if well_glued_together > TRACK_DETAIL_STEP:
return False return False
@@ -284,11 +285,13 @@ class CarRacing(gym.Env, EzPickle):
self.road.append(t) self.road.append(t)
if border[i]: if border[i]:
side = np.sign(beta2 - beta1) side = np.sign(beta2 - beta1)
b1_l = (x1 + side* TRACK_WIDTH *math.cos(beta1), y1 + side* TRACK_WIDTH *math.sin(beta1)) b1_l = (x1 + side * TRACK_WIDTH * math.cos(beta1), y1 + side * TRACK_WIDTH * math.sin(beta1))
b1_r = (x1 + side*(TRACK_WIDTH+BORDER)*math.cos(beta1), y1 + side*(TRACK_WIDTH+BORDER)*math.sin(beta1)) b1_r = (x1 + side * (TRACK_WIDTH+BORDER) * math.cos(beta1),
b2_l = (x2 + side* TRACK_WIDTH *math.cos(beta2), y2 + side* TRACK_WIDTH *math.sin(beta2)) y1 + side * (TRACK_WIDTH+BORDER)*math.sin(beta1))
b2_r = (x2 + side*(TRACK_WIDTH+BORDER)*math.cos(beta2), y2 + side*(TRACK_WIDTH+BORDER)*math.sin(beta2)) b2_l = (x2 + side * TRACK_WIDTH * math.cos(beta2), y2 + side * TRACK_WIDTH * math.sin(beta2))
self.road_poly.append(( [b1_l, b1_r, b2_r, b2_l], (1,1,1) if i%2==0 else (1,0,0) )) b2_r = (x2 + side * (TRACK_WIDTH+BORDER) * math.cos(beta2),
y2 + side * (TRACK_WIDTH+BORDER) * math.sin(beta2))
self.road_poly.append(([b1_l, b1_r, b2_r, b2_l], (1, 1, 1) if i % 2 == 0 else (1, 0, 0)))
self.track = track self.track = track
return True return True
@@ -305,7 +308,7 @@ class CarRacing(gym.Env, EzPickle):
if success: if success:
break break
if self.verbose == 1: if self.verbose == 1:
print("retry to generate track (normal if there are not many of this messages)") print("retry to generate track (normal if there are not many instances of this message)")
self.car = Car(self.world, *self.track[0][1:4]) self.car = Car(self.world, *self.track[0][1:4])
return self.step(None)[0] return self.step(None)[0]
@@ -331,7 +334,7 @@ class CarRacing(gym.Env, EzPickle):
self.car.fuel_spent = 0.0 self.car.fuel_spent = 0.0
step_reward = self.reward - self.prev_reward step_reward = self.reward - self.prev_reward
self.prev_reward = self.reward self.prev_reward = self.reward
if self.tile_visited_count==len(self.track): if self.tile_visited_count == len(self.track):
done = True done = True
x, y = self.car.hull.position x, y = self.car.hull.position
if abs(x) > PLAYFIELD or abs(y) > PLAYFIELD: if abs(x) > PLAYFIELD or abs(y) > PLAYFIELD:
@@ -353,8 +356,6 @@ class CarRacing(gym.Env, EzPickle):
if "t" not in self.__dict__: return # reset() not called yet if "t" not in self.__dict__: return # reset() not called yet
zoom = 0.1*SCALE*max(1-self.t, 0) + ZOOM*SCALE*min(self.t, 1) # Animate zoom first second zoom = 0.1*SCALE*max(1-self.t, 0) + ZOOM*SCALE*min(self.t, 1) # Animate zoom first second
zoom_state = ZOOM*SCALE*STATE_W/WINDOW_W
zoom_video = ZOOM*SCALE*VIDEO_W/WINDOW_W
scroll_x = self.car.hull.position[0] scroll_x = self.car.hull.position[0]
scroll_y = self.car.hull.position[1] scroll_y = self.car.hull.position[1]
angle = -self.car.hull.angle angle = -self.car.hull.angle
@@ -364,10 +365,10 @@ class CarRacing(gym.Env, EzPickle):
self.transform.set_scale(zoom, zoom) self.transform.set_scale(zoom, zoom)
self.transform.set_translation( self.transform.set_translation(
WINDOW_W/2 - (scroll_x*zoom*math.cos(angle) - scroll_y*zoom*math.sin(angle)), WINDOW_W/2 - (scroll_x*zoom*math.cos(angle) - scroll_y*zoom*math.sin(angle)),
WINDOW_H/4 - (scroll_x*zoom*math.sin(angle) + scroll_y*zoom*math.cos(angle)) ) WINDOW_H/4 - (scroll_x*zoom*math.sin(angle) + scroll_y*zoom*math.cos(angle)))
self.transform.set_rotation(angle) self.transform.set_rotation(angle)
self.car.draw(self.viewer, mode!="state_pixels") self.car.draw(self.viewer, mode != "state_pixels")
arr = None arr = None
win = self.viewer.window win = self.viewer.window
@@ -376,7 +377,7 @@ class CarRacing(gym.Env, EzPickle):
win.clear() win.clear()
t = self.transform t = self.transform
if mode=='rgb_array': if mode == 'rgb_array':
VP_W = VIDEO_W VP_W = VIDEO_W
VP_H = VIDEO_H VP_H = VIDEO_H
elif mode == 'state_pixels': elif mode == 'state_pixels':
@@ -439,17 +440,19 @@ class CarRacing(gym.Env, EzPickle):
gl.glBegin(gl.GL_QUADS) gl.glBegin(gl.GL_QUADS)
s = W/40.0 s = W/40.0
h = H/40.0 h = H/40.0
gl.glColor4f(0,0,0,1) gl.glColor4f(0, 0, 0, 1)
gl.glVertex3f(W, 0, 0) gl.glVertex3f(W, 0, 0)
gl.glVertex3f(W, 5*h, 0) gl.glVertex3f(W, 5*h, 0)
gl.glVertex3f(0, 5*h, 0) gl.glVertex3f(0, 5*h, 0)
gl.glVertex3f(0, 0, 0) gl.glVertex3f(0, 0, 0)
def vertical_ind(place, val, color): def vertical_ind(place, val, color):
gl.glColor4f(color[0], color[1], color[2], 1) gl.glColor4f(color[0], color[1], color[2], 1)
gl.glVertex3f((place+0)*s, h + h*val, 0) gl.glVertex3f((place+0)*s, h + h*val, 0)
gl.glVertex3f((place+1)*s, h + h*val, 0) gl.glVertex3f((place+1)*s, h + h*val, 0)
gl.glVertex3f((place+1)*s, h, 0) gl.glVertex3f((place+1)*s, h, 0)
gl.glVertex3f((place+0)*s, h, 0) gl.glVertex3f((place+0)*s, h, 0)
def horiz_ind(place, val, color): def horiz_ind(place, val, color):
gl.glColor4f(color[0], color[1], color[2], 1) gl.glColor4f(color[0], color[1], color[2], 1)
gl.glVertex3f((place+0)*s, 4*h , 0) gl.glVertex3f((place+0)*s, 4*h , 0)
@@ -457,13 +460,13 @@ class CarRacing(gym.Env, EzPickle):
gl.glVertex3f((place+val)*s, 2*h, 0) gl.glVertex3f((place+val)*s, 2*h, 0)
gl.glVertex3f((place+0)*s, 2*h, 0) gl.glVertex3f((place+0)*s, 2*h, 0)
true_speed = np.sqrt(np.square(self.car.hull.linearVelocity[0]) + np.square(self.car.hull.linearVelocity[1])) true_speed = np.sqrt(np.square(self.car.hull.linearVelocity[0]) + np.square(self.car.hull.linearVelocity[1]))
vertical_ind(5, 0.02*true_speed, (1,1,1)) vertical_ind(5, 0.02*true_speed, (1, 1, 1))
vertical_ind(7, 0.01*self.car.wheels[0].omega, (0.0,0,1)) # ABS sensors vertical_ind(7, 0.01*self.car.wheels[0].omega, (0.0, 0, 1)) # ABS sensors
vertical_ind(8, 0.01*self.car.wheels[1].omega, (0.0,0,1)) vertical_ind(8, 0.01*self.car.wheels[1].omega, (0.0, 0, 1))
vertical_ind(9, 0.01*self.car.wheels[2].omega, (0.2,0,1)) vertical_ind(9, 0.01*self.car.wheels[2].omega, (0.2, 0, 1))
vertical_ind(10,0.01*self.car.wheels[3].omega, (0.2,0,1)) vertical_ind(10,0.01*self.car.wheels[3].omega, (0.2, 0, 1))
horiz_ind(20, -10.0*self.car.wheels[0].joint.angle, (0,1,0)) horiz_ind(20, -10.0*self.car.wheels[0].joint.angle, (0, 1, 0))
horiz_ind(30, -0.8*self.car.hull.angularVelocity, (1,0,0)) horiz_ind(30, -0.8*self.car.hull.angularVelocity, (1, 0, 0))
gl.glEnd() gl.glEnd()
self.score_label.text = "%04i" % self.reward self.score_label.text = "%04i" % self.reward
self.score_label.draw() self.score_label.draw()
@@ -471,19 +474,21 @@ class CarRacing(gym.Env, EzPickle):
if __name__=="__main__": if __name__=="__main__":
from pyglet.window import key from pyglet.window import key
a = np.array( [0.0, 0.0, 0.0] ) a = np.array([0.0, 0.0, 0.0])
def key_press(k, mod): def key_press(k, mod):
global restart global restart
if k==0xff0d: restart = True if k == 0xff0d: restart = True
if k==key.LEFT: a[0] = -1.0 if k == key.LEFT: a[0] = -1.0
if k==key.RIGHT: a[0] = +1.0 if k == key.RIGHT: a[0] = +1.0
if k==key.UP: a[1] = +1.0 if k == key.UP: a[1] = +1.0
if k==key.DOWN: a[2] = +0.8 # set 1.0 for wheels to block to zero rotation if k == key.DOWN: a[2] = +0.8 # set 1.0 for wheels to block to zero rotation
def key_release(k, mod): def key_release(k, mod):
if k==key.LEFT and a[0]==-1.0: a[0] = 0 if k == key.LEFT and a[0] == -1.0: a[0] = 0
if k==key.RIGHT and a[0]==+1.0: a[0] = 0 if k == key.RIGHT and a[0] == +1.0: a[0] = 0
if k==key.UP: a[1] = 0 if k == key.UP: a[1] = 0
if k==key.DOWN: a[2] = 0 if k == key.DOWN: a[2] = 0
env = CarRacing() env = CarRacing()
env.render() env.render()
env.viewer.window.on_key_press = key_press env.viewer.window.on_key_press = key_press
@@ -504,9 +509,6 @@ if __name__=="__main__":
if steps % 200 == 0 or done: if steps % 200 == 0 or done:
print("\naction " + str(["{:+0.2f}".format(x) for x in a])) print("\naction " + str(["{:+0.2f}".format(x) for x in a]))
print("step {} total_reward {:+0.2f}".format(steps, total_reward)) print("step {} total_reward {:+0.2f}".format(steps, total_reward))
#import matplotlib.pyplot as plt
#plt.imshow(s)
#plt.savefig("test.jpeg")
steps += 1 steps += 1
isopen = env.render() isopen = env.render()
if done or restart or isopen == False: if done or restart or isopen == False:
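For completeness, a hedged rollout sketch for this environment; it assumes Box2D and pyglet are installed and that a display is available, since the state buffer is rendered:

import numpy as np
from gym.envs.box2d.car_racing import CarRacing

env = CarRacing()
obs = env.reset()                       # 96x96x3 uint8 state, as documented above
action = np.array([0.0, 0.2, 0.0])      # [steer, gas, brake], matching the Box action space
for _ in range(10):
    obs, reward, done, info = env.step(action)
print(obs.shape, reward)
env.close()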

View File

@@ -1,3 +1,31 @@
"""
Rocket trajectory optimization is a classic topic in Optimal Control.
According to Pontryagin's maximum principle it's optimal to fire engine full throttle or
turn it off. That's the reason this environment is OK to have discreet actions (engine on or off).
The landing pad is always at coordinates (0,0). The coordinates are the first two numbers in the state vector.
Reward for moving from the top of the screen to the landing pad and zero speed is about 100..140 points.
If the lander moves away from the landing pad it loses reward. The episode finishes if the lander crashes or
comes to rest, receiving an additional -100 or +100 points. Each leg with ground contact is +10 points.
Firing the main engine is -0.3 points each frame. Firing the side engine is -0.03 points each frame.
Solved is 200 points.
Landing outside the landing pad is possible. Fuel is infinite, so an agent can learn to fly and then land
on its first attempt. Please see the source code for details.
To see a heuristic landing, run:
python gym/envs/box2d/lunar_lander.py
To play yourself, run:
python examples/agents/keyboard_agent.py LunarLander-v2
Created by Oleg Klimov. Licensed on the same terms as the rest of OpenAI Gym.
"""
import sys, math import sys, math
import numpy as np import numpy as np
@@ -8,41 +36,17 @@ import gym
from gym import spaces from gym import spaces
from gym.utils import seeding, EzPickle from gym.utils import seeding, EzPickle
# Rocket trajectory optimization is a classic topic in Optimal Control. FPS = 50
# SCALE = 30.0 # affects how fast-paced the game is, forces should be adjusted as well
# According to Pontryagin's maximum principle it's optimal to fire engine full throttle or
# turn it off. That's the reason this environment is OK to have discreet actions (engine on or off).
#
# Landing pad is always at coordinates (0,0). Coordinates are the first two numbers in state vector.
# Reward for moving from the top of the screen to landing pad and zero speed is about 100..140 points.
# If lander moves away from landing pad it loses reward back. Episode finishes if the lander crashes or
# comes to rest, receiving additional -100 or +100 points. Each leg ground contact is +10. Firing main
# engine is -0.3 points each frame. Firing side engine is -0.03 points each frame. Solved is 200 points.
#
# Landing outside landing pad is possible. Fuel is infinite, so an agent can learn to fly and then land
# on its first attempt. Please see source code for details.
#
# To see heuristic landing, run:
#
# python gym/envs/box2d/lunar_lander.py
#
# To play yourself, run:
#
# python examples/agents/keyboard_agent.py LunarLander-v2
#
# Created by Oleg Klimov. Licensed on the same terms as the rest of OpenAI Gym.
FPS = 50 MAIN_ENGINE_POWER = 13.0
SCALE = 30.0 # affects how fast-paced the game is, forces should be adjusted as well SIDE_ENGINE_POWER = 0.6
MAIN_ENGINE_POWER = 13.0
SIDE_ENGINE_POWER = 0.6
INITIAL_RANDOM = 1000.0 # Set 1500 to make game harder INITIAL_RANDOM = 1000.0 # Set 1500 to make game harder
LANDER_POLY =[ LANDER_POLY =[
(-14,+17), (-17,0), (-17,-10), (-14, +17), (-17, 0), (-17 ,-10),
(+17,-10), (+17,0), (+14,+17) (+17, -10), (+17, 0), (+14, +17)
] ]
LEG_AWAY = 20 LEG_AWAY = 20
LEG_DOWN = 18 LEG_DOWN = 18
@@ -50,26 +54,30 @@ LEG_W, LEG_H = 2, 8
LEG_SPRING_TORQUE = 40 LEG_SPRING_TORQUE = 40
SIDE_ENGINE_HEIGHT = 14.0 SIDE_ENGINE_HEIGHT = 14.0
SIDE_ENGINE_AWAY = 12.0 SIDE_ENGINE_AWAY = 12.0
VIEWPORT_W = 600 VIEWPORT_W = 600
VIEWPORT_H = 400 VIEWPORT_H = 400
class ContactDetector(contactListener): class ContactDetector(contactListener):
def __init__(self, env): def __init__(self, env):
contactListener.__init__(self) contactListener.__init__(self)
self.env = env self.env = env
def BeginContact(self, contact): def BeginContact(self, contact):
if self.env.lander==contact.fixtureA.body or self.env.lander==contact.fixtureB.body: if self.env.lander == contact.fixtureA.body or self.env.lander == contact.fixtureB.body:
self.env.game_over = True self.env.game_over = True
for i in range(2): for i in range(2):
if self.env.legs[i] in [contact.fixtureA.body, contact.fixtureB.body]: if self.env.legs[i] in [contact.fixtureA.body, contact.fixtureB.body]:
self.env.legs[i].ground_contact = True self.env.legs[i].ground_contact = True
def EndContact(self, contact): def EndContact(self, contact):
for i in range(2): for i in range(2):
if self.env.legs[i] in [contact.fixtureA.body, contact.fixtureB.body]: if self.env.legs[i] in [contact.fixtureA.body, contact.fixtureB.body]:
self.env.legs[i].ground_contact = False self.env.legs[i].ground_contact = False
class LunarLander(gym.Env, EzPickle): class LunarLander(gym.Env, EzPickle):
metadata = { metadata = {
'render.modes': ['human', 'rgb_array'], 'render.modes': ['human', 'rgb_array'],
@@ -131,11 +139,11 @@ class LunarLander(gym.Env, EzPickle):
# terrain # terrain
CHUNKS = 11 CHUNKS = 11
height = self.np_random.uniform(0, H/2, size=(CHUNKS+1,) ) height = self.np_random.uniform(0, H/2, size=(CHUNKS+1,))
chunk_x = [W/(CHUNKS-1)*i for i in range(CHUNKS)] chunk_x = [W/(CHUNKS-1)*i for i in range(CHUNKS)]
self.helipad_x1 = chunk_x[CHUNKS//2-1] self.helipad_x1 = chunk_x[CHUNKS//2-1]
self.helipad_x2 = chunk_x[CHUNKS//2+1] self.helipad_x2 = chunk_x[CHUNKS//2+1]
self.helipad_y = H/4 self.helipad_y = H/4
height[CHUNKS//2-2] = self.helipad_y height[CHUNKS//2-2] = self.helipad_y
height[CHUNKS//2-1] = self.helipad_y height[CHUNKS//2-1] = self.helipad_y
height[CHUNKS//2+0] = self.helipad_y height[CHUNKS//2+0] = self.helipad_y
@@ -143,45 +151,45 @@ class LunarLander(gym.Env, EzPickle):
height[CHUNKS//2+2] = self.helipad_y height[CHUNKS//2+2] = self.helipad_y
smooth_y = [0.33*(height[i-1] + height[i+0] + height[i+1]) for i in range(CHUNKS)] smooth_y = [0.33*(height[i-1] + height[i+0] + height[i+1]) for i in range(CHUNKS)]
self.moon = self.world.CreateStaticBody( shapes=edgeShape(vertices=[(0, 0), (W, 0)]) ) self.moon = self.world.CreateStaticBody(shapes=edgeShape(vertices=[(0, 0), (W, 0)]))
self.sky_polys = [] self.sky_polys = []
for i in range(CHUNKS-1): for i in range(CHUNKS-1):
p1 = (chunk_x[i], smooth_y[i]) p1 = (chunk_x[i], smooth_y[i])
p2 = (chunk_x[i+1], smooth_y[i+1]) p2 = (chunk_x[i+1], smooth_y[i+1])
self.moon.CreateEdgeFixture( self.moon.CreateEdgeFixture(
vertices=[p1,p2], vertices=[p1,p2],
density=0, density=0,
friction=0.1) friction=0.1)
self.sky_polys.append( [p1, p2, (p2[0],H), (p1[0],H)] ) self.sky_polys.append([p1, p2, (p2[0], H), (p1[0], H)])
self.moon.color1 = (0.0,0.0,0.0) self.moon.color1 = (0.0, 0.0, 0.0)
self.moon.color2 = (0.0,0.0,0.0) self.moon.color2 = (0.0, 0.0, 0.0)
initial_y = VIEWPORT_H/SCALE initial_y = VIEWPORT_H/SCALE
self.lander = self.world.CreateDynamicBody( self.lander = self.world.CreateDynamicBody(
position = (VIEWPORT_W/SCALE/2, initial_y), position=(VIEWPORT_W/SCALE/2, initial_y),
angle=0.0, angle=0.0,
fixtures = fixtureDef( fixtures = fixtureDef(
shape=polygonShape(vertices=[ (x/SCALE,y/SCALE) for x,y in LANDER_POLY ]), shape=polygonShape(vertices=[(x/SCALE, y/SCALE) for x, y in LANDER_POLY]),
density=5.0, density=5.0,
friction=0.1, friction=0.1,
categoryBits=0x0010, categoryBits=0x0010,
maskBits=0x001, # collide only with ground maskBits=0x001, # collide only with ground
restitution=0.0) # 0.99 bouncy restitution=0.0) # 0.99 bouncy
) )
self.lander.color1 = (0.5,0.4,0.9) self.lander.color1 = (0.5, 0.4, 0.9)
self.lander.color2 = (0.3,0.3,0.5) self.lander.color2 = (0.3, 0.3, 0.5)
self.lander.ApplyForceToCenter( ( self.lander.ApplyForceToCenter( (
self.np_random.uniform(-INITIAL_RANDOM, INITIAL_RANDOM), self.np_random.uniform(-INITIAL_RANDOM, INITIAL_RANDOM),
self.np_random.uniform(-INITIAL_RANDOM, INITIAL_RANDOM) self.np_random.uniform(-INITIAL_RANDOM, INITIAL_RANDOM)
), True) ), True)
self.legs = [] self.legs = []
for i in [-1,+1]: for i in [-1, +1]:
leg = self.world.CreateDynamicBody( leg = self.world.CreateDynamicBody(
position = (VIEWPORT_W/SCALE/2 - i*LEG_AWAY/SCALE, initial_y), position=(VIEWPORT_W/SCALE/2 - i*LEG_AWAY/SCALE, initial_y),
angle = (i*0.05), angle=(i * 0.05),
fixtures = fixtureDef( fixtures=fixtureDef(
shape=polygonShape(box=(LEG_W/SCALE, LEG_H/SCALE)), shape=polygonShape(box=(LEG_W/SCALE, LEG_H/SCALE)),
density=1.0, density=1.0,
restitution=0.0, restitution=0.0,
@@ -189,20 +197,20 @@ class LunarLander(gym.Env, EzPickle):
maskBits=0x001) maskBits=0x001)
) )
leg.ground_contact = False leg.ground_contact = False
leg.color1 = (0.5,0.4,0.9) leg.color1 = (0.5, 0.4, 0.9)
leg.color2 = (0.3,0.3,0.5) leg.color2 = (0.3, 0.3, 0.5)
rjd = revoluteJointDef( rjd = revoluteJointDef(
bodyA=self.lander, bodyA=self.lander,
bodyB=leg, bodyB=leg,
localAnchorA=(0, 0), localAnchorA=(0, 0),
localAnchorB=(i*LEG_AWAY/SCALE, LEG_DOWN/SCALE), localAnchorB=(i * LEG_AWAY/SCALE, LEG_DOWN/SCALE),
enableMotor=True, enableMotor=True,
enableLimit=True, enableLimit=True,
maxMotorTorque=LEG_SPRING_TORQUE, maxMotorTorque=LEG_SPRING_TORQUE,
motorSpeed=+0.3*i # low enough not to jump back into the sky motorSpeed=+0.3 * i # low enough not to jump back into the sky
) )
if i==-1: if i == -1:
rjd.lowerAngle = +0.9 - 0.5 # Yes, the most esoteric numbers here, angles legs have freedom to travel within rjd.lowerAngle = +0.9 - 0.5 # The most esoteric numbers here, angled legs have freedom to travel within
rjd.upperAngle = +0.9 rjd.upperAngle = +0.9
else: else:
rjd.lowerAngle = -0.9 rjd.lowerAngle = -0.9
@@ -212,14 +220,14 @@ class LunarLander(gym.Env, EzPickle):
self.drawlist = [self.lander] + self.legs self.drawlist = [self.lander] + self.legs
return self.step(np.array([0,0]) if self.continuous else 0)[0] return self.step(np.array([0, 0]) if self.continuous else 0)[0]
def _create_particle(self, mass, x, y, ttl): def _create_particle(self, mass, x, y, ttl):
p = self.world.CreateDynamicBody( p = self.world.CreateDynamicBody(
position = (x,y), position = (x, y),
angle=0.0, angle=0.0,
fixtures = fixtureDef( fixtures = fixtureDef(
shape=circleShape(radius=2/SCALE, pos=(0,0)), shape=circleShape(radius=2/SCALE, pos=(0, 0)),
density=mass, density=mass,
friction=0.1, friction=0.1,
categoryBits=0x0100, categoryBits=0x0100,
@@ -232,7 +240,7 @@ class LunarLander(gym.Env, EzPickle):
return p return p
def _clean_particles(self, all): def _clean_particles(self, all):
while self.particles and (all or self.particles[0].ttl<0): while self.particles and (all or self.particles[0].ttl < 0):
self.world.DestroyBody(self.particles.pop(0)) self.world.DestroyBody(self.particles.pop(0))
def step(self, action): def step(self, action):
@@ -243,40 +251,53 @@ class LunarLander(gym.Env, EzPickle):
# Engines # Engines
tip = (math.sin(self.lander.angle), math.cos(self.lander.angle)) tip = (math.sin(self.lander.angle), math.cos(self.lander.angle))
side = (-tip[1], tip[0]); side = (-tip[1], tip[0])
dispersion = [self.np_random.uniform(-1.0, +1.0) / SCALE for _ in range(2)] dispersion = [self.np_random.uniform(-1.0, +1.0) / SCALE for _ in range(2)]
m_power = 0.0 m_power = 0.0
if (self.continuous and action[0] > 0.0) or (not self.continuous and action==2): if (self.continuous and action[0] > 0.0) or (not self.continuous and action == 2):
# Main engine # Main engine
if self.continuous: if self.continuous:
m_power = (np.clip(action[0], 0.0,1.0) + 1.0)*0.5 # 0.5..1.0 m_power = (np.clip(action[0], 0.0,1.0) + 1.0)*0.5 # 0.5..1.0
assert m_power>=0.5 and m_power <= 1.0 assert m_power >= 0.5 and m_power <= 1.0
else: else:
m_power = 1.0 m_power = 1.0
ox = tip[0]*(4/SCALE + 2*dispersion[0]) + side[0]*dispersion[1] # 4 is move a bit downwards, +-2 for randomness ox = (tip[0] * (4/SCALE + 2 * dispersion[0]) +
oy = -tip[1]*(4/SCALE + 2*dispersion[0]) - side[1]*dispersion[1] side[0] * dispersion[1]) # 4 is move a bit downwards, +-2 for randomness
oy = -tip[1] * (4/SCALE + 2 * dispersion[0]) - side[1] * dispersion[1]
impulse_pos = (self.lander.position[0] + ox, self.lander.position[1] + oy) impulse_pos = (self.lander.position[0] + ox, self.lander.position[1] + oy)
p = self._create_particle(3.5, impulse_pos[0], impulse_pos[1], m_power) # particles are just a decoration, 3.5 is here to make particle speed adequate p = self._create_particle(3.5, # 3.5 is here to make particle speed adequate
p.ApplyLinearImpulse( ( ox*MAIN_ENGINE_POWER*m_power, oy*MAIN_ENGINE_POWER*m_power), impulse_pos, True) impulse_pos[0],
self.lander.ApplyLinearImpulse( (-ox*MAIN_ENGINE_POWER*m_power, -oy*MAIN_ENGINE_POWER*m_power), impulse_pos, True) impulse_pos[1],
m_power) # particles are just a decoration
p.ApplyLinearImpulse((ox * MAIN_ENGINE_POWER * m_power, oy * MAIN_ENGINE_POWER * m_power),
impulse_pos,
True)
self.lander.ApplyLinearImpulse((-ox * MAIN_ENGINE_POWER * m_power, -oy * MAIN_ENGINE_POWER * m_power),
impulse_pos,
True)
s_power = 0.0 s_power = 0.0
if (self.continuous and np.abs(action[1]) > 0.5) or (not self.continuous and action in [1,3]): if (self.continuous and np.abs(action[1]) > 0.5) or (not self.continuous and action in [1, 3]):
# Orientation engines # Orientation engines
if self.continuous: if self.continuous:
direction = np.sign(action[1]) direction = np.sign(action[1])
s_power = np.clip(np.abs(action[1]), 0.5,1.0) s_power = np.clip(np.abs(action[1]), 0.5, 1.0)
assert s_power>=0.5 and s_power <= 1.0 assert s_power >= 0.5 and s_power <= 1.0
else: else:
direction = action-2 direction = action-2
s_power = 1.0 s_power = 1.0
ox = tip[0]*dispersion[0] + side[0]*(3*dispersion[1]+direction*SIDE_ENGINE_AWAY/SCALE) ox = tip[0] * dispersion[0] + side[0] * (3 * dispersion[1] + direction * SIDE_ENGINE_AWAY/SCALE)
oy = -tip[1]*dispersion[0] - side[1]*(3*dispersion[1]+direction*SIDE_ENGINE_AWAY/SCALE) oy = -tip[1] * dispersion[0] - side[1] * (3 * dispersion[1] + direction * SIDE_ENGINE_AWAY/SCALE)
impulse_pos = (self.lander.position[0] + ox - tip[0]*17/SCALE, self.lander.position[1] + oy + tip[1]*SIDE_ENGINE_HEIGHT/SCALE) impulse_pos = (self.lander.position[0] + ox - tip[0] * 17/SCALE,
self.lander.position[1] + oy + tip[1] * SIDE_ENGINE_HEIGHT/SCALE)
p = self._create_particle(0.7, impulse_pos[0], impulse_pos[1], s_power) p = self._create_particle(0.7, impulse_pos[0], impulse_pos[1], s_power)
p.ApplyLinearImpulse( ( ox*SIDE_ENGINE_POWER*s_power, oy*SIDE_ENGINE_POWER*s_power), impulse_pos, True) p.ApplyLinearImpulse((ox * SIDE_ENGINE_POWER * s_power, oy * SIDE_ENGINE_POWER * s_power),
self.lander.ApplyLinearImpulse( (-ox*SIDE_ENGINE_POWER*s_power, -oy*SIDE_ENGINE_POWER*s_power), impulse_pos, True) impulse_pos
, True)
self.lander.ApplyLinearImpulse((-ox * SIDE_ENGINE_POWER * s_power, -oy * SIDE_ENGINE_POWER * s_power),
impulse_pos,
True)
self.world.Step(1.0/FPS, 6*30, 2*30) self.world.Step(1.0/FPS, 6*30, 2*30)
@@ -292,27 +313,27 @@ class LunarLander(gym.Env, EzPickle):
1.0 if self.legs[0].ground_contact else 0.0, 1.0 if self.legs[0].ground_contact else 0.0,
1.0 if self.legs[1].ground_contact else 0.0 1.0 if self.legs[1].ground_contact else 0.0
] ]
assert len(state)==8 assert len(state) == 8
reward = 0 reward = 0
shaping = \ shaping = \
- 100*np.sqrt(state[0]*state[0] + state[1]*state[1]) \ - 100*np.sqrt(state[0]*state[0] + state[1]*state[1]) \
- 100*np.sqrt(state[2]*state[2] + state[3]*state[3]) \ - 100*np.sqrt(state[2]*state[2] + state[3]*state[3]) \
- 100*abs(state[4]) + 10*state[6] + 10*state[7] # And ten points for legs contact, the idea is if you - 100*abs(state[4]) + 10*state[6] + 10*state[7] # And ten points for legs contact, the idea is if you
# lose contact again after landing, you get negative reward # lose contact again after landing, you get negative reward
if self.prev_shaping is not None: if self.prev_shaping is not None:
reward = shaping - self.prev_shaping reward = shaping - self.prev_shaping
self.prev_shaping = shaping self.prev_shaping = shaping
reward -= m_power*0.30 # less fuel spent is better, about -30 for heurisic landing reward -= m_power*0.30 # less fuel spent is better, about -30 for heuristic landing
reward -= s_power*0.03 reward -= s_power*0.03
done = False done = False
if self.game_over or abs(state[0]) >= 1.0: if self.game_over or abs(state[0]) >= 1.0:
done = True done = True
reward = -100 reward = -100
if not self.lander.awake: if not self.lander.awake:
done = True done = True
reward = +100 reward = +100
return np.array(state, dtype=np.float32), reward, done, {} return np.array(state, dtype=np.float32), reward, done, {}
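The shaping term above is potential-based: the per-step reward is the change in shaping minus the fuel costs. A hedged numeric illustration (the two example states are arbitrary):

import numpy as np

def shaping(s):
    # Same expression as in step(): distance, speed and tilt penalties plus leg-contact bonuses.
    return (-100 * np.sqrt(s[0]**2 + s[1]**2)
            - 100 * np.sqrt(s[2]**2 + s[3]**2)
            - 100 * abs(s[4]) + 10 * s[6] + 10 * s[7])

prev = shaping([0.40, 0.90, 0.0, -0.40, 0.05, 0.0, 0.0, 0.0])
curr = shaping([0.38, 0.85, 0.0, -0.40, 0.03, 0.0, 0.0, 0.0])
m_power, s_power = 1.0, 0.0                            # main engine fired, side engines idle
print(curr - prev - 0.30 * m_power - 0.03 * s_power)   # per-step reward before any terminal bonus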
@@ -324,13 +345,13 @@ class LunarLander(gym.Env, EzPickle):
for obj in self.particles: for obj in self.particles:
obj.ttl -= 0.15 obj.ttl -= 0.15
obj.color1 = (max(0.2,0.2+obj.ttl), max(0.2,0.5*obj.ttl), max(0.2,0.5*obj.ttl)) obj.color1 = (max(0.2, 0.2+obj.ttl), max(0.2, 0.5*obj.ttl), max(0.2, 0.5*obj.ttl))
obj.color2 = (max(0.2,0.2+obj.ttl), max(0.2,0.5*obj.ttl), max(0.2,0.5*obj.ttl)) obj.color2 = (max(0.2, 0.2+obj.ttl), max(0.2, 0.5*obj.ttl), max(0.2, 0.5*obj.ttl))
self._clean_particles(False) self._clean_particles(False)
for p in self.sky_polys: for p in self.sky_polys:
self.viewer.draw_polygon(p, color=(0,0,0)) self.viewer.draw_polygon(p, color=(0, 0, 0))
for obj in self.particles + self.drawlist: for obj in self.particles + self.drawlist:
for f in obj.fixtures: for f in obj.fixtures:
@@ -348,42 +369,56 @@ class LunarLander(gym.Env, EzPickle):
for x in [self.helipad_x1, self.helipad_x2]: for x in [self.helipad_x1, self.helipad_x2]:
flagy1 = self.helipad_y flagy1 = self.helipad_y
flagy2 = flagy1 + 50/SCALE flagy2 = flagy1 + 50/SCALE
self.viewer.draw_polyline( [(x, flagy1), (x, flagy2)], color=(1,1,1) ) self.viewer.draw_polyline([(x, flagy1), (x, flagy2)], color=(1, 1, 1))
self.viewer.draw_polygon( [(x, flagy2), (x, flagy2-10/SCALE), (x+25/SCALE, flagy2-5/SCALE)], color=(0.8,0.8,0) ) self.viewer.draw_polygon([(x, flagy2), (x, flagy2-10/SCALE), (x + 25/SCALE, flagy2 - 5/SCALE)],
color=(0.8, 0.8, 0))
return self.viewer.render(return_rgb_array = mode=='rgb_array') return self.viewer.render(return_rgb_array=mode == 'rgb_array')
def close(self): def close(self):
if self.viewer is not None: if self.viewer is not None:
self.viewer.close() self.viewer.close()
self.viewer = None self.viewer = None
class LunarLanderContinuous(LunarLander): class LunarLanderContinuous(LunarLander):
continuous = True continuous = True
 def heuristic(env, s):
-    # Heuristic for:
-    # 1. Testing.
-    # 2. Demonstration rollout.
-    angle_targ = s[0]*0.5 + s[2]*1.0         # angle should point towards center (s[0] is horizontal coordinate, s[2] hor speed)
-    if angle_targ > 0.4: angle_targ = 0.4    # more than 0.4 radians (22 degrees) is bad
+    """
+    The heuristic for
+    1. Testing
+    2. Demonstration rollout.
+
+    Args:
+        env: The environment
+        s (list): The state. Attributes:
+                  s[0] is the horizontal coordinate
+                  s[1] is the vertical coordinate
+                  s[2] is the horizontal speed
+                  s[3] is the vertical speed
+                  s[4] is the angle
+                  s[5] is the angular speed
+                  s[6] 1 if first leg has contact, else 0
+                  s[7] 1 if second leg has contact, else 0
+    returns:
+        a: The action computed by the heuristic, to be fed into the step function defined above to determine the next step and reward.
+    """
+    angle_targ = s[0]*0.5 + s[2]*1.0         # angle should point towards center
+    if angle_targ > 0.4: angle_targ = 0.4    # more than 0.4 radians (22 degrees) is bad
     if angle_targ < -0.4: angle_targ = -0.4
-    hover_targ = 0.55*np.abs(s[0])           # target y should be proporional to horizontal offset
+    hover_targ = 0.55*np.abs(s[0])           # target y should be proportional to horizontal offset
-    # PID controller: s[4] angle, s[5] angularSpeed
-    angle_todo = (angle_targ - s[4])*0.5 - (s[5])*1.0
-    #print("angle_targ=%0.2f, angle_todo=%0.2f" % (angle_targ, angle_todo))
-    # PID controller: s[1] vertical coordinate s[3] vertical speed
+    angle_todo = (angle_targ - s[4]) * 0.5 - (s[5])*1.0
     hover_todo = (hover_targ - s[1])*0.5 - (s[3])*0.5
-    #print("hover_targ=%0.2f, hover_todo=%0.2f" % (hover_targ, hover_todo))
     if s[6] or s[7]:  # legs have contact
         angle_todo = 0
         hover_todo = -(s[3])*0.5  # override to reduce fall speed, that's all we need after contact
     if env.continuous:
-        a = np.array( [hover_todo*20 - 1, -angle_todo*20] )
+        a = np.array([hover_todo*20 - 1, -angle_todo*20])
         a = np.clip(a, -1, +1)
     else:
         a = 0
@@ -416,5 +451,3 @@ def demo_heuristic_lander(env, seed=None, render=False):
if __name__ == '__main__': if __name__ == '__main__':
demo_heuristic_lander(LunarLander(), render=True) demo_heuristic_lander(LunarLander(), render=True)
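A quick way to exercise the heuristic documented above without opening a window; the score comment is a rough expectation, not a guarantee:

from gym.envs.box2d.lunar_lander import LunarLander, demo_heuristic_lander

# demo_heuristic_lander is the helper shown at the end of this file; it steps the
# environment with heuristic() until done and is assumed here to return the total reward.
total_reward = demo_heuristic_lander(LunarLander(), seed=0, render=False)
print(total_reward)   # usually in the vicinity of the 200-point "solved" threshold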

View File

@@ -214,13 +214,17 @@ class AcrobotEnv(core.Env):
     self.viewer = None
 def wrap(x, m, M):
-    """
-    :param x: a scalar
-    :param m: minimum possible value in range
-    :param M: maximum possible value in range
-    Wraps ``x`` so m <= x <= M; but unlike ``bound()`` which
+    """Wraps ``x`` so m <= x <= M; but unlike ``bound()`` which
     truncates, ``wrap()`` wraps x around the coordinate system defined by m,M.\n
     For example, m = -180, M = 180 (degrees), x = 360 --> returns 0.
+
+    Args:
+        x: a scalar
+        m: minimum possible value in range
+        M: maximum possible value in range
+
+    Returns:
+        x: a scalar, wrapped
     """
     diff = M - m
     while x > M:
@@ -230,10 +234,14 @@ def wrap(x, m, M):
     return x
 def bound(x, m, M=None):
-    """
-    :param x: scalar
-    Either have m as scalar, so bound(x,m,M) which returns m <= x <= M *OR*
+    """Either have m as scalar, so bound(x,m,M) which returns m <= x <= M *OR*
     have m as length 2 vector, bound(x,m, <IGNORED>) returns m[0] <= x <= m[1].
+
+    Args:
+        x: scalar
+
+    Returns:
+        x: scalar, bound between min (m) and Max (M)
     """
     if M is None:
         M = m[1]
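The docstring's degree example, run through the module's own helpers:

from gym.envs.classic_control.acrobot import wrap, bound

print(wrap(360, -180, 180))   # 0: wrapped around the [-180, 180) circle rather than truncated
print(bound(3.5, 0, 1))       # 1: truncated into [m, M]
print(bound(3.5, (0, 1)))     # 1: same call with m given as a length-2 vector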
@@ -248,17 +256,14 @@ def rk4(derivs, y0, t, *args, **kwargs):
     This is a toy implementation which may be useful if you find
     yourself stranded on a system w/o scipy. Otherwise use
     :func:`scipy.integrate`.
-    *y0*
-        initial state vector
-    *t*
-        sample times
-    *derivs*
-        returns the derivative of the system and has the
-        signature ``dy = derivs(yi, ti)``
-    *args*
-        additional arguments passed to the derivative function
-    *kwargs*
-        additional keyword arguments passed to the derivative function
+
+    Args:
+        derivs: the derivative of the system and has the signature ``dy = derivs(yi, ti)``
+        y0: initial state vector
+        t: sample times
+        args: additional arguments passed to the derivative function
+        kwargs: additional keyword arguments passed to the derivative function
+
     Example 1 ::
         ## 2D system
         def derivs6(x,t):
@@ -278,6 +283,9 @@ def rk4(derivs, y0, t, *args, **kwargs):
         yout = rk4(derivs, y0, t)
     If you have access to scipy, you should probably be using the
     scipy.integrate tools rather than this function.
+
+    Returns:
+        yout: Runge-Kutta approximation of the ODE
     """
     try:
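And a small, hedged check of rk4 on a one-dimensional ODE with a known solution (dx/dt = -x):

import numpy as np
from gym.envs.classic_control.acrobot import rk4

def derivs(x, t):
    return -x                        # analytic solution is x(t) = exp(-t)

t = np.linspace(0.0, 2.0, 201)       # sample times, as described in the docstring
yout = rk4(derivs, np.array([1.0]), t)
print(yout[-1])                      # ~0.1353, close to exp(-2)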

View File

@@ -13,17 +13,18 @@ import numpy as np
 class CartPoleEnv(gym.Env):
     """
     Description:
-        A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track. The pendulum starts upright, and the goal is to prevent it from falling over by increasing and reducing the cart's velocity.
+        A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track. The pendulum
+        starts upright, and the goal is to prevent it from falling over by increasing and reducing the cart's velocity.
     Source:
         This environment corresponds to the version of the cart-pole problem described by Barto, Sutton, and Anderson
     Observation:
         Type: Box(4)
         Num     Observation               Min             Max
         0       Cart Position             -4.8            4.8
         1       Cart Velocity             -Inf            Inf
         2       Pole Angle                -24 deg         24 deg
         3       Pole Velocity At Tip      -Inf            Inf
     Actions:
@@ -32,7 +33,9 @@ class CartPoleEnv(gym.Env):
         0     Push cart to the left
         1     Push cart to the right
-        Note: The amount the velocity that is reduced or increased is not fixed; it depends on the angle the pole is pointing. This is because the center of gravity of the pole increases the amount of energy needed to move the cart underneath it
+        Note: The amount the velocity that is reduced or increased is not fixed; it depends on the angle the pole is
+        pointing. This is because the center of gravity of the pole increases the amount of energy needed to move the
+        cart underneath it
     Reward:
         Reward is 1 for every step taken, including the termination step
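A minimal check of the spaces documented above, using the registered CartPole-v1 id:

import gym

env = gym.make("CartPole-v1")
print(env.observation_space)           # Box(4,) with the limits from the table above
print(env.action_space)                # Discrete(2): 0 pushes left, 1 pushes right
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())
print(reward)                          # 1.0 for every step, including the terminating one
env.close()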