LunarLanderContinuous (#307)

* New LunarLanderContinuous, LunarLander-v2 remains exactly the same, no version bump.

* keyboard_agent.py works again.
This commit is contained in:
Oleg Klimov
2016-08-25 02:08:32 +03:00
committed by Greg Brockman
parent c97551e8e5
commit ee2c0243c0
5 changed files with 114 additions and 52 deletions

View File

@@ -22,13 +22,13 @@ def key_press(key, mod):
global human_agent_action, human_wants_restart, human_sets_pause global human_agent_action, human_wants_restart, human_sets_pause
if key==0xff0d: human_wants_restart = True if key==0xff0d: human_wants_restart = True
if key==32: human_sets_pause = not human_sets_pause if key==32: human_sets_pause = not human_sets_pause
a = key - ord('0') a = int( key - ord('0') )
if a <= 0 or a >= ACTIONS: return if a <= 0 or a >= ACTIONS: return
human_agent_action = a human_agent_action = a
def key_release(key, mod): def key_release(key, mod):
global human_agent_action global human_agent_action
a = key - ord('0') a = int( key - ord('0') )
if a <= 0 or a >= ACTIONS: return if a <= 0 or a >= ACTIONS: return
if human_agent_action == a: if human_agent_action == a:
human_agent_action = 0 human_agent_action = 0

View File

@@ -100,6 +100,13 @@ register(
reward_threshold=200, reward_threshold=200,
) )
# Register the continuous-action lunar lander task, implemented by
# gym.envs.box2d:LunarLanderContinuous.
register(
    id='LunarLanderContinuous-v2',
    entry_point='gym.envs.box2d:LunarLanderContinuous',
    timestep_limit=1000,
    reward_threshold=200,
)
register( register(
id='BipedalWalker-v2', id='BipedalWalker-v2',
entry_point='gym.envs.box2d:BipedalWalker', entry_point='gym.envs.box2d:BipedalWalker',

View File

@@ -1,3 +1,4 @@
from gym.envs.box2d.lunar_lander import LunarLander from gym.envs.box2d.lunar_lander import LunarLander
from gym.envs.box2d.lunar_lander import LunarLanderContinuous
from gym.envs.box2d.bipedal_walker import BipedalWalker, BipedalWalkerHardcore from gym.envs.box2d.bipedal_walker import BipedalWalker, BipedalWalkerHardcore
from gym.envs.box2d.car_racing import CarRacing from gym.envs.box2d.car_racing import CarRacing

View File

@@ -76,6 +76,8 @@ class LunarLander(gym.Env):
'video.frames_per_second' : FPS 'video.frames_per_second' : FPS
} }
continuous = False
def __init__(self): def __init__(self):
self._seed() self._seed()
self.viewer = None self.viewer = None
@@ -87,12 +89,18 @@ class LunarLander(gym.Env):
self.prev_reward = None self.prev_reward = None
# useful range is -1 .. +1 high = np.array([np.inf]*8) # useful range is -1 .. +1, but spikes can be higher
high = np.array([np.inf]*8)
# nop, fire left engine, main engine, right engine
self.action_space = spaces.Discrete(4)
self.observation_space = spaces.Box(-high, high) self.observation_space = spaces.Box(-high, high)
if self.continuous:
# Action is two floats [main engine, left-right engines].
# Main engine: -1..0 off, 0..+1 throttle from 50% to 100% power. Engine can't work with less than 50% power.
# Left-right: -1.0..-0.5 fire left engine, +0.5..+1.0 fire right engine, -0.5..0.5 off
self.action_space = spaces.Box(-1, +1, (2,))
else:
# Nop, fire left engine, main engine, right engine
self.action_space = spaces.Discrete(4)
self._reset() self._reset()
def _seed(self, seed=None): def _seed(self, seed=None):
@@ -203,9 +211,9 @@ class LunarLander(gym.Env):
self.drawlist = [self.lander] + self.legs self.drawlist = [self.lander] + self.legs
return self._step(0)[0] return self._step(np.array([0,0]) if self.continuous else 0)[0]
def _create_particle(self, mass, x, y): def _create_particle(self, mass, x, y, ttl):
p = self.world.CreateDynamicBody( p = self.world.CreateDynamicBody(
position = (x,y), position = (x,y),
angle=0.0, angle=0.0,
@@ -217,7 +225,7 @@ class LunarLander(gym.Env):
maskBits=0x001, # collide only with ground maskBits=0x001, # collide only with ground
restitution=0.3) restitution=0.3)
) )
p.ttl = 1 p.ttl = ttl
self.particles.append(p) self.particles.append(p)
self._clean_particles(False) self._clean_particles(False)
return p return p
@@ -233,22 +241,38 @@ class LunarLander(gym.Env):
tip = (math.sin(self.lander.angle), math.cos(self.lander.angle)) tip = (math.sin(self.lander.angle), math.cos(self.lander.angle))
side = (-tip[1], tip[0]); side = (-tip[1], tip[0]);
dispersion = [self.np_random.uniform(-1.0, +1.0) / SCALE for _ in range(2)] dispersion = [self.np_random.uniform(-1.0, +1.0) / SCALE for _ in range(2)]
if action==2: # Main engine
m_power = 0.0
if (self.continuous and action[0] > 0.0) or (not self.continuous and action==2):
# Main engine
if self.continuous:
m_power = (np.clip(action[0], 0.0,1.0) + 1.0)*0.5 # 0.5..1.0
assert m_power>=0.5 and m_power <= 1.0
else:
m_power = 1.0
ox = tip[0]*(4/SCALE + 2*dispersion[0]) + side[0]*dispersion[1] # 4 is move a bit downwards, +-2 for randomness ox = tip[0]*(4/SCALE + 2*dispersion[0]) + side[0]*dispersion[1] # 4 is move a bit downwards, +-2 for randomness
oy = -tip[1]*(4/SCALE + 2*dispersion[0]) - side[1]*dispersion[1] oy = -tip[1]*(4/SCALE + 2*dispersion[0]) - side[1]*dispersion[1]
impulse_pos = (self.lander.position[0] + ox, self.lander.position[1] + oy) impulse_pos = (self.lander.position[0] + ox, self.lander.position[1] + oy)
p = self._create_particle(3.5, *impulse_pos) # particles are just a decoration, 3.5 is here to make particle speed adequate p = self._create_particle(3.5, impulse_pos[0], impulse_pos[1], m_power) # particles are just a decoration, 3.5 is here to make particle speed adequate
p.ApplyLinearImpulse( ( ox*MAIN_ENGINE_POWER, oy*MAIN_ENGINE_POWER), impulse_pos, True) p.ApplyLinearImpulse( ( ox*MAIN_ENGINE_POWER*m_power, oy*MAIN_ENGINE_POWER*m_power), impulse_pos, True)
self.lander.ApplyLinearImpulse( (-ox*MAIN_ENGINE_POWER, -oy*MAIN_ENGINE_POWER), impulse_pos, True) self.lander.ApplyLinearImpulse( (-ox*MAIN_ENGINE_POWER*m_power, -oy*MAIN_ENGINE_POWER*m_power), impulse_pos, True)
if action==1 or action==3: # Orientation engines s_power = 0.0
direction = action-2 if (self.continuous and np.abs(action[1]) > 0.5) or (not self.continuous and action in [1,3]):
# Orientation engines
if self.continuous:
direction = np.sign(action[1])
s_power = np.clip(np.abs(action[1]), 0.5,1.0)
assert s_power>=0.5 and s_power <= 1.0
else:
direction = action-2
s_power = 1.0
ox = tip[0]*dispersion[0] + side[0]*(3*dispersion[1]+direction*SIDE_ENGINE_AWAY/SCALE) ox = tip[0]*dispersion[0] + side[0]*(3*dispersion[1]+direction*SIDE_ENGINE_AWAY/SCALE)
oy = -tip[1]*dispersion[0] - side[1]*(3*dispersion[1]+direction*SIDE_ENGINE_AWAY/SCALE) oy = -tip[1]*dispersion[0] - side[1]*(3*dispersion[1]+direction*SIDE_ENGINE_AWAY/SCALE)
impulse_pos = (self.lander.position[0] + ox - tip[0]*17/SCALE, self.lander.position[1] + oy + tip[1]*SIDE_ENGINE_HEIGHT/SCALE) impulse_pos = (self.lander.position[0] + ox - tip[0]*17/SCALE, self.lander.position[1] + oy + tip[1]*SIDE_ENGINE_HEIGHT/SCALE)
p = self._create_particle(0.7, *impulse_pos) p = self._create_particle(0.7, impulse_pos[0], impulse_pos[1], s_power)
p.ApplyLinearImpulse( ( ox*SIDE_ENGINE_POWER, oy*SIDE_ENGINE_POWER), impulse_pos, True) p.ApplyLinearImpulse( ( ox*SIDE_ENGINE_POWER*s_power, oy*SIDE_ENGINE_POWER*s_power), impulse_pos, True)
self.lander.ApplyLinearImpulse( (-ox*SIDE_ENGINE_POWER, -oy*SIDE_ENGINE_POWER), impulse_pos, True) self.lander.ApplyLinearImpulse( (-ox*SIDE_ENGINE_POWER*s_power, -oy*SIDE_ENGINE_POWER*s_power), impulse_pos, True)
self.world.Step(1.0/FPS, 6*30, 2*30) self.world.Step(1.0/FPS, 6*30, 2*30)
@@ -276,10 +300,8 @@ class LunarLander(gym.Env):
reward = shaping - self.prev_shaping reward = shaping - self.prev_shaping
self.prev_shaping = shaping self.prev_shaping = shaping
if action==2: # main engine reward -= m_power*0.30 # less fuel spent is better, about -30 for heuristic landing
reward -= 0.30 # less fuel spent is better, about -30 for heuristic landing reward -= s_power*0.03
elif action != 0:
reward -= 0.03
done = False done = False
if self.game_over or abs(state[0]) >= 1.0: if self.game_over or abs(state[0]) >= 1.0:
@@ -333,42 +355,53 @@ class LunarLander(gym.Env):
return self.viewer.render(return_rgb_array = mode=='rgb_array') return self.viewer.render(return_rgb_array = mode=='rgb_array')
class LunarLanderContinuous(LunarLander):
    """LunarLander variant with a continuous action space.

    Setting ``continuous`` makes ``LunarLander.__init__`` build a two-float
    ``Box(-1, +1)`` action space ([main engine, left-right engines]) instead
    of ``Discrete(4)``.
    """
    continuous = True
def heuristic(env, s):
    """Pick an action from an observation using a hand-tuned controller.

    Intended for testing and for demonstration rollouts.

    Args:
        env: Object exposing a boolean ``continuous`` attribute that selects
            the action format returned.
        s: Indexable observation with at least 8 entries, read as:
            s[0] horizontal coordinate, s[1] vertical coordinate,
            s[2] horizontal speed, s[3] vertical speed, s[4] angle,
            s[5] angular speed, s[6] and s[7] leg-contact flags.

    Returns:
        If ``env.continuous`` is true, a numpy array of two floats
        ``[main engine, left-right engines]``, each clipped to [-1, +1].
        Otherwise an int action: 0 nop, 1 fire left engine, 2 main engine,
        3 fire right engine.
    """
    # Angle should point towards the centre; leaning more than 0.4 radians
    # (about 22 degrees) is bad, so the target is capped.
    angle_targ = np.clip(s[0]*0.5 + s[2]*1.0, -0.4, 0.4)
    # Target height is proportional to the horizontal offset.
    hover_targ = 0.55*np.abs(s[0])

    # Each correction is a proportional term on the error minus a damping
    # term on the corresponding speed.
    angle_todo = (angle_targ - s[4])*0.5 - s[5]*1.0
    hover_todo = (hover_targ - s[1])*0.5 - s[3]*0.5

    if s[6] or s[7]:
        # Legs have contact: only the fall speed still needs reducing.
        angle_todo = 0
        hover_todo = -s[3]*0.5

    if env.continuous:
        return np.clip(np.array([hover_todo*20 - 1, -angle_todo*20]), -1, +1)

    # Discrete case: main engine wins when the height correction dominates,
    # otherwise fire a side engine once the angle error leaves the dead band.
    if hover_todo > np.abs(angle_todo) and hover_todo > 0.05:
        return 2
    if angle_todo < -0.05:
        return 3
    if angle_todo > +0.05:
        return 1
    return 0
if __name__=="__main__": if __name__=="__main__":
# Heuristic for testing. #env = LunarLander()
env = LunarLander() env = LunarLanderContinuous()
env.reset() s = env.reset()
steps = 0
total_reward = 0 total_reward = 0
a = 0 steps = 0
while True: while True:
a = heuristic(env, s)
s, r, done, info = env.step(a) s, r, done, info = env.step(a)
env.render()
total_reward += r total_reward += r
if steps % 20 == 0 or done: if steps % 20 == 0 or done:
print(["{:+0.2f}".format(x) for x in s]) print(["{:+0.2f}".format(x) for x in s])
print("step {} total_reward {:+0.2f}".format(steps, total_reward)) print("step {} total_reward {:+0.2f}".format(steps, total_reward))
steps += 1 steps += 1
angle_targ = s[0]*0.5 + s[2]*1.0 # angle should point towards center (s[0] is horizontal coordinate, s[2] hor speed)
if angle_targ > 0.4: angle_targ = 0.4 # more than 0.4 radians (22 degrees) is bad
if angle_targ < -0.4: angle_targ = -0.4
hover_targ = 0.55*np.abs(s[0]) # target y should be proportional to horizontal offset
# PID controller: s[4] angle, s[5] angularSpeed
angle_todo = (angle_targ - s[4])*0.5 - (s[5])*1.0
#print("angle_targ=%0.2f, angle_todo=%0.2f" % (angle_targ, angle_todo))
# PID controller: s[1] vertical coordinate s[3] vertical speed
hover_todo = (hover_targ - s[1])*0.5 - (s[3])*0.5
#print("hover_targ=%0.2f, hover_todo=%0.2f" % (hover_targ, hover_todo))
if s[6] or s[7]: # legs have contact
angle_todo = 0
hover_todo = -(s[3])*0.5 # override to reduce fall speed, that's all we need after contact
a = 0
if hover_todo > np.abs(angle_todo) and hover_todo > 0.05: a = 2
elif angle_todo < -0.05: a = 3
elif angle_todo > +0.05: a = 1
env.render()
if done: break if done: break

View File

@@ -303,6 +303,27 @@ comes to rest, receiving additional -100 or +100 points. Each leg ground contact
engine is -0.3 points each frame. Solved is 200 points. engine is -0.3 points each frame. Solved is 200 points.
Landing outside landing pad is possible. Fuel is infinite, so an agent can learn to fly and then land Landing outside landing pad is possible. Fuel is infinite, so an agent can learn to fly and then land
on its first attempt. on its first attempt.
Four discrete actions available: do nothing, fire left orientation engine, fire main engine, fire
right orientation engine.
""")
add_task(
id='LunarLanderContinuous-v2',
group='box2d',
experimental=True,
contributor='olegklimov',
summary='Navigate a lander to its landing pad.',
description="""
Landing pad is always at coordinates (0,0). Coordinates are the first two numbers in state vector.
Reward for moving from the top of the screen to landing pad and zero speed is about 100..140 points.
If lander moves away from landing pad it loses reward back. Episode finishes if the lander crashes or
comes to rest, receiving additional -100 or +100 points. Each leg ground contact is +10. Firing main
engine is -0.3 points each frame. Solved is 200 points.
Landing outside landing pad is possible. Fuel is infinite, so an agent can learn to fly and then land
on its first attempt.
Action is two real values vector from -1 to +1. First controls main engine, -1..0 off, 0..+1 throttle
from 50% to 100% power. Engine can't work with less than 50% power. Second value -1.0..-0.5 fire left
engine, +0.5..+1.0 fire right engine, -0.5..0.5 off.
""") """)
add_task( add_task(