From ee2c0243c088dcdb749030f8b3fb60b718652449 Mon Sep 17 00:00:00 2001 From: Oleg Klimov Date: Thu, 25 Aug 2016 02:08:32 +0300 Subject: [PATCH] LunarLanderContinuous (#307) * New LunarLanderContinuous, LunarLander-v2 remains exactly the same, no version bump. * keyboard_agent.py works again. --- examples/agents/keyboard_agent.py | 4 +- gym/envs/__init__.py | 7 ++ gym/envs/box2d/__init__.py | 1 + gym/envs/box2d/lunar_lander.py | 133 +++++++++++++++++++----------- gym/scoreboard/__init__.py | 21 +++++ 5 files changed, 114 insertions(+), 52 deletions(-) diff --git a/examples/agents/keyboard_agent.py b/examples/agents/keyboard_agent.py index aea5b8d79..d4e9d5cf1 100644 --- a/examples/agents/keyboard_agent.py +++ b/examples/agents/keyboard_agent.py @@ -22,13 +22,13 @@ def key_press(key, mod): global human_agent_action, human_wants_restart, human_sets_pause if key==0xff0d: human_wants_restart = True if key==32: human_sets_pause = not human_sets_pause - a = key - ord('0') + a = int( key - ord('0') ) if a <= 0 or a >= ACTIONS: return human_agent_action = a def key_release(key, mod): global human_agent_action - a = key - ord('0') + a = int( key - ord('0') ) if a <= 0 or a >= ACTIONS: return if human_agent_action == a: human_agent_action = 0 diff --git a/gym/envs/__init__.py b/gym/envs/__init__.py index 210361fdf..4a491e6fc 100644 --- a/gym/envs/__init__.py +++ b/gym/envs/__init__.py @@ -100,6 +100,13 @@ register( reward_threshold=200, ) +register( + id='LunarLanderContinuous-v2', + entry_point='gym.envs.box2d:LunarLanderContinuous', + timestep_limit=1000, + reward_threshold=200, +) + register( id='BipedalWalker-v2', entry_point='gym.envs.box2d:BipedalWalker', diff --git a/gym/envs/box2d/__init__.py b/gym/envs/box2d/__init__.py index abcc18322..725f319ea 100644 --- a/gym/envs/box2d/__init__.py +++ b/gym/envs/box2d/__init__.py @@ -1,3 +1,4 @@ from gym.envs.box2d.lunar_lander import LunarLander +from gym.envs.box2d.lunar_lander import LunarLanderContinuous from gym.envs.box2d.bipedal_walker import BipedalWalker, BipedalWalkerHardcore from gym.envs.box2d.car_racing import CarRacing diff --git a/gym/envs/box2d/lunar_lander.py b/gym/envs/box2d/lunar_lander.py index 61ce6428a..4c4ee68b1 100644 --- a/gym/envs/box2d/lunar_lander.py +++ b/gym/envs/box2d/lunar_lander.py @@ -76,6 +76,8 @@ class LunarLander(gym.Env): 'video.frames_per_second' : FPS } + continuous = False + def __init__(self): self._seed() self.viewer = None @@ -87,12 +89,18 @@ class LunarLander(gym.Env): self.prev_reward = None - # useful range is -1 .. +1 - high = np.array([np.inf]*8) - # nop, fire left engine, main engine, right engine - self.action_space = spaces.Discrete(4) + high = np.array([np.inf]*8) # useful range is -1 .. +1, but spikes can be higher self.observation_space = spaces.Box(-high, high) - + + if self.continuous: + # Action is two floats [main engine, left-right engines]. + # Main engine: -1..0 off, 0..+1 throttle from 50% to 100% power. Engine can't work with less than 50% power. + # Left-right: -1.0..-0.5 fire left engine, +0.5..+1.0 fire right engine, -0.5..0.5 off + self.action_space = spaces.Box(-1, +1, (2,)) + else: + # Nop, fire left engine, main engine, right engine + self.action_space = spaces.Discrete(4) + self._reset() def _seed(self, seed=None): @@ -203,9 +211,9 @@ class LunarLander(gym.Env): self.drawlist = [self.lander] + self.legs - return self._step(0)[0] + return self._step(np.array([0,0]) if self.continuous else 0)[0] - def _create_particle(self, mass, x, y): + def _create_particle(self, mass, x, y, ttl): p = self.world.CreateDynamicBody( position = (x,y), angle=0.0, @@ -217,7 +225,7 @@ class LunarLander(gym.Env): maskBits=0x001, # collide only with ground restitution=0.3) ) - p.ttl = 1 + p.ttl = ttl self.particles.append(p) self._clean_particles(False) return p @@ -233,22 +241,38 @@ class LunarLander(gym.Env): tip = (math.sin(self.lander.angle), math.cos(self.lander.angle)) side = (-tip[1], tip[0]); dispersion = [self.np_random.uniform(-1.0, +1.0) / SCALE for _ in range(2)] - if action==2: # Main engine + + m_power = 0.0 + if (self.continuous and action[0] > 0.0) or (not self.continuous and action==2): + # Main engine + if self.continuous: + m_power = (np.clip(action[0], 0.0,1.0) + 1.0)*0.5 # 0.5..1.0 + assert m_power>=0.5 and m_power <= 1.0 + else: + m_power = 1.0 ox = tip[0]*(4/SCALE + 2*dispersion[0]) + side[0]*dispersion[1] # 4 is move a bit downwards, +-2 for randomness oy = -tip[1]*(4/SCALE + 2*dispersion[0]) - side[1]*dispersion[1] impulse_pos = (self.lander.position[0] + ox, self.lander.position[1] + oy) - p = self._create_particle(3.5, *impulse_pos) # particles are just a decoration, 3.5 is here to make particle speed adequate - p.ApplyLinearImpulse( ( ox*MAIN_ENGINE_POWER, oy*MAIN_ENGINE_POWER), impulse_pos, True) - self.lander.ApplyLinearImpulse( (-ox*MAIN_ENGINE_POWER, -oy*MAIN_ENGINE_POWER), impulse_pos, True) + p = self._create_particle(3.5, impulse_pos[0], impulse_pos[1], m_power) # particles are just a decoration, 3.5 is here to make particle speed adequate + p.ApplyLinearImpulse( ( ox*MAIN_ENGINE_POWER*m_power, oy*MAIN_ENGINE_POWER*m_power), impulse_pos, True) + self.lander.ApplyLinearImpulse( (-ox*MAIN_ENGINE_POWER*m_power, -oy*MAIN_ENGINE_POWER*m_power), impulse_pos, True) - if action==1 or action==3: # Orientation engines - direction = action-2 + s_power = 0.0 + if (self.continuous and np.abs(action[1]) > 0.5) or (not self.continuous and action in [1,3]): + # Orientation engines + if self.continuous: + direction = np.sign(action[1]) + s_power = np.clip(np.abs(action[1]), 0.5,1.0) + assert s_power>=0.5 and s_power <= 1.0 + else: + direction = action-2 + s_power = 1.0 ox = tip[0]*dispersion[0] + side[0]*(3*dispersion[1]+direction*SIDE_ENGINE_AWAY/SCALE) oy = -tip[1]*dispersion[0] - side[1]*(3*dispersion[1]+direction*SIDE_ENGINE_AWAY/SCALE) impulse_pos = (self.lander.position[0] + ox - tip[0]*17/SCALE, self.lander.position[1] + oy + tip[1]*SIDE_ENGINE_HEIGHT/SCALE) - p = self._create_particle(0.7, *impulse_pos) - p.ApplyLinearImpulse( ( ox*SIDE_ENGINE_POWER, oy*SIDE_ENGINE_POWER), impulse_pos, True) - self.lander.ApplyLinearImpulse( (-ox*SIDE_ENGINE_POWER, -oy*SIDE_ENGINE_POWER), impulse_pos, True) + p = self._create_particle(0.7, impulse_pos[0], impulse_pos[1], s_power) + p.ApplyLinearImpulse( ( ox*SIDE_ENGINE_POWER*s_power, oy*SIDE_ENGINE_POWER*s_power), impulse_pos, True) + self.lander.ApplyLinearImpulse( (-ox*SIDE_ENGINE_POWER*s_power, -oy*SIDE_ENGINE_POWER*s_power), impulse_pos, True) self.world.Step(1.0/FPS, 6*30, 2*30) @@ -276,10 +300,8 @@ class LunarLander(gym.Env): reward = shaping - self.prev_shaping self.prev_shaping = shaping - if action==2: # main engine - reward -= 0.30 # less fuel spent is better, about -30 for heurisic landing - elif action != 0: - reward -= 0.03 + reward -= m_power*0.30 # less fuel spent is better, about -30 for heurisic landing + reward -= s_power*0.03 done = False if self.game_over or abs(state[0]) >= 1.0: @@ -333,42 +355,53 @@ class LunarLander(gym.Env): return self.viewer.render(return_rgb_array = mode=='rgb_array') +class LunarLanderContinuous(LunarLander): + continuous = True + +def heuristic(env, s): + # Heuristic for: + # 1. Testing. + # 2. Demonstration rollout. + angle_targ = s[0]*0.5 + s[2]*1.0 # angle should point towards center (s[0] is horizontal coordinate, s[2] hor speed) + if angle_targ > 0.4: angle_targ = 0.4 # more than 0.4 radians (22 degrees) is bad + if angle_targ < -0.4: angle_targ = -0.4 + hover_targ = 0.55*np.abs(s[0]) # target y should be proporional to horizontal offset + + # PID controller: s[4] angle, s[5] angularSpeed + angle_todo = (angle_targ - s[4])*0.5 - (s[5])*1.0 + #print("angle_targ=%0.2f, angle_todo=%0.2f" % (angle_targ, angle_todo)) + + # PID controller: s[1] vertical coordinate s[3] vertical speed + hover_todo = (hover_targ - s[1])*0.5 - (s[3])*0.5 + #print("hover_targ=%0.2f, hover_todo=%0.2f" % (hover_targ, hover_todo)) + + if s[6] or s[7]: # legs have contact + angle_todo = 0 + hover_todo = -(s[3])*0.5 # override to reduce fall speed, that's all we need after contact + + if env.continuous: + a = np.array( [hover_todo*20 - 1, -angle_todo*20] ) + a = np.clip(a, -1, +1) + else: + a = 0 + if hover_todo > np.abs(angle_todo) and hover_todo > 0.05: a = 2 + elif angle_todo < -0.05: a = 3 + elif angle_todo > +0.05: a = 1 + return a + if __name__=="__main__": - # Heuristic for testing. - env = LunarLander() - env.reset() - steps = 0 + #env = LunarLander() + env = LunarLanderContinuous() + s = env.reset() total_reward = 0 - a = 0 + steps = 0 while True: + a = heuristic(env, s) s, r, done, info = env.step(a) + env.render() total_reward += r if steps % 20 == 0 or done: print(["{:+0.2f}".format(x) for x in s]) print("step {} total_reward {:+0.2f}".format(steps, total_reward)) steps += 1 - - angle_targ = s[0]*0.5 + s[2]*1.0 # angle should point towards center (s[0] is horizontal coordinate, s[2] hor speed) - if angle_targ > 0.4: angle_targ = 0.4 # more than 0.4 radians (22 degrees) is bad - if angle_targ < -0.4: angle_targ = -0.4 - hover_targ = 0.55*np.abs(s[0]) # target y should be proporional to horizontal offset - - # PID controller: s[4] angle, s[5] angularSpeed - angle_todo = (angle_targ - s[4])*0.5 - (s[5])*1.0 - #print("angle_targ=%0.2f, angle_todo=%0.2f" % (angle_targ, angle_todo)) - - # PID controller: s[1] vertical coordinate s[3] vertical speed - hover_todo = (hover_targ - s[1])*0.5 - (s[3])*0.5 - #print("hover_targ=%0.2f, hover_todo=%0.2f" % (hover_targ, hover_todo)) - - if s[6] or s[7]: # legs have contact - angle_todo = 0 - hover_todo = -(s[3])*0.5 # override to reduce fall speed, that's all we need after contact - - a = 0 - if hover_todo > np.abs(angle_todo) and hover_todo > 0.05: a = 2 - elif angle_todo < -0.05: a = 3 - elif angle_todo > +0.05: a = 1 - - env.render() if done: break diff --git a/gym/scoreboard/__init__.py b/gym/scoreboard/__init__.py index 0e52c5df7..e97afb786 100644 --- a/gym/scoreboard/__init__.py +++ b/gym/scoreboard/__init__.py @@ -303,6 +303,27 @@ comes to rest, receiving additional -100 or +100 points. Each leg ground contact engine is -0.3 points each frame. Solved is 200 points. Landing outside landing pad is possible. Fuel is infinite, so an agent can learn to fly and then land on its first attempt. +Four discrete actions available: do nothing, fire left orientation engine, fire main engine, fire +right orientation engine. +""") + +add_task( + id='LunarLanderContinuous-v2', + group='box2d', + experimental=True, + contributor='olegklimov', + summary='Navigate a lander to its landing pad.', + description=""" +Landing pad is always at coordinates (0,0). Coordinates are the first two numbers in state vector. +Reward for moving from the top of the screen to landing pad and zero speed is about 100..140 points. +If lander moves away from landing pad it loses reward back. Episode finishes if the lander crashes or +comes to rest, receiving additional -100 or +100 points. Each leg ground contact is +10. Firing main +engine is -0.3 points each frame. Solved is 200 points. +Landing outside landing pad is possible. Fuel is infinite, so an agent can learn to fly and then land +on its first attempt. +Action is two real values vector from -1 to +1. First controls main engine, -1..0 off, 0..+1 throttle +from 50% to 100% power. Engine can't work with less than 50% power. Second value -1.0..-0.5 fire left +engine, +0.5..+1.0 fire right engine, -0.5..0.5 off. """) add_task(