Mirror of https://github.com/Farama-Foundation/Gymnasium.git, synced 2025-08-20 05:52:03 +00:00

LunarLanderContinuous (#307)

* New LunarLanderContinuous environment. LunarLander-v2 remains exactly the same, so there is no version bump.
* keyboard_agent.py works again.

Committed by Greg Brockman
parent c97551e8e5
commit ee2c0243c0
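The change in one picture: the new class exposes a two-float Box action space while keeping the discrete LunarLander untouched. A minimal usage sketch, assuming only what this diff itself shows (the gym-era reset()/step() API returning (s, r, done, info), and the import added to gym/envs/box2d/__init__.py below):

    import numpy as np
    from gym.envs.box2d import LunarLanderContinuous

    env = LunarLanderContinuous()
    s = env.reset()
    # Action is two floats: [main engine, left-right engines].
    s, r, done, info = env.step(np.array([+1.0, 0.0]))  # full main throttle, side engines off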
examples/agents/keyboard_agent.py

@@ -22,13 +22,13 @@ def key_press(key, mod):
     global human_agent_action, human_wants_restart, human_sets_pause
     if key==0xff0d: human_wants_restart = True
     if key==32: human_sets_pause = not human_sets_pause
-    a = key - ord('0')
+    a = int( key - ord('0') )
     if a <= 0 or a >= ACTIONS: return
     human_agent_action = a
 
 def key_release(key, mod):
     global human_agent_action
-    a = key - ord('0')
+    a = int( key - ord('0') )
     if a <= 0 or a >= ACTIONS: return
     if human_agent_action == a:
         human_agent_action = 0
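The int() cast is the whole keyboard_agent.py fix; the surrounding bounds check does the filtering. A standalone sketch of the digit-key mapping (action_for_key is a hypothetical helper, not part of the file): ord('0') is 48, so the '2' key selects action 2, while out-of-range digits and non-digit keysyms such as Return (0xff0d) fall outside 1..ACTIONS-1 and are ignored.

    ACTIONS = 4  # e.g. LunarLander's four discrete actions

    def action_for_key(key):
        a = int(key - ord('0'))
        if a <= 0 or a >= ACTIONS:
            return None       # not a usable digit key for this env
        return a

    assert action_for_key(ord('2')) == 2
    assert action_for_key(ord('9')) is None   # only actions 1..3 exist here
    assert action_for_key(0xff0d) is None     # Return keysym, far out of range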
gym/envs/__init__.py

@@ -100,6 +100,13 @@ register(
     reward_threshold=200,
 )
 
+register(
+    id='LunarLanderContinuous-v2',
+    entry_point='gym.envs.box2d:LunarLanderContinuous',
+    timestep_limit=1000,
+    reward_threshold=200,
+)
+
 register(
     id='BipedalWalker-v2',
     entry_point='gym.envs.box2d:BipedalWalker',
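Once registered, the environment should also be constructible through the registry by id; a minimal sketch, assuming a gym install that includes this commit:

    import gym

    env = gym.make('LunarLanderContinuous-v2')
    print(env.action_space)       # expected: Box(2,) bounded to [-1, +1]
    print(env.observation_space)  # expected: Box(8,)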
gym/envs/box2d/__init__.py

@@ -1,3 +1,4 @@
 from gym.envs.box2d.lunar_lander import LunarLander
+from gym.envs.box2d.lunar_lander import LunarLanderContinuous
 from gym.envs.box2d.bipedal_walker import BipedalWalker, BipedalWalkerHardcore
 from gym.envs.box2d.car_racing import CarRacing
gym/envs/box2d/lunar_lander.py

@@ -76,6 +76,8 @@ class LunarLander(gym.Env):
         'video.frames_per_second' : FPS
     }
 
+    continuous = False
+
     def __init__(self):
         self._seed()
         self.viewer = None
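That single class attribute is the whole switch: LunarLanderContinuous, defined near the end of this diff, overrides it and inherits everything else. The pattern in miniature (stand-in classes, not the real ones):

    class Lander:                    # stand-in for the real gym.Env subclass
        continuous = False

    class LanderContinuous(Lander):  # flips behavior without touching __init__
        continuous = True

    assert Lander().continuous is False
    assert LanderContinuous().continuous is True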
@@ -87,12 +89,18 @@ class LunarLander(gym.Env):
 
         self.prev_reward = None
 
-        # useful range is -1 .. +1
-        high = np.array([np.inf]*8)
-        # nop, fire left engine, main engine, right engine
-        self.action_space = spaces.Discrete(4)
+        high = np.array([np.inf]*8)  # useful range is -1 .. +1, but spikes can be higher
         self.observation_space = spaces.Box(-high, high)
 
+        if self.continuous:
+            # Action is two floats [main engine, left-right engines].
+            # Main engine: -1..0 off, 0..+1 throttle from 50% to 100% power. Engine can't work with less than 50% power.
+            # Left-right: -1.0..-0.5 fire left engine, +0.5..+1.0 fire right engine, -0.5..0.5 off
+            self.action_space = spaces.Box(-1, +1, (2,))
+        else:
+            # Nop, fire left engine, main engine, right engine
+            self.action_space = spaces.Discrete(4)
+
         self._reset()
 
     def _seed(self, seed=None):
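A worked example of the main-engine encoding the new comments describe, mirroring the clipping _step performs further down (a sketch of just the mapping, not the class):

    import numpy as np

    def main_engine_power(a0):
        # action[0] in [-1, 1]: at or below 0 the engine is off,
        # above 0 it maps linearly onto 50%..100% power.
        if a0 <= 0.0:
            return 0.0
        return (np.clip(a0, 0.0, 1.0) + 1.0)*0.5

    assert main_engine_power(-0.3) == 0.0              # off
    assert abs(main_engine_power(0.5) - 0.75) < 1e-12  # mid-stick = 75% power
    assert main_engine_power(1.0) == 1.0               # full power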
@@ -203,9 +211,9 @@ class LunarLander(gym.Env):
 
         self.drawlist = [self.lander] + self.legs
 
-        return self._step(0)[0]
+        return self._step(np.array([0,0]) if self.continuous else 0)[0]
 
-    def _create_particle(self, mass, x, y):
+    def _create_particle(self, mass, x, y, ttl):
         p = self.world.CreateDynamicBody(
             position = (x,y),
             angle=0.0,
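The reset path now has to issue a no-op whose type matches the action space. Why np.array([0,0]) is the continuous no-op, given the firing conditions introduced in the _step hunk below:

    import numpy as np

    noop = np.array([0, 0])
    assert not (noop[0] > 0.0)          # main engine fires only for action[0] > 0
    assert not (np.abs(noop[1]) > 0.5)  # side engines fire only outside the +-0.5 dead zone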
@@ -217,7 +225,7 @@ class LunarLander(gym.Env):
                 maskBits=0x001,  # collide only with ground
                 restitution=0.3)
                 )
-        p.ttl = 1
+        p.ttl = ttl
         self.particles.append(p)
         self._clean_particles(False)
         return p
@@ -233,22 +241,38 @@ class LunarLander(gym.Env):
         tip = (math.sin(self.lander.angle), math.cos(self.lander.angle))
         side = (-tip[1], tip[0]);
         dispersion = [self.np_random.uniform(-1.0, +1.0) / SCALE for _ in range(2)]
-        if action==2: # Main engine
+        m_power = 0.0
+        if (self.continuous and action[0] > 0.0) or (not self.continuous and action==2):
+            # Main engine
+            if self.continuous:
+                m_power = (np.clip(action[0], 0.0,1.0) + 1.0)*0.5   # 0.5..1.0
+                assert m_power>=0.5 and m_power <= 1.0
+            else:
+                m_power = 1.0
             ox = tip[0]*(4/SCALE + 2*dispersion[0]) + side[0]*dispersion[1] # 4 is move a bit downwards, +-2 for randomness
             oy = -tip[1]*(4/SCALE + 2*dispersion[0]) - side[1]*dispersion[1]
             impulse_pos = (self.lander.position[0] + ox, self.lander.position[1] + oy)
-            p = self._create_particle(3.5, *impulse_pos) # particles are just a decoration, 3.5 is here to make particle speed adequate
-            p.ApplyLinearImpulse( ( ox*MAIN_ENGINE_POWER, oy*MAIN_ENGINE_POWER), impulse_pos, True)
-            self.lander.ApplyLinearImpulse( (-ox*MAIN_ENGINE_POWER, -oy*MAIN_ENGINE_POWER), impulse_pos, True)
+            p = self._create_particle(3.5, impulse_pos[0], impulse_pos[1], m_power) # particles are just a decoration, 3.5 is here to make particle speed adequate
+            p.ApplyLinearImpulse( ( ox*MAIN_ENGINE_POWER*m_power, oy*MAIN_ENGINE_POWER*m_power), impulse_pos, True)
+            self.lander.ApplyLinearImpulse( (-ox*MAIN_ENGINE_POWER*m_power, -oy*MAIN_ENGINE_POWER*m_power), impulse_pos, True)
 
-        if action==1 or action==3: # Orientation engines
-            direction = action-2
+        s_power = 0.0
+        if (self.continuous and np.abs(action[1]) > 0.5) or (not self.continuous and action in [1,3]):
+            # Orientation engines
+            if self.continuous:
+                direction = np.sign(action[1])
+                s_power = np.clip(np.abs(action[1]), 0.5,1.0)
+                assert s_power>=0.5 and s_power <= 1.0
+            else:
+                direction = action-2
+                s_power = 1.0
             ox = tip[0]*dispersion[0] + side[0]*(3*dispersion[1]+direction*SIDE_ENGINE_AWAY/SCALE)
             oy = -tip[1]*dispersion[0] - side[1]*(3*dispersion[1]+direction*SIDE_ENGINE_AWAY/SCALE)
             impulse_pos = (self.lander.position[0] + ox - tip[0]*17/SCALE, self.lander.position[1] + oy + tip[1]*SIDE_ENGINE_HEIGHT/SCALE)
-            p = self._create_particle(0.7, *impulse_pos)
-            p.ApplyLinearImpulse( ( ox*SIDE_ENGINE_POWER, oy*SIDE_ENGINE_POWER), impulse_pos, True)
-            self.lander.ApplyLinearImpulse( (-ox*SIDE_ENGINE_POWER, -oy*SIDE_ENGINE_POWER), impulse_pos, True)
+            p = self._create_particle(0.7, impulse_pos[0], impulse_pos[1], s_power)
+            p.ApplyLinearImpulse( ( ox*SIDE_ENGINE_POWER*s_power, oy*SIDE_ENGINE_POWER*s_power), impulse_pos, True)
+            self.lander.ApplyLinearImpulse( (-ox*SIDE_ENGINE_POWER*s_power, -oy*SIDE_ENGINE_POWER*s_power), impulse_pos, True)
 
         self.world.Step(1.0/FPS, 6*30, 2*30)
 
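The orientation-engine branch implements a banded encoding: a dead zone across [-0.5, +0.5], then power growing with |action[1]| up to 1.0. A sketch of just that mapping (side_engine is a hypothetical helper):

    import numpy as np

    def side_engine(a1):
        if np.abs(a1) <= 0.5:
            return 0.0, 0.0                      # dead zone: both side engines off
        direction = np.sign(a1)                  # -1 fires left engine, +1 fires right
        s_power = np.clip(np.abs(a1), 0.5, 1.0)  # 0.5..1.0
        return direction, s_power

    assert side_engine(0.2)  == (0.0, 0.0)
    assert side_engine(-0.8) == (-1.0, 0.8)
    assert side_engine(1.0)  == (1.0, 1.0)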
@@ -276,10 +300,8 @@ class LunarLander(gym.Env):
             reward = shaping - self.prev_shaping
         self.prev_shaping = shaping
 
-        if action==2: # main engine
-            reward -= 0.30 # less fuel spent is better, about -30 for heurisic landing
-        elif action != 0:
-            reward -= 0.03
+        reward -= m_power*0.30 # less fuel spent is better, about -30 for heurisic landing
+        reward -= s_power*0.03
 
         done = False
         if self.game_over or abs(state[0]) >= 1.0:
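Fuel cost now scales with the power actually used instead of a flat per-firing charge. The "-30 for heurisic landing" comment is consistent with roughly 100 frames of full main-engine burn; the frame count here is an assumption for illustration (FPS is 50 in this file):

    frames_burning = 100                 # assumed: ~2 seconds of burn at 50 FPS
    per_frame = 0.30                     # full-power main-engine cost, from above
    print(-per_frame * frames_burning)   # -30.0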
@@ -333,21 +355,13 @@ class LunarLander(gym.Env):
 
         return self.viewer.render(return_rgb_array = mode=='rgb_array')
 
-if __name__=="__main__":
-    # Heuristic for testing.
-    env = LunarLander()
-    env.reset()
-    steps = 0
-    total_reward = 0
-    a = 0
-    while True:
-        s, r, done, info = env.step(a)
-        total_reward += r
-        if steps % 20 == 0 or done:
-            print(["{:+0.2f}".format(x) for x in s])
-            print("step {} total_reward {:+0.2f}".format(steps, total_reward))
-        steps += 1
+class LunarLanderContinuous(LunarLander):
+    continuous = True
 
+def heuristic(env, s):
+    # Heuristic for:
+    # 1. Testing.
+    # 2. Demonstration rollout.
     angle_targ = s[0]*0.5 + s[2]*1.0 # angle should point towards center (s[0] is horizontal coordinate, s[2] hor speed)
     if angle_targ > 0.4: angle_targ = 0.4 # more than 0.4 radians (22 degrees) is bad
     if angle_targ < -0.4: angle_targ = -0.4
@@ -365,10 +379,29 @@ if __name__=="__main__":
         angle_todo = 0
         hover_todo = -(s[3])*0.5 # override to reduce fall speed, that's all we need after contact
 
+    if env.continuous:
+        a = np.array( [hover_todo*20 - 1, -angle_todo*20] )
+        a = np.clip(a, -1, +1)
+    else:
         a = 0
         if hover_todo > np.abs(angle_todo) and hover_todo > 0.05: a = 2
         elif angle_todo < -0.05: a = 3
         elif angle_todo > +0.05: a = 1
+    return a
+
+if __name__=="__main__":
+    #env = LunarLander()
+    env = LunarLanderContinuous()
+    s = env.reset()
+    total_reward = 0
+    steps = 0
+    while True:
+        a = heuristic(env, s)
+        s, r, done, info = env.step(a)
         env.render()
+        total_reward += r
+        if steps % 20 == 0 or done:
+            print(["{:+0.2f}".format(x) for x in s])
+            print("step {} total_reward {:+0.2f}".format(steps, total_reward))
+        steps += 1
         if done: break
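Worked numbers for the continuous branch of the heuristic above, showing how the PD-style targets turn into an action vector:

    import numpy as np

    hover_todo, angle_todo = 0.1, 0.02
    a = np.array([hover_todo*20 - 1, -angle_todo*20])   # [1.0, -0.4]
    a = np.clip(a, -1, +1)
    # a[0] = 1.0  -> main engine at full power
    # a[1] = -0.4 -> inside the +-0.5 dead zone, so side engines stay off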
gym/scoreboard/__init__.py

@@ -303,6 +303,27 @@ comes to rest, receiving additional -100 or +100 points. Each leg ground contact
 engine is -0.3 points each frame. Solved is 200 points.
 Landing outside landing pad is possible. Fuel is infinite, so an agent can learn to fly and then land
 on its first attempt.
+Four discrete actions available: do nothing, fire left orientation engine, fire main engine, fire
+right orientation engine.
+""")
+
+add_task(
+    id='LunarLanderContinuous-v2',
+    group='box2d',
+    experimental=True,
+    contributor='olegklimov',
+    summary='Navigate a lander to its landing pad.',
+    description="""
+Landing pad is always at coordinates (0,0). Coordinates are the first two numbers in state vector.
+Reward for moving from the top of the screen to landing pad and zero speed is about 100..140 points.
+If lander moves away from landing pad it loses reward back. Episode finishes if the lander crashes or
+comes to rest, receiving additional -100 or +100 points. Each leg ground contact is +10. Firing main
+engine is -0.3 points each frame. Solved is 200 points.
+Landing outside landing pad is possible. Fuel is infinite, so an agent can learn to fly and then land
+on its first attempt.
+Action is two real values vector from -1 to +1. First controls main engine, -1..0 off, 0..+1 throttle
+from 50% to 100% power. Engine can't work with less than 50% power. Second value -1.0..-0.5 fire left
+engine, +0.5..+1.0 fire right engine, -0.5..0.5 off.
 """)
 
 add_task(
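A rough budget for how a good episode clears the 200-point "solved" bar, using only figures quoted in the descriptions above (the 120-point shaping value is an assumed midpoint of the stated 100..140 range):

    shaping = 120    # top of screen to pad at zero speed: "about 100..140 points"
    legs    = 2*10   # each leg ground contact is +10
    rest    = 100    # comes to rest: +100
    fuel    = -30    # main engine burn: "about -30" per the code comment
    print(shaping + legs + rest + fuel)   # 210, comfortably over 200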