LunarLanderContinuous (#307)

* New LunarLanderContinuous, LunarLander-v2 remains exactly the same, no version bump. * keyboard_agent.py works again.
2025-08-20 05:52:03 +00:00 · 2016-08-25 02:08:32 +03:00
parent c97551e8e5
commit ee2c0243c0
5 changed files with 114 additions and 52 deletions
--- a/examples/agents/keyboard_agent.py
+++ b/examples/agents/keyboard_agent.py
@@ -22,13 +22,13 @@ def key_press(key, mod):
    global human_agent_action, human_wants_restart, human_sets_pause
    if key==0xff0d: human_wants_restart = True
    if key==32: human_sets_pause = not human_sets_pause
-    a = key - ord('0')
+    a = int( key - ord('0') )
    if a <= 0 or a >= ACTIONS: return
    human_agent_action = a

 def key_release(key, mod):
    global human_agent_action
-    a = key - ord('0')
+    a = int( key - ord('0') )
    if a <= 0 or a >= ACTIONS: return
    if human_agent_action == a:
        human_agent_action = 0
--- a/gym/envs/init.py
+++ b/gym/envs/init.py
@@ -100,6 +100,13 @@ register(
    reward_threshold=200,
 )

+register(
+    id='LunarLanderContinuous-v2',
+    entry_point='gym.envs.box2d:LunarLanderContinuous',
+    timestep_limit=1000,
+    reward_threshold=200,
+)
+
 register(
    id='BipedalWalker-v2',
    entry_point='gym.envs.box2d:BipedalWalker',
--- a/gym/envs/box2d/init.py
+++ b/gym/envs/box2d/init.py
@@ -1,3 +1,4 @@
 from gym.envs.box2d.lunar_lander import LunarLander
+from gym.envs.box2d.lunar_lander import LunarLanderContinuous
 from gym.envs.box2d.bipedal_walker import BipedalWalker, BipedalWalkerHardcore
 from gym.envs.box2d.car_racing import CarRacing
--- a/gym/envs/box2d/lunar_lander.py
+++ b/gym/envs/box2d/lunar_lander.py
@@ -76,6 +76,8 @@ class LunarLander(gym.Env):
        'video.frames_per_second' : FPS
    }

+    continuous = False
+
    def __init__(self):
        self._seed()
        self.viewer = None
@@ -87,12 +89,18 @@ class LunarLander(gym.Env):

        self.prev_reward = None

-        # useful range is -1 .. +1
-        high = np.array([np.inf]*8)
-        # nop, fire left engine, main engine, right engine
-        self.action_space = spaces.Discrete(4)
+        high = np.array([np.inf]*8)  # useful range is -1 .. +1, but spikes can be higher
        self.observation_space = spaces.Box(-high, high)
-        
+
+        if self.continuous:
+            # Action is two floats [main engine, left-right engines].
+            # Main engine: -1..0 off, 0..+1 throttle from 50% to 100% power. Engine can't work with less than 50% power.
+            # Left-right:  -1.0..-0.5 fire left engine, +0.5..+1.0 fire right engine, -0.5..0.5 off
+            self.action_space = spaces.Box(-1, +1, (2,))
+        else:
+            # Nop, fire left engine, main engine, right engine
+            self.action_space = spaces.Discrete(4)
+
        self._reset()

    def _seed(self, seed=None):
@@ -203,9 +211,9 @@ class LunarLander(gym.Env):

        self.drawlist = [self.lander] + self.legs

-        return self._step(0)[0]
+        return self._step(np.array([0,0]) if self.continuous else 0)[0]

-    def _create_particle(self, mass, x, y):
+    def _create_particle(self, mass, x, y, ttl):
        p = self.world.CreateDynamicBody(
            position = (x,y),
            angle=0.0,
@@ -217,7 +225,7 @@ class LunarLander(gym.Env):
                maskBits=0x001,  # collide only with ground
                restitution=0.3)
                )
-        p.ttl = 1
+        p.ttl = ttl
        self.particles.append(p)
        self._clean_particles(False)
        return p
@@ -233,22 +241,38 @@ class LunarLander(gym.Env):
        tip  = (math.sin(self.lander.angle), math.cos(self.lander.angle))
        side = (-tip[1], tip[0]);
        dispersion = [self.np_random.uniform(-1.0, +1.0) / SCALE for _ in range(2)]
-        if action==2: # Main engine
+
+        m_power = 0.0
+        if (self.continuous and action[0] > 0.0) or (not self.continuous and action==2):
+            # Main engine
+            if self.continuous:
+                m_power = (np.clip(action[0], 0.0,1.0) + 1.0)*0.5   # 0.5..1.0
+                assert m_power>=0.5 and m_power <= 1.0
+            else:
+                m_power = 1.0
            ox =  tip[0]*(4/SCALE + 2*dispersion[0]) + side[0]*dispersion[1]   # 4 is move a bit downwards, +-2 for randomness
            oy = -tip[1]*(4/SCALE + 2*dispersion[0]) - side[1]*dispersion[1]
            impulse_pos = (self.lander.position[0] + ox, self.lander.position[1] + oy)
-            p = self._create_particle(3.5, *impulse_pos)    # particles are just a decoration, 3.5 is here to make particle speed adequate
-            p.ApplyLinearImpulse(           ( ox*MAIN_ENGINE_POWER,  oy*MAIN_ENGINE_POWER), impulse_pos, True)
-            self.lander.ApplyLinearImpulse( (-ox*MAIN_ENGINE_POWER, -oy*MAIN_ENGINE_POWER), impulse_pos, True)
+            p = self._create_particle(3.5, impulse_pos[0], impulse_pos[1], m_power)    # particles are just a decoration, 3.5 is here to make particle speed adequate
+            p.ApplyLinearImpulse(           ( ox*MAIN_ENGINE_POWER*m_power,  oy*MAIN_ENGINE_POWER*m_power), impulse_pos, True)
+            self.lander.ApplyLinearImpulse( (-ox*MAIN_ENGINE_POWER*m_power, -oy*MAIN_ENGINE_POWER*m_power), impulse_pos, True)

-        if action==1 or action==3: # Orientation engines
-            direction = action-2
+        s_power = 0.0
+        if (self.continuous and np.abs(action[1]) > 0.5) or (not self.continuous and action in [1,3]):
+            # Orientation engines
+            if self.continuous:
+                direction = np.sign(action[1])
+                s_power = np.clip(np.abs(action[1]), 0.5,1.0)
+                assert s_power>=0.5 and s_power <= 1.0
+            else:
+                direction = action-2
+                s_power = 1.0
            ox =  tip[0]*dispersion[0] + side[0]*(3*dispersion[1]+direction*SIDE_ENGINE_AWAY/SCALE)
            oy = -tip[1]*dispersion[0] - side[1]*(3*dispersion[1]+direction*SIDE_ENGINE_AWAY/SCALE)
            impulse_pos = (self.lander.position[0] + ox - tip[0]*17/SCALE, self.lander.position[1] + oy + tip[1]*SIDE_ENGINE_HEIGHT/SCALE)
-            p = self._create_particle(0.7, *impulse_pos)
-            p.ApplyLinearImpulse(           ( ox*SIDE_ENGINE_POWER,  oy*SIDE_ENGINE_POWER), impulse_pos, True)
-            self.lander.ApplyLinearImpulse( (-ox*SIDE_ENGINE_POWER, -oy*SIDE_ENGINE_POWER), impulse_pos, True)
+            p = self._create_particle(0.7, impulse_pos[0], impulse_pos[1], s_power)
+            p.ApplyLinearImpulse(           ( ox*SIDE_ENGINE_POWER*s_power,  oy*SIDE_ENGINE_POWER*s_power), impulse_pos, True)
+            self.lander.ApplyLinearImpulse( (-ox*SIDE_ENGINE_POWER*s_power, -oy*SIDE_ENGINE_POWER*s_power), impulse_pos, True)

        self.world.Step(1.0/FPS, 6*30, 2*30)

@@ -276,10 +300,8 @@ class LunarLander(gym.Env):
            reward = shaping - self.prev_shaping
        self.prev_shaping = shaping

-        if action==2:       # main engine
-            reward -= 0.30  # less fuel spent is better, about -30 for heurisic landing
-        elif action != 0:
-            reward -= 0.03
+        reward -= m_power*0.30  # less fuel spent is better, about -30 for heuristic landing
+        reward -= s_power*0.03

        done = False
        if self.game_over or abs(state[0]) >= 1.0:
@@ -333,42 +355,53 @@ class LunarLander(gym.Env):

        return self.viewer.render(return_rgb_array = mode=='rgb_array')

+class LunarLanderContinuous(LunarLander):
+    continuous = True
+
+def heuristic(env, s):
+    # Heuristic for:
+    # 1. Testing. 
+    # 2. Demonstration rollout.
+    angle_targ = s[0]*0.5 + s[2]*1.0         # angle should point towards center (s[0] is horizontal coordinate, s[2] hor speed)
+    if angle_targ >  0.4: angle_targ =  0.4  # more than 0.4 radians (22 degrees) is bad
+    if angle_targ < -0.4: angle_targ = -0.4
+    hover_targ = 0.55*np.abs(s[0])           # target y should be proportional to horizontal offset
+
+    # PID controller: s[4] angle, s[5] angularSpeed
+    angle_todo = (angle_targ - s[4])*0.5 - (s[5])*1.0
+    #print("angle_targ=%0.2f, angle_todo=%0.2f" % (angle_targ, angle_todo))
+
+    # PID controller: s[1] vertical coordinate s[3] vertical speed
+    hover_todo = (hover_targ - s[1])*0.5 - (s[3])*0.5
+    #print("hover_targ=%0.2f, hover_todo=%0.2f" % (hover_targ, hover_todo))
+
+    if s[6] or s[7]: # legs have contact
+        angle_todo = 0
+        hover_todo = -(s[3])*0.5  # override to reduce fall speed, that's all we need after contact
+
+    if env.continuous:
+        a = np.array( [hover_todo*20 - 1, -angle_todo*20] )
+        a = np.clip(a, -1, +1)
+    else:
+        a = 0
+        if hover_todo > np.abs(angle_todo) and hover_todo > 0.05: a = 2
+        elif angle_todo < -0.05: a = 3
+        elif angle_todo > +0.05: a = 1
+    return a
+
 if __name__=="__main__":
-    # Heuristic for testing.
-    env = LunarLander()
-    env.reset()
-    steps = 0
+    #env = LunarLander()
+    env = LunarLanderContinuous()
+    s = env.reset()
    total_reward = 0
-    a = 0
+    steps = 0
    while True:
+        a = heuristic(env, s)
        s, r, done, info = env.step(a)
+        env.render()
        total_reward += r
        if steps % 20 == 0 or done:
            print(["{:+0.2f}".format(x) for x in s])
            print("step {} total_reward {:+0.2f}".format(steps, total_reward))
        steps += 1
-
-        angle_targ = s[0]*0.5 + s[2]*1.0         # angle should point towards center (s[0] is horizontal coordinate, s[2] hor speed)
-        if angle_targ >  0.4: angle_targ =  0.4  # more than 0.4 radians (22 degrees) is bad
-        if angle_targ < -0.4: angle_targ = -0.4
-        hover_targ = 0.55*np.abs(s[0])           # target y should be proporional to horizontal offset
-
-        # PID controller: s[4] angle, s[5] angularSpeed
-        angle_todo = (angle_targ - s[4])*0.5 - (s[5])*1.0
-        #print("angle_targ=%0.2f, angle_todo=%0.2f" % (angle_targ, angle_todo))
-
-        # PID controller: s[1] vertical coordinate s[3] vertical speed
-        hover_todo = (hover_targ - s[1])*0.5 - (s[3])*0.5
-        #print("hover_targ=%0.2f, hover_todo=%0.2f" % (hover_targ, hover_todo))
-
-        if s[6] or s[7]: # legs have contact
-            angle_todo = 0
-            hover_todo = -(s[3])*0.5  # override to reduce fall speed, that's all we need after contact
-
-        a = 0
-        if hover_todo > np.abs(angle_todo) and hover_todo > 0.05: a = 2
-        elif angle_todo < -0.05: a = 3
-        elif angle_todo > +0.05: a = 1
-
-        env.render()
        if done: break
--- a/gym/scoreboard/init.py
+++ b/gym/scoreboard/init.py
@@ -303,6 +303,27 @@ comes to rest, receiving additional -100 or +100 points. Each leg ground contact
 engine is -0.3 points each frame. Solved is 200 points.
 Landing outside landing pad is possible. Fuel is infinite, so an agent can learn to fly and then land
 on its first attempt.
+Four discrete actions available: do nothing, fire left orientation engine, fire main engine, fire
+right orientation engine.
+""")
+
+add_task(
+    id='LunarLanderContinuous-v2',
+    group='box2d',
+    experimental=True,
+    contributor='olegklimov',
+    summary='Navigate a lander to its landing pad.',
+    description="""
+Landing pad is always at coordinates (0,0). Coordinates are the first two numbers in state vector.
+Reward for moving from the top of the screen to landing pad and zero speed is about 100..140 points.
+If lander moves away from landing pad it loses reward back. Episode finishes if the lander crashes or
+comes to rest, receiving additional -100 or +100 points. Each leg ground contact is +10. Firing main
+engine is -0.3 points each frame. Solved is 200 points.
+Landing outside landing pad is possible. Fuel is infinite, so an agent can learn to fly and then land
+on its first attempt.
+Action is two real values vector from -1 to +1. First controls main engine, -1..0 off, 0..+1 throttle
+from 50% to 100% power. Engine can't work with less than 50% power. Second value -1.0..-0.5 fire left
+engine, +0.5..+1.0 fire right engine, -0.5..0.5 off.
 """)

 add_task(