From ee2c0243c088dcdb749030f8b3fb60b718652449 Mon Sep 17 00:00:00 2001
From: Oleg Klimov <omgtech@gmail.com>
Date: Thu, 25 Aug 2016 02:08:32 +0300
Subject: [PATCH] LunarLanderContinuous (#307)

* New LunarLanderContinuous, LunarLander-v2 remains exactly the same, no version bump.

* keyboard_agent.py works again.
---
 examples/agents/keyboard_agent.py |   4 +-
 gym/envs/__init__.py              |   7 ++
 gym/envs/box2d/__init__.py        |   1 +
 gym/envs/box2d/lunar_lander.py    | 133 +++++++++++++++++++-----------
 gym/scoreboard/__init__.py        |  21 +++++
 5 files changed, 114 insertions(+), 52 deletions(-)

diff --git a/examples/agents/keyboard_agent.py b/examples/agents/keyboard_agent.py
index aea5b8d79..d4e9d5cf1 100644
--- a/examples/agents/keyboard_agent.py
+++ b/examples/agents/keyboard_agent.py
@@ -22,13 +22,13 @@ def key_press(key, mod):
     global human_agent_action, human_wants_restart, human_sets_pause
     if key==0xff0d: human_wants_restart = True
     if key==32: human_sets_pause = not human_sets_pause
-    a = key - ord('0')
+    a = int( key - ord('0') )
     if a <= 0 or a >= ACTIONS: return
     human_agent_action = a
 
 def key_release(key, mod):
     global human_agent_action
-    a = key - ord('0')
+    a = int( key - ord('0') )
     if a <= 0 or a >= ACTIONS: return
     if human_agent_action == a:
         human_agent_action = 0
diff --git a/gym/envs/__init__.py b/gym/envs/__init__.py
index 210361fdf..4a491e6fc 100644
--- a/gym/envs/__init__.py
+++ b/gym/envs/__init__.py
@@ -100,6 +100,13 @@ register(
     reward_threshold=200,
 )
 
+register(
+    id='LunarLanderContinuous-v2',
+    entry_point='gym.envs.box2d:LunarLanderContinuous',
+    timestep_limit=1000,
+    reward_threshold=200,
+)
+
 register(
     id='BipedalWalker-v2',
     entry_point='gym.envs.box2d:BipedalWalker',
diff --git a/gym/envs/box2d/__init__.py b/gym/envs/box2d/__init__.py
index abcc18322..725f319ea 100644
--- a/gym/envs/box2d/__init__.py
+++ b/gym/envs/box2d/__init__.py
@@ -1,3 +1,4 @@
 from gym.envs.box2d.lunar_lander import LunarLander
+from gym.envs.box2d.lunar_lander import LunarLanderContinuous
 from gym.envs.box2d.bipedal_walker import BipedalWalker, BipedalWalkerHardcore
 from gym.envs.box2d.car_racing import CarRacing
diff --git a/gym/envs/box2d/lunar_lander.py b/gym/envs/box2d/lunar_lander.py
index 61ce6428a..4c4ee68b1 100644
--- a/gym/envs/box2d/lunar_lander.py
+++ b/gym/envs/box2d/lunar_lander.py
@@ -76,6 +76,8 @@ class LunarLander(gym.Env):
         'video.frames_per_second' : FPS
     }
 
+    continuous = False
+
     def __init__(self):
         self._seed()
         self.viewer = None
@@ -87,12 +89,18 @@ class LunarLander(gym.Env):
 
         self.prev_reward = None
 
-        # useful range is -1 .. +1
-        high = np.array([np.inf]*8)
-        # nop, fire left engine, main engine, right engine
-        self.action_space = spaces.Discrete(4)
+        high = np.array([np.inf]*8)  # useful range is -1 .. +1, but spikes can be higher
         self.observation_space = spaces.Box(-high, high)
-        
+
+        if self.continuous:
+            # Action is two floats [main engine, left-right engines].
+            # Main engine: -1..0 off, 0..+1 throttle from 50% to 100% power. Engine can't work with less than 50% power.
+            # Left-right:  -1.0..-0.5 fire left engine, +0.5..+1.0 fire right engine, -0.5..0.5 off
+            self.action_space = spaces.Box(-1, +1, (2,))
+        else:
+            # Nop, fire left engine, main engine, right engine
+            self.action_space = spaces.Discrete(4)
+
         self._reset()
 
     def _seed(self, seed=None):
@@ -203,9 +211,9 @@ class LunarLander(gym.Env):
 
         self.drawlist = [self.lander] + self.legs
 
-        return self._step(0)[0]
+        return self._step(np.array([0,0]) if self.continuous else 0)[0]
 
-    def _create_particle(self, mass, x, y):
+    def _create_particle(self, mass, x, y, ttl):
         p = self.world.CreateDynamicBody(
             position = (x,y),
             angle=0.0,
@@ -217,7 +225,7 @@ class LunarLander(gym.Env):
                 maskBits=0x001,  # collide only with ground
                 restitution=0.3)
                 )
-        p.ttl = 1
+        p.ttl = ttl
         self.particles.append(p)
         self._clean_particles(False)
         return p
@@ -233,22 +241,38 @@ class LunarLander(gym.Env):
         tip  = (math.sin(self.lander.angle), math.cos(self.lander.angle))
         side = (-tip[1], tip[0]);
         dispersion = [self.np_random.uniform(-1.0, +1.0) / SCALE for _ in range(2)]
-        if action==2: # Main engine
+
+        m_power = 0.0
+        if (self.continuous and action[0] > 0.0) or (not self.continuous and action==2):
+            # Main engine
+            if self.continuous:
+                m_power = (np.clip(action[0], 0.0,1.0) + 1.0)*0.5   # 0.5..1.0
+                assert m_power>=0.5 and m_power <= 1.0
+            else:
+                m_power = 1.0
             ox =  tip[0]*(4/SCALE + 2*dispersion[0]) + side[0]*dispersion[1]   # 4 is move a bit downwards, +-2 for randomness
             oy = -tip[1]*(4/SCALE + 2*dispersion[0]) - side[1]*dispersion[1]
             impulse_pos = (self.lander.position[0] + ox, self.lander.position[1] + oy)
-            p = self._create_particle(3.5, *impulse_pos)    # particles are just a decoration, 3.5 is here to make particle speed adequate
-            p.ApplyLinearImpulse(           ( ox*MAIN_ENGINE_POWER,  oy*MAIN_ENGINE_POWER), impulse_pos, True)
-            self.lander.ApplyLinearImpulse( (-ox*MAIN_ENGINE_POWER, -oy*MAIN_ENGINE_POWER), impulse_pos, True)
+            p = self._create_particle(3.5, impulse_pos[0], impulse_pos[1], m_power)    # particles are just a decoration, 3.5 is here to make particle speed adequate
+            p.ApplyLinearImpulse(           ( ox*MAIN_ENGINE_POWER*m_power,  oy*MAIN_ENGINE_POWER*m_power), impulse_pos, True)
+            self.lander.ApplyLinearImpulse( (-ox*MAIN_ENGINE_POWER*m_power, -oy*MAIN_ENGINE_POWER*m_power), impulse_pos, True)
 
-        if action==1 or action==3: # Orientation engines
-            direction = action-2
+        s_power = 0.0
+        if (self.continuous and np.abs(action[1]) > 0.5) or (not self.continuous and action in [1,3]):
+            # Orientation engines
+            if self.continuous:
+                direction = np.sign(action[1])
+                s_power = np.clip(np.abs(action[1]), 0.5,1.0)
+                assert s_power>=0.5 and s_power <= 1.0
+            else:
+                direction = action-2
+                s_power = 1.0
             ox =  tip[0]*dispersion[0] + side[0]*(3*dispersion[1]+direction*SIDE_ENGINE_AWAY/SCALE)
             oy = -tip[1]*dispersion[0] - side[1]*(3*dispersion[1]+direction*SIDE_ENGINE_AWAY/SCALE)
             impulse_pos = (self.lander.position[0] + ox - tip[0]*17/SCALE, self.lander.position[1] + oy + tip[1]*SIDE_ENGINE_HEIGHT/SCALE)
-            p = self._create_particle(0.7, *impulse_pos)
-            p.ApplyLinearImpulse(           ( ox*SIDE_ENGINE_POWER,  oy*SIDE_ENGINE_POWER), impulse_pos, True)
-            self.lander.ApplyLinearImpulse( (-ox*SIDE_ENGINE_POWER, -oy*SIDE_ENGINE_POWER), impulse_pos, True)
+            p = self._create_particle(0.7, impulse_pos[0], impulse_pos[1], s_power)
+            p.ApplyLinearImpulse(           ( ox*SIDE_ENGINE_POWER*s_power,  oy*SIDE_ENGINE_POWER*s_power), impulse_pos, True)
+            self.lander.ApplyLinearImpulse( (-ox*SIDE_ENGINE_POWER*s_power, -oy*SIDE_ENGINE_POWER*s_power), impulse_pos, True)
 
         self.world.Step(1.0/FPS, 6*30, 2*30)
 
@@ -276,10 +300,8 @@ class LunarLander(gym.Env):
             reward = shaping - self.prev_shaping
         self.prev_shaping = shaping
 
-        if action==2:       # main engine
-            reward -= 0.30  # less fuel spent is better, about -30 for heurisic landing
-        elif action != 0:
-            reward -= 0.03
+        reward -= m_power*0.30  # less fuel spent is better, about -30 for heuristic landing
+        reward -= s_power*0.03
 
         done = False
         if self.game_over or abs(state[0]) >= 1.0:
@@ -333,42 +355,53 @@ class LunarLander(gym.Env):
 
         return self.viewer.render(return_rgb_array = mode=='rgb_array')
 
+class LunarLanderContinuous(LunarLander):
+    continuous = True
+
+def heuristic(env, s):
+    # Heuristic for:
+    # 1. Testing. 
+    # 2. Demonstration rollout.
+    angle_targ = s[0]*0.5 + s[2]*1.0         # angle should point towards center (s[0] is horizontal coordinate, s[2] hor speed)
+    if angle_targ >  0.4: angle_targ =  0.4  # more than 0.4 radians (22 degrees) is bad
+    if angle_targ < -0.4: angle_targ = -0.4
+    hover_targ = 0.55*np.abs(s[0])           # target y should be proportional to horizontal offset
+
+    # PD controller (proportional + derivative, no integral term): s[4] angle, s[5] angularSpeed
+    angle_todo = (angle_targ - s[4])*0.5 - (s[5])*1.0
+    #print("angle_targ=%0.2f, angle_todo=%0.2f" % (angle_targ, angle_todo))
+
+    # PD controller (proportional + derivative, no integral term): s[1] vertical coordinate, s[3] vertical speed
+    hover_todo = (hover_targ - s[1])*0.5 - (s[3])*0.5
+    #print("hover_targ=%0.2f, hover_todo=%0.2f" % (hover_targ, hover_todo))
+
+    if s[6] or s[7]: # legs have contact
+        angle_todo = 0
+        hover_todo = -(s[3])*0.5  # override to reduce fall speed, that's all we need after contact
+
+    if env.continuous:
+        a = np.array( [hover_todo*20 - 1, -angle_todo*20] )
+        a = np.clip(a, -1, +1)
+    else:
+        a = 0
+        if hover_todo > np.abs(angle_todo) and hover_todo > 0.05: a = 2
+        elif angle_todo < -0.05: a = 3
+        elif angle_todo > +0.05: a = 1
+    return a
+
 if __name__=="__main__":
-    # Heuristic for testing.
-    env = LunarLander()
-    env.reset()
-    steps = 0
+    #env = LunarLander()
+    env = LunarLanderContinuous()
+    s = env.reset()
     total_reward = 0
-    a = 0
+    steps = 0
     while True:
+        a = heuristic(env, s)
         s, r, done, info = env.step(a)
+        env.render()
         total_reward += r
         if steps % 20 == 0 or done:
             print(["{:+0.2f}".format(x) for x in s])
             print("step {} total_reward {:+0.2f}".format(steps, total_reward))
         steps += 1
-
-        angle_targ = s[0]*0.5 + s[2]*1.0         # angle should point towards center (s[0] is horizontal coordinate, s[2] hor speed)
-        if angle_targ >  0.4: angle_targ =  0.4  # more than 0.4 radians (22 degrees) is bad
-        if angle_targ < -0.4: angle_targ = -0.4
-        hover_targ = 0.55*np.abs(s[0])           # target y should be proporional to horizontal offset
-
-        # PID controller: s[4] angle, s[5] angularSpeed
-        angle_todo = (angle_targ - s[4])*0.5 - (s[5])*1.0
-        #print("angle_targ=%0.2f, angle_todo=%0.2f" % (angle_targ, angle_todo))
-
-        # PID controller: s[1] vertical coordinate s[3] vertical speed
-        hover_todo = (hover_targ - s[1])*0.5 - (s[3])*0.5
-        #print("hover_targ=%0.2f, hover_todo=%0.2f" % (hover_targ, hover_todo))
-
-        if s[6] or s[7]: # legs have contact
-            angle_todo = 0
-            hover_todo = -(s[3])*0.5  # override to reduce fall speed, that's all we need after contact
-
-        a = 0
-        if hover_todo > np.abs(angle_todo) and hover_todo > 0.05: a = 2
-        elif angle_todo < -0.05: a = 3
-        elif angle_todo > +0.05: a = 1
-
-        env.render()
         if done: break
diff --git a/gym/scoreboard/__init__.py b/gym/scoreboard/__init__.py
index 0e52c5df7..e97afb786 100644
--- a/gym/scoreboard/__init__.py
+++ b/gym/scoreboard/__init__.py
@@ -303,6 +303,27 @@ comes to rest, receiving additional -100 or +100 points. Each leg ground contact
 engine is -0.3 points each frame. Solved is 200 points.
 Landing outside landing pad is possible. Fuel is infinite, so an agent can learn to fly and then land
 on its first attempt.
+Four discrete actions available: do nothing, fire left orientation engine, fire main engine, fire
+right orientation engine.
+""")
+
+add_task(
+    id='LunarLanderContinuous-v2',
+    group='box2d',
+    experimental=True,
+    contributor='olegklimov',
+    summary='Navigate a lander to its landing pad.',
+    description="""
+Landing pad is always at coordinates (0,0). Coordinates are the first two numbers in state vector.
+Reward for moving from the top of the screen to landing pad and zero speed is about 100..140 points.
+If lander moves away from landing pad it loses reward back. Episode finishes if the lander crashes or
+comes to rest, receiving additional -100 or +100 points. Each leg ground contact is +10. Firing main
+engine is -0.3 points each frame. Solved is 200 points.
+Landing outside landing pad is possible. Fuel is infinite, so an agent can learn to fly and then land
+on its first attempt.
+Action is a vector of two real values, each from -1 to +1. The first controls the main engine: -1..0
+is off, 0..+1 throttles from 50% to 100% power (the engine can't work with less than 50% power). The
+second fires the orientation engines: -1.0..-0.5 left engine, +0.5..+1.0 right engine, -0.5..0.5 off.
 """)
 
 add_task(