From e57f81beccb85ad462d7841bf308d30b87068a76 Mon Sep 17 00:00:00 2001
From: Tianhong Dai <tianhongdai914@gmail.com>
Date: Wed, 17 Oct 2018 00:22:06 +0100
Subject: [PATCH 1/6] revise the readme of ddpg (#653)

---
 baselines/ddpg/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/baselines/ddpg/README.md b/baselines/ddpg/README.md
index 6e936dd..ed6d23f 100755
--- a/baselines/ddpg/README.md
+++ b/baselines/ddpg/README.md
@@ -2,4 +2,4 @@
 
 - Original paper: https://arxiv.org/abs/1509.02971
 - Baselines post: https://blog.openai.com/better-exploration-with-parameter-noise/
-- `python -m baselines.ddpg.main` runs the algorithm for 1M frames = 10M timesteps on a Mujoco environment. See help (`-h`) for more options.
\ No newline at end of file
+- `python -m baselines.run --alg=ddpg --env=HalfCheetah-v2 --num_timesteps=1e6` runs the algorithm for 1M timesteps on a MuJoCo environment. See help (`-h`) for more options.

From a03dacd68d6fe451670f63d6d0cc204aeb72e5dd Mon Sep 17 00:00:00 2001
From: pzhokhov <peterzhokhoff@gmail.com>
Date: Tue, 16 Oct 2018 16:26:46 -0700
Subject: [PATCH 2/6] sync internal changes. Make ddpg work with vecenvs (#654)

* sync internal changes. Make ddpg work with vecenvs

* B -> nenvs for consistency with other algos, small cleanups

* eval_done[d]==True -> eval_done[d]

* flake8 and numpy.random.random_integers deprecation warning
---
 baselines/common/tests/test_identity.py |  2 +-
 baselines/common/tf_util.py             |  2 +-
 baselines/common/vec_env/__init__.py    |  1 +
 baselines/ddpg/ddpg.py                  | 73 ++++++++++++++++---------
 baselines/ddpg/ddpg_learner.py          | 13 +++--
 baselines/ddpg/memory.py                |  2 +-
 6 files changed, 59 insertions(+), 34 deletions(-)

diff --git a/baselines/common/tests/test_identity.py b/baselines/common/tests/test_identity.py
index 8624df1..744ed83 100644
--- a/baselines/common/tests/test_identity.py
+++ b/baselines/common/tests/test_identity.py
@@ -14,7 +14,7 @@ learn_kwargs = {
     'a2c' : {},
     'acktr': {},
     'deepq': {},
-    'ddpg': dict(nb_epochs=None, layer_norm=True),
+    'ddpg': dict(layer_norm=True),
     'ppo2': dict(lr=1e-3, nsteps=64, ent_coef=0.0),
     'trpo_mpi': dict(timesteps_per_batch=100, cg_iters=10, gamma=0.9, lam=1.0, max_kl=0.01)
 }
diff --git a/baselines/common/tf_util.py b/baselines/common/tf_util.py
index b293975..2699035 100644
--- a/baselines/common/tf_util.py
+++ b/baselines/common/tf_util.py
@@ -293,7 +293,7 @@ def display_var_info(vars):
         if "/Adam" in name or "beta1_power" in name or "beta2_power" in name: continue
         v_params = np.prod(v.shape.as_list())
         count_params += v_params
-        if "/b:" in name or "/biases" in name: continue    # Wx+b, bias is not interesting to look at => count params, but not print
+        if "/b:" in name or "/bias" in name: continue    # Wx+b, bias is not interesting to look at => count params, but not print
         logger.info("   %s%s %i params %s" % (name, " "*(55-len(name)), v_params, str(v.shape)))
 
     logger.info("Total model parameters: %0.2f million" % (count_params*1e-6))
diff --git a/baselines/common/vec_env/__init__.py b/baselines/common/vec_env/__init__.py
index b3cdca7..cb60531 100644
--- a/baselines/common/vec_env/__init__.py
+++ b/baselines/common/vec_env/__init__.py
@@ -104,6 +104,7 @@ class VecEnv(ABC):
         bigimg = tile_images(imgs)
         if mode == 'human':
             self.get_viewer().imshow(bigimg)
+            return self.get_viewer().isopen
         elif mode == 'rgb_array':
             return bigimg
         else:
diff --git a/baselines/ddpg/ddpg.py b/baselines/ddpg/ddpg.py
index b272aea..181f923 100755
--- a/baselines/ddpg/ddpg.py
+++ b/baselines/ddpg/ddpg.py
@@ -78,6 +78,7 @@ def learn(network, env,
 
     max_action = env.action_space.high
     logger.info('scaling actions by {} before executing in env'.format(max_action))
+
     agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
         gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations,
         batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
@@ -94,16 +95,21 @@ def learn(network, env,
     sess.graph.finalize()
 
     agent.reset()
+
     obs = env.reset()
     if eval_env is not None:
         eval_obs = eval_env.reset()
-    done = False
-    episode_reward = 0.
-    episode_step = 0
-    episodes = 0
-    t = 0
+    nenvs = obs.shape[0]
+
+    episode_reward = np.zeros(nenvs, dtype = np.float32) #vector
+    episode_step = np.zeros(nenvs, dtype = int) # vector
+    episodes = 0 #scalar
+    t = 0 # scalar
 
     epoch = 0
+
+
+
     start_time = time.time()
 
     epoch_episode_rewards = []
@@ -114,16 +120,22 @@ def learn(network, env,
     for epoch in range(nb_epochs):
         for cycle in range(nb_epoch_cycles):
             # Perform rollouts.
+            if nenvs > 1:
+                # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each
+                # of the environments, so resetting here instead
+                agent.reset()
             for t_rollout in range(nb_rollout_steps):
                 # Predict next action.
                 action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True)
-                assert action.shape == env.action_space.shape
 
                 # Execute next action.
                 if rank == 0 and render:
                     env.render()
-                assert max_action.shape == action.shape
+
+                # max_action is of dimension A, whereas action is of dimension (nenvs, A) - the multiplication is broadcast over the batch
                 new_obs, r, done, info = env.step(max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
+                # note these outputs are batched from vecenv
+
                 t += 1
                 if rank == 0 and render:
                     env.render()
@@ -133,21 +145,24 @@ def learn(network, env,
                 # Book-keeping.
                 epoch_actions.append(action)
                 epoch_qs.append(q)
-                agent.store_transition(obs, action, r, new_obs, done)
+                agent.store_transition(obs, action, r, new_obs, done)  # the batched data is unrolled per environment inside agent.store_transition
+
                 obs = new_obs
 
-                if done:
-                    # Episode done.
-                    epoch_episode_rewards.append(episode_reward)
-                    episode_rewards_history.append(episode_reward)
-                    epoch_episode_steps.append(episode_step)
-                    episode_reward = 0.
-                    episode_step = 0
-                    epoch_episodes += 1
-                    episodes += 1
+                for d in range(len(done)):
+                    if done[d]:
+                        # Episode done.
+                        epoch_episode_rewards.append(episode_reward[d])
+                        episode_rewards_history.append(episode_reward[d])
+                        epoch_episode_steps.append(episode_step[d])
+                        episode_reward[d] = 0.
+                        episode_step[d] = 0
+                        epoch_episodes += 1
+                        episodes += 1
+                        if nenvs == 1:
+                            agent.reset()
+
 
-                    agent.reset()
-                    obs = env.reset()
 
             # Train.
             epoch_actor_losses = []
@@ -168,7 +183,8 @@ def learn(network, env,
             eval_episode_rewards = []
             eval_qs = []
             if eval_env is not None:
-                eval_episode_reward = 0.
+                nenvs_eval = eval_obs.shape[0]
+                eval_episode_reward = np.zeros(nenvs_eval, dtype = np.float32)
                 for t_rollout in range(nb_eval_steps):
                     eval_action, eval_q, _, _ = agent.step(eval_obs, apply_noise=False, compute_Q=True)
                     eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
@@ -177,11 +193,11 @@ def learn(network, env,
                     eval_episode_reward += eval_r
 
                     eval_qs.append(eval_q)
-                    if eval_done:
-                        eval_obs = eval_env.reset()
-                        eval_episode_rewards.append(eval_episode_reward)
-                        eval_episode_rewards_history.append(eval_episode_reward)
-                        eval_episode_reward = 0.
+                    for d in range(len(eval_done)):
+                        if eval_done[d]:
+                            eval_episode_rewards.append(eval_episode_reward[d])
+                            eval_episode_rewards_history.append(eval_episode_reward[d])
+                            eval_episode_reward[d] = 0.0
 
         mpi_size = MPI.COMM_WORLD.Get_size()
         # Log stats.
@@ -216,7 +232,8 @@ def learn(network, env,
                 return x
             else:
                 raise ValueError('expected scalar, got %s'%x)
-        combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()]))
+
+        combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([ np.array(x).flatten()[0] for x in combined_stats.values()]))
         combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)}
 
         # Total statistics.
@@ -225,7 +242,9 @@ def learn(network, env,
 
         for key in sorted(combined_stats.keys()):
             logger.record_tabular(key, combined_stats[key])
-        logger.dump_tabular()
+
+        if rank == 0:
+            logger.dump_tabular()
         logger.info('')
         logdir = logger.get_dir()
         if rank == 0 and logdir:
diff --git a/baselines/ddpg/ddpg_learner.py b/baselines/ddpg/ddpg_learner.py
index cfaa290..44f231f 100755
--- a/baselines/ddpg/ddpg_learner.py
+++ b/baselines/ddpg/ddpg_learner.py
@@ -265,19 +265,24 @@ class DDPG(object):
         else:
             action = self.sess.run(actor_tf, feed_dict=feed_dict)
             q = None
-        action = action.flatten()
+
         if self.action_noise is not None and apply_noise:
             noise = self.action_noise()
             assert noise.shape == action.shape
             action += noise
         action = np.clip(action, self.action_range[0], self.action_range[1])
+
+
         return action, q, None, None
 
     def store_transition(self, obs0, action, reward, obs1, terminal1):
         reward *= self.reward_scale
-        self.memory.append(obs0, action, reward, obs1, terminal1)
-        if self.normalize_observations:
-            self.obs_rms.update(np.array([obs0]))
+
+        B = obs0.shape[0]
+        for b in range(B):
+            self.memory.append(obs0[b], action[b], reward[b], obs1[b], terminal1[b])
+            if self.normalize_observations:
+                self.obs_rms.update(np.array([obs0[b]]))
 
     def train(self):
         # Get a batch.
diff --git a/baselines/ddpg/memory.py b/baselines/ddpg/memory.py
index 781fa71..715f4c0 100755
--- a/baselines/ddpg/memory.py
+++ b/baselines/ddpg/memory.py
@@ -51,7 +51,7 @@ class Memory(object):
 
     def sample(self, batch_size):
         # Draw such that we always have a proceeding element.
-        batch_idxs = np.random.random_integers(self.nb_entries - 2, size=batch_size)
+        batch_idxs = np.random.randint(self.nb_entries - 2, size=batch_size)
 
         obs0_batch = self.observations0.get_batch(batch_idxs)
         obs1_batch = self.observations1.get_batch(batch_idxs)

From ef96f3835b34ae59ba6d668ffa3befeed60af6fa Mon Sep 17 00:00:00 2001
From: Matthew Rahtz <matthew.rahtz@gmail.com>
Date: Tue, 16 Oct 2018 16:28:23 -0700
Subject: [PATCH 3/6] Drop S and M args so that --play works (#636)

---
 baselines/deepq/deepq.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/baselines/deepq/deepq.py b/baselines/deepq/deepq.py
index 47fe19a..c6004b2 100644
--- a/baselines/deepq/deepq.py
+++ b/baselines/deepq/deepq.py
@@ -47,6 +47,9 @@ class ActWrapper(object):
         return self._act(*args, **kwargs)
 
     def step(self, observation, **kwargs):
+        # DQN doesn't use RNNs so we ignore states and masks
+        kwargs.pop('S', None)
+        kwargs.pop('M', None)
         return self._act([observation], **kwargs), None, None, None
 
     def save_act(self, path=None):

From 3677dc1b23c344a746c41f1929d7ae88c18afe1b Mon Sep 17 00:00:00 2001
From: Matthew Rahtz <matthew.rahtz@gmail.com>
Date: Thu, 18 Oct 2018 13:54:39 -0700
Subject: [PATCH 4/6] Set allow_growth=True for MuJoCo session (#643)

---
 baselines/run.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/baselines/run.py b/baselines/run.py
index 5dee154..8ab71ac 100644
--- a/baselines/run.py
+++ b/baselines/run.py
@@ -121,9 +121,11 @@ def build_env(args):
         env = retro_wrappers.wrap_deepmind_retro(env)
 
     else:
-       get_session(tf.ConfigProto(allow_soft_placement=True,
-                                   intra_op_parallelism_threads=1,
-                                   inter_op_parallelism_threads=1))
+       config = tf.ConfigProto(allow_soft_placement=True,
+                               intra_op_parallelism_threads=1,
+                               inter_op_parallelism_threads=1)
+       config.gpu_options.allow_growth = True
+       get_session(config=config)
 
        env = make_vec_env(env_id, env_type, args.num_env or 1, seed, reward_scale=args.reward_scale)
 

From fc7f9cec49786d50bb635228da0062972bdcf967 Mon Sep 17 00:00:00 2001
From: pzhokhov <peterzhokhoff@gmail.com>
Date: Thu, 18 Oct 2018 16:07:14 -0700
Subject: [PATCH 5/6] disable gym subpackages in setup.py (#661)

* disable gym subpackages in setup.py

* include gym[atari] in test requirements

* gym[atari] -> atari-py in test requirements
---
 setup.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 5ec1fce..726c6a3 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,8 @@ if sys.version_info.major != 3:
 extras = {
     'test': [
         'filelock',
-        'pytest'
+        'pytest',
+        'atari-py'
     ],
     'bullet': [
         'pybullet',
@@ -27,7 +28,7 @@ setup(name='baselines',
       packages=[package for package in find_packages()
                 if package.startswith('baselines')],
       install_requires=[
-          'gym[mujoco,atari,classic_control,robotics]',
+          'gym',
           'scipy',
           'tqdm',
           'joblib',

From d0cc325e1414b56bcc6cbd90bbc1778d49a3e950 Mon Sep 17 00:00:00 2001
From: pzhokhov <peterzhokhoff@gmail.com>
Date: Fri, 19 Oct 2018 08:54:21 -0700
Subject: [PATCH 6/6] store session at policy creation time (#655)

* sync internal changes. Make ddpg work with vecenvs

* B -> nenvs for consistency with other algos, small cleanups

* eval_done[d]==True -> eval_done[d]

* flake8 and numpy.random.random_integers deprecation warning

* store session at policy creation time

* coexistence tests

* fix a typo

* autopep8

* ... and flake8

* updated todo links in test_serialization
---
 baselines/common/policies.py                 |  4 +--
 baselines/common/tests/envs/mnist_env.py     |  2 +-
 baselines/common/tests/test_serialization.py | 38 +++++++++++++++++++-
 3 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/baselines/common/policies.py b/baselines/common/policies.py
index eeac242..9c9bb8b 100644
--- a/baselines/common/policies.py
+++ b/baselines/common/policies.py
@@ -53,7 +53,7 @@ class PolicyWithValue(object):
 
         # Calculate the neg log of our probability
         self.neglogp = self.pd.neglogp(self.action)
-        self.sess = sess
+        self.sess = sess or tf.get_default_session()
 
         if estimate_q:
             assert isinstance(env.action_space, gym.spaces.Discrete)
@@ -64,7 +64,7 @@ class PolicyWithValue(object):
             self.vf = self.vf[:,0]
 
     def _evaluate(self, variables, observation, **extra_feed):
-        sess = self.sess or tf.get_default_session()
+        sess = self.sess
         feed_dict = {self.X: adjust_shape(self.X, observation)}
         for inpt_name, data in extra_feed.items():
             if inpt_name in self.__dict__.keys():
diff --git a/baselines/common/tests/envs/mnist_env.py b/baselines/common/tests/envs/mnist_env.py
index 4f73495..473008d 100644
--- a/baselines/common/tests/envs/mnist_env.py
+++ b/baselines/common/tests/envs/mnist_env.py
@@ -1,7 +1,6 @@
 import os.path as osp
 import numpy as np
 import tempfile
-import filelock
 from gym import Env
 from gym.spaces import Discrete, Box
 
@@ -14,6 +13,7 @@ class MnistEnv(Env):
             episode_len=None,
             no_images=None
     ):
+        import filelock
         from tensorflow.examples.tutorials.mnist import input_data
         # we could use temporary directory for this with a context manager and
         # TemporaryDirecotry, but then each test that uses mnist would re-download the data
diff --git a/baselines/common/tests/test_serialization.py b/baselines/common/tests/test_serialization.py
index 4086f2b..f46b578 100644
--- a/baselines/common/tests/test_serialization.py
+++ b/baselines/common/tests/test_serialization.py
@@ -1,4 +1,5 @@
 import os
+import gym
 import tempfile
 import pytest
 import tensorflow as tf
@@ -39,7 +40,7 @@ def test_serialization(learn_fn, network_fn):
     if network_fn.endswith('lstm') and learn_fn in ['acktr', 'trpo_mpi', 'deepq']:
             # TODO make acktr work with recurrent policies
             # and test
-            # github issue: https://github.com/openai/baselines/issues/194
+            # github issue: https://github.com/openai/baselines/issues/660
             return
 
     env = DummyVecEnv([lambda: MnistEnv(10, episode_len=100)])
@@ -75,6 +76,41 @@ def test_serialization(learn_fn, network_fn):
         np.testing.assert_allclose(std1, std2, atol=0.5)
 
 
+@pytest.mark.parametrize("learn_fn", learn_kwargs.keys())
+@pytest.mark.parametrize("network_fn", ['mlp'])
+def test_coexistence(learn_fn, network_fn):
+    '''
+    Test if more than one model can exist at a time
+    '''
+
+    if learn_fn == 'deepq':
+            # TODO enable multiple DQN models to be useable at the same time
+            # github issue https://github.com/openai/baselines/issues/656
+            return
+
+    if network_fn.endswith('lstm') and learn_fn in ['acktr', 'trpo_mpi', 'deepq']:
+            # TODO make acktr work with recurrent policies
+            # and test
+            # github issue: https://github.com/openai/baselines/issues/660
+            return
+
+    env = DummyVecEnv([lambda: gym.make('CartPole-v0')])
+    learn = get_learn_function(learn_fn)
+
+    kwargs = {}
+    kwargs.update(network_kwargs[network_fn])
+    kwargs.update(learn_kwargs[learn_fn])
+
+    learn =  partial(learn, env=env, network=network_fn, total_timesteps=0, **kwargs)
+    make_session(make_default=True, graph=tf.Graph());
+    model1 = learn(seed=1)
+    make_session(make_default=True, graph=tf.Graph());
+    model2 = learn(seed=2)
+
+    model1.step(env.observation_space.sample())
+    model2.step(env.observation_space.sample())
+
+
 
 def _serialize_variables():
     sess = get_session()