From 376fd88bb820bdcbe7d192928cb50ec8c54f9842 Mon Sep 17 00:00:00 2001 From: Christopher Hesse Date: Wed, 3 Apr 2019 16:28:29 -0700 Subject: [PATCH 01/14] fix vec monitor infos --- baselines/common/vec_env/vec_monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/baselines/common/vec_env/vec_monitor.py b/baselines/common/vec_env/vec_monitor.py index a7b1ce4..efaafc9 100644 --- a/baselines/common/vec_env/vec_monitor.py +++ b/baselines/common/vec_env/vec_monitor.py @@ -33,7 +33,7 @@ class VecMonitor(VecEnvWrapper): self.eprets += rews self.eplens += 1 - newinfos = infos[:] + newinfos = list(infos[:]) for i in range(len(dones)): if dones[i]: info = infos[i].copy() From 5082e5d34b29e21c171cb372ffdbbe9d9e621232 Mon Sep 17 00:00:00 2001 From: Karl Cobbe Date: Thu, 4 Apr 2019 13:52:00 -0700 Subject: [PATCH 02/14] Workbench (#303) * begin workbench * cleanup * begin procgen config integration * arg tweaks * more args * parameter saving * begin procgen enjoy * tweaks * more workbench * more args sync/restore * cleanup * merge in master * rework args priority * more workbench * more loggign * impala cnn * impala lstm * tweak * tweaks * rl19 time logging * misc fixes * faster pipeline * update local.py * sess and log config tweaks * num processes * logging tweaks * difficulty reward wrapper * logging fixes * gin tweaks * tweak * fix * task id * param loading * more variable loading * entrypoint * tweak * ksync * restore lstm * begin rl19 support * tweak * rl19 rnn * more rl19 integration * fix * cleanup * restore rl19 rnn * cleanup * cleanup * wrappers.get_log_info * cleanup * cleanup * directory cleanup * logging, num_experiments * fixes * cleanup * gin fixes * fix local max gpu * resid nx * num machines and download params * rename * cleanup * create workbench * more reorg * fix * more logging wrappers * lint fix * restore train procgen * restore train procgen * pylint fix * better wrapping * config sweep * args sweep * test workers * mpi_weight * train test comm and high difficulty fix * enjoy show returns * removing gin, procgen_parser * removing gin * procgen args * config fixes * cleanup * cleanup * procgen args fix * fix * rcall syncing * lint * rename mpi_weight * use username for sync * fixes * microbatch fix --- baselines/common/models.py | 52 +++++++++++++++++++++++--- baselines/common/mpi_adam_optimizer.py | 13 +++++-- baselines/ppo2/microbatched_model.py | 5 ++- baselines/ppo2/model.py | 4 +- baselines/ppo2/ppo2.py | 28 +++++++++----- 5 files changed, 79 insertions(+), 23 deletions(-) diff --git a/baselines/common/models.py b/baselines/common/models.py index 0003079..0798916 100644 --- a/baselines/common/models.py +++ b/baselines/common/models.py @@ -3,7 +3,6 @@ import tensorflow as tf from baselines.a2c import utils from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch from baselines.common.mpi_running_mean_std import RunningMeanStd -import tensorflow.contrib.layers as layers mapping = {} @@ -26,6 +25,42 @@ def nature_cnn(unscaled_images, **conv_kwargs): h3 = conv_to_fc(h3) return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))) +def build_impala_cnn(unscaled_images, depths=[16,32,32], **conv_kwargs): + """ + Model used in the paper "IMPALA: Scalable Distributed Deep-RL with + Importance Weighted Actor-Learner Architectures" https://arxiv.org/abs/1802.01561 + """ + def conv_layer(out, depth): + return tf.layers.conv2d(out, depth, 3, padding='same') + + def residual_block(inputs): + depth = inputs.get_shape()[-1].value + + out = 
tf.nn.relu(inputs) + + out = conv_layer(out, depth) + out = tf.nn.relu(out) + out = conv_layer(out, depth) + return out + inputs + + def conv_sequence(inputs, depth): + out = conv_layer(inputs, depth) + out = tf.layers.max_pooling2d(out, pool_size=3, strides=2, padding='same') + out = residual_block(out) + out = residual_block(out) + return out + + out = tf.cast(unscaled_images, tf.float32) / 255. + + for depth in depths: + out = conv_sequence(out, depth) + + out = tf.layers.flatten(out) + out = tf.nn.relu(out) + out = tf.layers.dense(out, 256, activation=tf.nn.relu) + + return out + @register("mlp") def mlp(num_layers=2, num_hidden=64, activation=tf.tanh, layer_norm=False): @@ -65,6 +100,11 @@ def cnn(**conv_kwargs): return nature_cnn(X, **conv_kwargs) return network_fn +@register("impala_cnn") +def impala_cnn(**conv_kwargs): + def network_fn(X): + return build_impala_cnn(X) + return network_fn @register("cnn_small") def cnn_small(**conv_kwargs): @@ -79,7 +119,6 @@ def cnn_small(**conv_kwargs): return h return network_fn - @register("lstm") def lstm(nlstm=128, layer_norm=False): """ @@ -136,12 +175,12 @@ def lstm(nlstm=128, layer_norm=False): @register("cnn_lstm") -def cnn_lstm(nlstm=128, layer_norm=False, **conv_kwargs): +def cnn_lstm(nlstm=128, layer_norm=False, conv_fn=nature_cnn, **conv_kwargs): def network_fn(X, nenv=1): nbatch = X.shape[0] nsteps = nbatch // nenv - h = nature_cnn(X, **conv_kwargs) + h = conv_fn(X, **conv_kwargs) M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) S = tf.placeholder(tf.float32, [nenv, 2*nlstm]) #states @@ -161,6 +200,9 @@ def cnn_lstm(nlstm=128, layer_norm=False, **conv_kwargs): return network_fn +@register("impala_cnn_lstm") +def impala_cnn_lstm(): + return cnn_lstm(nlstm=256, conv_fn=build_impala_cnn) @register("cnn_lnlstm") def cnn_lnlstm(nlstm=128, **conv_kwargs): @@ -187,7 +229,7 @@ def conv_only(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], **conv_kwargs): out = tf.cast(X, tf.float32) / 255. 
with tf.variable_scope("convnet"): for num_outputs, kernel_size, stride in convs: - out = layers.convolution2d(out, + out = tf.contrib.layers.convolution2d(out, num_outputs=num_outputs, kernel_size=kernel_size, stride=stride, diff --git a/baselines/common/mpi_adam_optimizer.py b/baselines/common/mpi_adam_optimizer.py index db7f7a2..dcbcd74 100644 --- a/baselines/common/mpi_adam_optimizer.py +++ b/baselines/common/mpi_adam_optimizer.py @@ -9,22 +9,27 @@ except ImportError: class MpiAdamOptimizer(tf.train.AdamOptimizer): """Adam optimizer that averages gradients across mpi processes.""" - def __init__(self, comm, **kwargs): + def __init__(self, comm, mpi_rank_weight=1, **kwargs): self.comm = comm + self.mpi_rank_weight = mpi_rank_weight tf.train.AdamOptimizer.__init__(self, **kwargs) def compute_gradients(self, loss, var_list, **kwargs): grads_and_vars = tf.train.AdamOptimizer.compute_gradients(self, loss, var_list, **kwargs) grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None] - flat_grad = tf.concat([tf.reshape(g, (-1,)) for g, v in grads_and_vars], axis=0) + flat_grad = tf.concat([tf.reshape(g, (-1,)) for g, v in grads_and_vars], axis=0) * self.mpi_rank_weight shapes = [v.shape.as_list() for g, v in grads_and_vars] sizes = [int(np.prod(s)) for s in shapes] - num_tasks = self.comm.Get_size() + + total_weight = np.zeros(1, np.float32) + self.comm.Allreduce(np.array([self.mpi_rank_weight], dtype=np.float32), total_weight, op=MPI.SUM) + total_weight = total_weight[0] + buf = np.zeros(sum(sizes), np.float32) countholder = [0] # Counts how many times _collect_grads has been called stat = tf.reduce_sum(grads_and_vars[0][1]) # sum of first variable def _collect_grads(flat_grad, np_stat): self.comm.Allreduce(flat_grad, buf, op=MPI.SUM) - np.divide(buf, float(num_tasks), out=buf) + np.divide(buf, float(total_weight), out=buf) if countholder[0] % 100 == 0: check_synced(np_stat, self.comm) countholder[0] += 1 diff --git a/baselines/ppo2/microbatched_model.py b/baselines/ppo2/microbatched_model.py index 6735ed4..8d8b688 100644 --- a/baselines/ppo2/microbatched_model.py +++ b/baselines/ppo2/microbatched_model.py @@ -8,7 +8,7 @@ class MicrobatchedModel(Model): on the entire minibatch causes some overflow """ def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, - nsteps, ent_coef, vf_coef, max_grad_norm, microbatch_size): + nsteps, ent_coef, vf_coef, max_grad_norm, mpi_rank_weight, microbatch_size): self.nmicrobatches = nbatch_train // microbatch_size self.microbatch_size = microbatch_size @@ -23,7 +23,8 @@ class MicrobatchedModel(Model): nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, - max_grad_norm=max_grad_norm) + max_grad_norm=max_grad_norm, + mpi_rank_weight=mpi_rank_weight) self.grads_ph = [tf.placeholder(dtype=g.dtype, shape=g.shape) for g in self.grads] grads_ph_and_vars = list(zip(self.grads_ph, self.var)) diff --git a/baselines/ppo2/model.py b/baselines/ppo2/model.py index 2326b46..9370d5c 100644 --- a/baselines/ppo2/model.py +++ b/baselines/ppo2/model.py @@ -25,7 +25,7 @@ class Model(object): - Save load the model """ def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, - nsteps, ent_coef, vf_coef, max_grad_norm, microbatch_size=None): + nsteps, ent_coef, vf_coef, max_grad_norm, mpi_rank_weight=1, microbatch_size=None): self.sess = sess = get_session() with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): @@ -92,7 +92,7 @@ class Model(object): params = tf.trainable_variables('ppo2_model') # 2. 
Build our trainer if MPI is not None: - self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) + self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, mpi_rank_weight=mpi_rank_weight, epsilon=1e-5) else: self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) # 3. Calculate the gradients diff --git a/baselines/ppo2/ppo2.py b/baselines/ppo2/ppo2.py index 7f3d204..09bc933 100644 --- a/baselines/ppo2/ppo2.py +++ b/baselines/ppo2/ppo2.py @@ -21,7 +21,7 @@ def constfn(val): def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, - save_interval=0, load_path=None, model_fn=None, **network_kwargs): + save_interval=0, load_path=None, model_fn=None, update_fn=None, init_fn=None, mpi_rank_weight=1, **network_kwargs): ''' Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347) @@ -105,7 +105,7 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2 model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, - max_grad_norm=max_grad_norm) + max_grad_norm=max_grad_norm, mpi_rank_weight=mpi_rank_weight) if load_path is not None: model.load(load_path) @@ -118,6 +118,9 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2 if eval_env is not None: eval_epinfobuf = deque(maxlen=100) + if init_fn is not None: + init_fn() + # Start total timer tfirststart = time.perf_counter() @@ -176,31 +179,36 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2 tnow = time.perf_counter() # Calculate the fps (frame per second) fps = int(nbatch / (tnow - tstart)) + + if update_fn is not None: + update_fn(update) + if update % log_interval == 0 or update == 1: # Calculates if value function is a good predicator of the returns (ev > 1) # or if it's just worse than predicting nothing (ev =< 0) ev = explained_variance(values, returns) - logger.logkv("serial_timesteps", update*nsteps) - logger.logkv("nupdates", update) - logger.logkv("total_timesteps", update*nbatch) + logger.logkv("misc/serial_timesteps", update*nsteps) + logger.logkv("misc/nupdates", update) + logger.logkv("misc/total_timesteps", update*nbatch) logger.logkv("fps", fps) - logger.logkv("explained_variance", float(ev)) + logger.logkv("misc/explained_variance", float(ev)) logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) if eval_env is not None: logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf]) ) logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf]) ) - logger.logkv('time_elapsed', tnow - tfirststart) + logger.logkv('misc/time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model.loss_names): - logger.logkv(lossname, lossval) - if MPI is None or MPI.COMM_WORLD.Get_rank() == 0: - logger.dumpkvs() + logger.logkv('loss/' + lossname, lossval) + + logger.dumpkvs() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and (MPI is None or MPI.COMM_WORLD.Get_rank() == 0): checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i'%update) print('Saving to', savepath) 
model.save(savepath) + return model # Avoid division error when calculate the mean (in our case if epinfo is empty returns np.nan, not return an error) def safemean(xs): From 07cbf1e26a30fb0452f06b79db6a30a8f19a67fd Mon Sep 17 00:00:00 2001 From: John Schulman Date: Tue, 16 Apr 2019 09:20:09 -0700 Subject: [PATCH 03/14] Grad clipping in MpiAdamOptimizer, transformer changes (#304) * transformer mnist experiments * version that only builds one model * work on inverted mnist * Add grad clipping to MpiAdamOptimizer * various * transformer changes, loading * get rid of soft labels * transformer baseline * minor * experiments involving all possible training sets * vary training * minor * get ready for fine-tuning expers * lint * minor --- baselines/common/mpi_adam_optimizer.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/baselines/common/mpi_adam_optimizer.py b/baselines/common/mpi_adam_optimizer.py index dcbcd74..3d7cee5 100644 --- a/baselines/common/mpi_adam_optimizer.py +++ b/baselines/common/mpi_adam_optimizer.py @@ -2,6 +2,7 @@ import numpy as np import tensorflow as tf from baselines.common import tf_util as U from baselines.common.tests.test_with_mpi import with_mpi +from baselines import logger try: from mpi4py import MPI except ImportError: @@ -9,8 +10,9 @@ except ImportError: class MpiAdamOptimizer(tf.train.AdamOptimizer): """Adam optimizer that averages gradients across mpi processes.""" - def __init__(self, comm, mpi_rank_weight=1, **kwargs): + def __init__(self, comm, grad_clip=None, mpi_rank_weight=1, **kwargs): self.comm = comm + self.grad_clip = grad_clip self.mpi_rank_weight = mpi_rank_weight tf.train.AdamOptimizer.__init__(self, **kwargs) def compute_gradients(self, loss, var_list, **kwargs): @@ -28,6 +30,12 @@ class MpiAdamOptimizer(tf.train.AdamOptimizer): countholder = [0] # Counts how many times _collect_grads has been called stat = tf.reduce_sum(grads_and_vars[0][1]) # sum of first variable def _collect_grads(flat_grad, np_stat): + if self.grad_clip is not None: + gradnorm = np.linalg.norm(flat_grad) + if gradnorm > 1: + flat_grad /= gradnorm + logger.logkv_mean('gradnorm', gradnorm) + logger.logkv_mean('gradclipfrac', float(gradnorm > 1)) self.comm.Allreduce(flat_grad, buf, op=MPI.SUM) np.divide(buf, float(total_weight), out=buf) if countholder[0] % 100 == 0: @@ -56,8 +64,8 @@ def check_synced(localval, comm=None): comm = comm or MPI.COMM_WORLD vals = comm.gather(localval) if comm.rank == 0: - assert all(val==vals[0] for val in vals[1:]) - + assert all(val==vals[0] for val in vals[1:]),\ + f'MpiAdamOptimizer detected that different workers have different weights: {vals}' @with_mpi(timeout=5) def test_nonfreeze(): From b83a66527de2f327555d267f63ad14ced5589ec1 Mon Sep 17 00:00:00 2001 From: John Schulman Date: Wed, 17 Apr 2019 10:13:12 -0700 Subject: [PATCH 04/14] Add jrl19 as backend for workbench (#324) enable jrl in workbench minor logger changes --- baselines/logger.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/baselines/logger.py b/baselines/logger.py index a0e75ab..4d9ffe6 100644 --- a/baselines/logger.py +++ b/baselines/logger.py @@ -38,8 +38,8 @@ class HumanOutputFormat(KVWriter, SeqWriter): # Create strings for printing key2str = {} for (key, val) in sorted(kvs.items()): - if isinstance(val, float): - valstr = '%-8.3g' % (val,) + if hasattr(val, '__float__'): + valstr = '%-8.3g' % val else: valstr = str(val) key2str[self._truncate(key)] = self._truncate(valstr) @@ -361,6 +361,15 @@ 
class Logger(object): if isinstance(fmt, SeqWriter): fmt.writeseq(map(str, args)) +def get_rank_without_mpi_import(): + # check environment variables here instead of importing mpi4py + # to avoid calling MPI_Init() when this module is imported + for varname in ['PMI_RANK', 'OMPI_COMM_WORLD_RANK']: + if varname in os.environ: + return int(os.environ[varname]) + return 0 + + def configure(dir=None, format_strs=None, comm=None, log_suffix=''): """ If comm is provided, average all numerical stats across that comm @@ -373,12 +382,7 @@ def configure(dir=None, format_strs=None, comm=None, log_suffix=''): assert isinstance(dir, str) os.makedirs(dir, exist_ok=True) - rank = 0 - # check environment variables here instead of importing mpi4py - # to avoid calling MPI_Init() when this module is imported - for varname in ['PMI_RANK', 'OMPI_COMM_WORLD_RANK']: - if varname in os.environ: - rank = int(os.environ[varname]) + rank = get_rank_without_mpi_import() if rank > 0: log_suffix = log_suffix + "-rank%03i" % rank From a93dde3b2b974917d41cf4e589d6a2b0aba01b64 Mon Sep 17 00:00:00 2001 From: pzhokhov Date: Wed, 17 Apr 2019 15:17:27 -0700 Subject: [PATCH 05/14] extra functionality in baselines.common.plot_util (#310) * get plot_util from mt_experiments branch * add labels * unit tests for plot_util --- baselines/common/plot_util.py | 42 ++++++++++++++++++++---- baselines/common/tests/test_plot_util.py | 17 ++++++++++ baselines/common/tests/util.py | 13 ++++++++ baselines/ddpg/test_smoke.py | 8 ++--- 4 files changed, 68 insertions(+), 12 deletions(-) create mode 100644 baselines/common/tests/test_plot_util.py diff --git a/baselines/common/plot_util.py b/baselines/common/plot_util.py index 26b1613..e15c508 100644 --- a/baselines/common/plot_util.py +++ b/baselines/common/plot_util.py @@ -248,7 +248,10 @@ def plot_results( figsize=None, legend_outside=False, resample=0, - smooth_step=1.0 + smooth_step=1.0, + tiling='vertical', + xlabel=None, + ylabel=None ): ''' Plot multiple Results objects @@ -300,9 +303,23 @@ def plot_results( sk2r[splitkey].append(result) assert len(sk2r) > 0 assert isinstance(resample, int), "0: don't resample. 
: that many samples" - nrows = len(sk2r) - ncols = 1 - figsize = figsize or (6, 6 * nrows) + if tiling == 'vertical' or tiling is None: + nrows = len(sk2r) + ncols = 1 + elif tiling == 'horizontal': + ncols = len(sk2r) + nrows = 1 + elif tiling == 'symmetric': + import math + N = len(sk2r) + largest_divisor = 1 + for i in range(1, int(math.sqrt(N))+1): + if N % i == 0: + largest_divisor = i + ncols = largest_divisor + nrows = N // ncols + figsize = figsize or (6 * ncols, 6 * nrows) + f, axarr = plt.subplots(nrows, ncols, sharex=False, squeeze=False, figsize=figsize) groups = list(set(group_fn(result) for result in allresults)) @@ -316,7 +333,9 @@ def plot_results( g2c = defaultdict(int) sresults = sk2r[sk] gresults = defaultdict(list) - ax = axarr[isplit][0] + idx_row = isplit // ncols + idx_col = isplit % ncols + ax = axarr[idx_row][idx_col] for result in sresults: group = group_fn(result) g2c[group] += 1 @@ -355,7 +374,7 @@ def plot_results( ymean = np.mean(ys, axis=0) ystd = np.std(ys, axis=0) ystderr = ystd / np.sqrt(len(ys)) - l, = axarr[isplit][0].plot(usex, ymean, color=color) + l, = axarr[idx_row][idx_col].plot(usex, ymean, color=color) g2l[group] = l if shaded_err: ax.fill_between(usex, ymean - ystderr, ymean + ystderr, color=color, alpha=.4) @@ -372,6 +391,17 @@ def plot_results( loc=2 if legend_outside else None, bbox_to_anchor=(1,1) if legend_outside else None) ax.set_title(sk) + # add xlabels, but only to the bottom row + if xlabel is not None: + for ax in axarr[-1]: + plt.sca(ax) + plt.xlabel(xlabel) + # add ylabels, but only to left column + if ylabel is not None: + for ax in axarr[:,0]: + plt.sca(ax) + plt.ylabel(ylabel) + return f, axarr def regression_analysis(df): diff --git a/baselines/common/tests/test_plot_util.py b/baselines/common/tests/test_plot_util.py new file mode 100644 index 0000000..be33308 --- /dev/null +++ b/baselines/common/tests/test_plot_util.py @@ -0,0 +1,17 @@ +# smoke tests of plot_util +from baselines.common import plot_util as pu +from baselines.common.tests.util import smoketest + + +def test_plot_util(): + nruns = 4 + logdirs = [smoketest('--alg=ppo2 --env=CartPole-v0 --num_timesteps=10000') for _ in range(nruns)] + data = pu.load_results(logdirs) + assert len(data) == 4 + + _, axes = pu.plot_results(data[:1]); assert len(axes) == 1 + _, axes = pu.plot_results(data, tiling='vertical'); assert axes.shape==(4,1) + _, axes = pu.plot_results(data, tiling='horizontal'); assert axes.shape==(1,4) + _, axes = pu.plot_results(data, tiling='symmetric'); assert axes.shape==(2,2) + _, axes = pu.plot_results(data, split_fn=lambda _: ''); assert len(axes) == 1 + diff --git a/baselines/common/tests/util.py b/baselines/common/tests/util.py index 441e3f7..b3d31fe 100644 --- a/baselines/common/tests/util.py +++ b/baselines/common/tests/util.py @@ -77,3 +77,16 @@ def rollout(env, model, n_trials): observations.append(episode_obs) return observations, actions, rewards + +def smoketest(argstr, **kwargs): + import tempfile + import subprocess + import os + argstr = 'python -m baselines.run ' + argstr + for key, value in kwargs: + argstr += ' --{}={}'.format(key, value) + tempdir = tempfile.mkdtemp() + env = os.environ.copy() + env['OPENAI_LOGDIR'] = tempdir + subprocess.run(argstr.split(' '), env=env) + return tempdir diff --git a/baselines/ddpg/test_smoke.py b/baselines/ddpg/test_smoke.py index a9fdc05..bd7eba6 100644 --- a/baselines/ddpg/test_smoke.py +++ b/baselines/ddpg/test_smoke.py @@ -1,10 +1,6 @@ -from multiprocessing import Process -import baselines.run - 
+from baselines.common.tests.util import smoketest def _run(argstr): - p = Process(target=baselines.run.main, args=('--alg=ddpg --env=Pendulum-v0 --num_timesteps=0 ' + argstr).split(' ')) - p.start() - p.join() + smoketest('--alg=ddpg --env=Pendulum-v0 --num_timesteps=0 ' + argstr) def test_popart(): _run('--normalize_returns=True --popart=True') From 967fc8c37f8d3ad39db223c002fc0a502686d214 Mon Sep 17 00:00:00 2001 From: John Schulman Date: Sat, 20 Apr 2019 10:08:09 -0700 Subject: [PATCH 06/14] Fixed sequence env minor (#333) minor changes to FixedSequenceEnv to allow full score --- baselines/common/tests/envs/fixed_sequence_env.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/baselines/common/tests/envs/fixed_sequence_env.py b/baselines/common/tests/envs/fixed_sequence_env.py index f5460d5..b3fe396 100644 --- a/baselines/common/tests/envs/fixed_sequence_env.py +++ b/baselines/common/tests/envs/fixed_sequence_env.py @@ -9,18 +9,16 @@ class FixedSequenceEnv(Env): n_actions=10, episode_len=100 ): - self.np_random = np.random.RandomState() - self.sequence = None - self.action_space = Discrete(n_actions) self.observation_space = Discrete(1) - + self.np_random = np.random.RandomState(0) self.episode_len = episode_len + self.sequence = [self.np_random.randint(0, self.action_space.n) + for _ in range(self.episode_len)] self.time = 0 + def reset(self): - if self.sequence is None: - self.sequence = [self.np_random.randint(0, self.action_space.n-1) for _ in range(self.episode_len)] self.time = 0 return 0 @@ -29,7 +27,6 @@ class FixedSequenceEnv(Env): self._choose_next_state() done = False if self.episode_len and self.time >= self.episode_len: - rew = 0 done = True return 0, rew, done, {} From bc4eef60531b317bc6e00ae2c5a8e2792318191c Mon Sep 17 00:00:00 2001 From: Christopher Hesse Date: Sat, 20 Apr 2019 15:06:18 -0700 Subject: [PATCH 07/14] fix tests (#335) --- baselines/logger.py | 1 - 1 file changed, 1 deletion(-) diff --git a/baselines/logger.py b/baselines/logger.py index 4d9ffe6..36b0c98 100644 --- a/baselines/logger.py +++ b/baselines/logger.py @@ -92,7 +92,6 @@ class JSONOutputFormat(KVWriter): def writekvs(self, kvs): for k, v in sorted(kvs.items()): if hasattr(v, 'dtype'): - v = v.tolist() kvs[k] = float(v) self.file.write(json.dumps(kvs) + '\n') self.file.flush() From ddcab1606d779c7e66c742b0d1e6dccc278305a6 Mon Sep 17 00:00:00 2001 From: Karl Cobbe Date: Mon, 22 Apr 2019 13:12:45 -0700 Subject: [PATCH 08/14] Procgen Benchmark Updates (#328) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * directory cleanup * logging, num_experiments * fixes * cleanup * gin fixes * fix local max gpu * resid nx * tweak * num machines and download params * rename * cleanup * create workbench * more reorg * fix * more logging wrappers * lint fix * restore train procgen * restore train procgen * pylint fix * better wrapping * whackamole walls * config sweep * tweak * args sweep * tweak * test workers * mpi_weight * train test comm and high difficulty fix * enjoy show returns * better joint training * tweak * Add —update to args and add gin-config to requirements.txt * add username to download_file * removing gin, procgen_parser * removing gin * procgen args * config fixes * cleanup * cleanup * procgen args fix * fix * rcall syncing * lint * rename mpi_weight * begin composable game * more composable game * tweak * background alpha * use username for sync * fixes * microbatch fix * lure composable game * merge * proc trans update * proc 
trans update (#307) * finetuning experiment * Change is_local to use `use_rcall` and fix error of `enjoy.py` with multiple ends * graphing help * add --local * change args_dict['env_name'] to ENV_NAME * finetune experiments * tweak * tweak * reorg wrappers, remove is_local * workdir/local fixes * move finetune experiments * default dir and graphing * more graphing * fix * pooled syncing * tweaks * dir fix * tweak * wrapper mpi fix * wind and turrets * composability cleanup * radius cleanup * composable reorg * laser gates * composable tweaks * soft walls * tweak * begin swamp * more swamp * more swamp * fix * hidden mines * use maze layout * tweak * laser gate tweaks * tweaks * tweaks * lure/propel updates * composable midnight * composable coinmaze * composability difficulty * tweak * add step to save_params * composable offsets * composable boxpush * composable combiner * tweak * tweak * always choose correct number of mechanics * fix * rcall local fix * add steps when dump and save parmas * loading rank 1,2,3.. error fix * add experiments.py * fix loading latest weight with no -rest * support more complex run_id and add more examples * fix typo * move post_run_id into experiments.py * add hp_search example * error fix * joint experiments in progress * joint hp finished * typo * error fix * edit experiments * Save experiments set up in code and save weights per step (#319) * add step to save_params * add steps when dump and save parmas * loading rank 1,2,3.. error fix * add experiments.py * fix loading latest weight with no -rest * support more complex run_id and add more examples * fix typo * move post_run_id into experiments.py * add hp_search example * error fix * joint experiments in progress * joint hp finished * typo * error fix * edit experiments * tweaks * graph exp WIP * depth tweaks * move save_all * fix * restore_dir name * restore depth * choose max mechanics * use override mode * tweak frogger * lstm default * fix * patience is composable * hunter is composable * fixed asset seed cleanup * minesweeper is composable * eggcatch is composable * tweak * applesort is composable * chaser game * begin lighter * lighter game * tractor game * boxgather game * plumber game * hitcher game * doorbell game * lawnmower game * connecter game * cannonaim * outrun game * encircle game * spinner game * tweak * tweak * detonator game * driller * driller * mixer * conveyor * conveyor game * joint pcg experiments * fixes * pcg sweep experiment * cannonaim fix * combiner fix * store save time * laseraim fix * lightup fix * detonator tweaks * detonator fixes * driller fix * lawnmower calibration * spinner calibration * propel fix * train experiment * print load time * system independent hashing * remove gin configurable * task ids fix * test_pcg experiment * connecter dense reward * hard_pcg * num train comms * mpi splits envs * tweaks * tweaks * graph tweaks * graph tweaks * lint fix * fix tests * load bugfix * difficulty timeout tweak * tweaks * more graphing * graph tweaks * tweak * download file fix * pcg train envs list * cleanup * tweak * manually name impala layers * tweak * expect fps * backend arg * args tweak * workbench cleanup * move graph files * workbench cleanup * split env name by comma * workbench cleanup * ema graph * remove Dict * use tf.io.gfile * comments for auto-killing jobs * lint fix * write latest file when not saving all and load it when step=None --- baselines/common/models.py | 13 +++++++++++-- baselines/ppo2/microbatched_model.py | 5 +++-- baselines/ppo2/model.py | 9 
++++++--- baselines/ppo2/ppo2.py | 4 ++-- 4 files changed, 22 insertions(+), 9 deletions(-) diff --git a/baselines/common/models.py b/baselines/common/models.py index 0798916..a6fe467 100644 --- a/baselines/common/models.py +++ b/baselines/common/models.py @@ -30,8 +30,17 @@ def build_impala_cnn(unscaled_images, depths=[16,32,32], **conv_kwargs): Model used in the paper "IMPALA: Scalable Distributed Deep-RL with Importance Weighted Actor-Learner Architectures" https://arxiv.org/abs/1802.01561 """ + + layer_num = 0 + + def get_layer_num_str(): + nonlocal layer_num + num_str = str(layer_num) + layer_num += 1 + return num_str + def conv_layer(out, depth): - return tf.layers.conv2d(out, depth, 3, padding='same') + return tf.layers.conv2d(out, depth, 3, padding='same', name='layer_' + get_layer_num_str()) def residual_block(inputs): depth = inputs.get_shape()[-1].value @@ -57,7 +66,7 @@ def build_impala_cnn(unscaled_images, depths=[16,32,32], **conv_kwargs): out = tf.layers.flatten(out) out = tf.nn.relu(out) - out = tf.layers.dense(out, 256, activation=tf.nn.relu) + out = tf.layers.dense(out, 256, activation=tf.nn.relu, name='layer_' + get_layer_num_str()) return out diff --git a/baselines/ppo2/microbatched_model.py b/baselines/ppo2/microbatched_model.py index 8d8b688..a35b830 100644 --- a/baselines/ppo2/microbatched_model.py +++ b/baselines/ppo2/microbatched_model.py @@ -8,7 +8,7 @@ class MicrobatchedModel(Model): on the entire minibatch causes some overflow """ def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, - nsteps, ent_coef, vf_coef, max_grad_norm, mpi_rank_weight, microbatch_size): + nsteps, ent_coef, vf_coef, max_grad_norm, mpi_rank_weight, comm, microbatch_size): self.nmicrobatches = nbatch_train // microbatch_size self.microbatch_size = microbatch_size @@ -24,7 +24,8 @@ class MicrobatchedModel(Model): ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, - mpi_rank_weight=mpi_rank_weight) + mpi_rank_weight=mpi_rank_weight, + comm=comm) self.grads_ph = [tf.placeholder(dtype=g.dtype, shape=g.shape) for g in self.grads] grads_ph_and_vars = list(zip(self.grads_ph, self.var)) diff --git a/baselines/ppo2/model.py b/baselines/ppo2/model.py index 9370d5c..35a883d 100644 --- a/baselines/ppo2/model.py +++ b/baselines/ppo2/model.py @@ -25,9 +25,12 @@ class Model(object): - Save load the model """ def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, - nsteps, ent_coef, vf_coef, max_grad_norm, mpi_rank_weight=1, microbatch_size=None): + nsteps, ent_coef, vf_coef, max_grad_norm, mpi_rank_weight=1, comm=None, microbatch_size=None): self.sess = sess = get_session() + if MPI is not None and comm is None: + comm = MPI.COMM_WORLD + with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling @@ -91,8 +94,8 @@ class Model(object): # 1. Get the model parameters params = tf.trainable_variables('ppo2_model') # 2. Build our trainer - if MPI is not None: - self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, mpi_rank_weight=mpi_rank_weight, epsilon=1e-5) + if comm is not None and comm.Get_size() > 1: + self.trainer = MpiAdamOptimizer(comm, learning_rate=LR, mpi_rank_weight=mpi_rank_weight, epsilon=1e-5) else: self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) # 3. 
Calculate the gradients diff --git a/baselines/ppo2/ppo2.py b/baselines/ppo2/ppo2.py index 09bc933..f3a69d8 100644 --- a/baselines/ppo2/ppo2.py +++ b/baselines/ppo2/ppo2.py @@ -21,7 +21,7 @@ def constfn(val): def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, - save_interval=0, load_path=None, model_fn=None, update_fn=None, init_fn=None, mpi_rank_weight=1, **network_kwargs): + save_interval=0, load_path=None, model_fn=None, update_fn=None, init_fn=None, mpi_rank_weight=1, comm=None, **network_kwargs): ''' Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347) @@ -105,7 +105,7 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2 model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, - max_grad_norm=max_grad_norm, mpi_rank_weight=mpi_rank_weight) + max_grad_norm=max_grad_norm, comm=comm, mpi_rank_weight=mpi_rank_weight) if load_path is not None: model.load(load_path) From 8e0282ee941c70ddc732556845f8ac12b5bd7f19 Mon Sep 17 00:00:00 2001 From: pzhokhov Date: Mon, 22 Apr 2019 14:41:46 -0700 Subject: [PATCH 09/14] ci/runtests.sh - pass all folders to pytest (#342) * ci/runtests.sh - pass all folders to pytest * mpi_optimizer_test precision 1e-4 * fixes to tests * search for tests in the entire jax folder, also remove unnecessary humor --- baselines/common/tests/test_with_mpi.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/baselines/common/tests/test_with_mpi.py b/baselines/common/tests/test_with_mpi.py index 86be475..9388078 100644 --- a/baselines/common/tests/test_with_mpi.py +++ b/baselines/common/tests/test_with_mpi.py @@ -4,6 +4,7 @@ import subprocess import cloudpickle import base64 import pytest +from functools import wraps try: from mpi4py import MPI @@ -12,6 +13,7 @@ except ImportError: def with_mpi(nproc=2, timeout=30, skip_if_no_mpi=True): def outer_thunk(fn): + @wraps(fn) def thunk(*args, **kwargs): serialized_fn = base64.b64encode(cloudpickle.dumps(lambda: fn(*args, **kwargs))) subprocess.check_call([ From f5daca8c22d964a703beabbcab44e11d75881a34 Mon Sep 17 00:00:00 2001 From: John Schulman Date: Mon, 22 Apr 2019 14:45:01 -0700 Subject: [PATCH 10/14] delete unnecessary stuff (#338) --- setup.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/setup.py b/setup.py index 130cdb5..6074009 100644 --- a/setup.py +++ b/setup.py @@ -36,8 +36,6 @@ setup(name='baselines', 'scipy', 'tqdm', 'joblib', - 'dill', - 'progressbar2', 'cloudpickle', 'click', 'opencv-python' From 64dfabb8eb53664d300c573dbd456fbd85ce4aad Mon Sep 17 00:00:00 2001 From: Greg Brockman Date: Tue, 23 Apr 2019 13:40:08 -0700 Subject: [PATCH 11/14] Add initializer for process-level setup in SubprocVecEnv (#276) * Add initializer for process-level setup in SubprocVecEnv Use case: run logger.configure() in each subprocess * Add option to force dummy vec env --- baselines/common/cmd_util.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/baselines/common/cmd_util.py b/baselines/common/cmd_util.py index 016df93..99ec11c 100644 --- a/baselines/common/cmd_util.py +++ b/baselines/common/cmd_util.py @@ -25,7 +25,9 @@ def make_vec_env(env_id, env_type, num_env, seed, start_index=0, reward_scale=1.0, flatten_dict_observations=True, - gamestate=None): + 
gamestate=None, + initializer=None, + force_dummy=False): """ Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo. """ @@ -34,7 +36,7 @@ def make_vec_env(env_id, env_type, num_env, seed, mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0 seed = seed + 10000 * mpi_rank if seed is not None else None logger_dir = logger.get_dir() - def make_thunk(rank): + def make_thunk(rank, initializer=None): return lambda: make_env( env_id=env_id, env_type=env_type, @@ -46,17 +48,21 @@ def make_vec_env(env_id, env_type, num_env, seed, flatten_dict_observations=flatten_dict_observations, wrapper_kwargs=wrapper_kwargs, env_kwargs=env_kwargs, - logger_dir=logger_dir + logger_dir=logger_dir, + initializer=initializer ) set_global_seeds(seed) - if num_env > 1: - return SubprocVecEnv([make_thunk(i + start_index) for i in range(num_env)]) + if not force_dummy and num_env > 1: + return SubprocVecEnv([make_thunk(i + start_index, initializer=initializer) for i in range(num_env)]) else: - return DummyVecEnv([make_thunk(start_index)]) + return DummyVecEnv([make_thunk(i + start_index, initializer=None) for i in range(num_env)]) -def make_env(env_id, env_type, mpi_rank=0, subrank=0, seed=None, reward_scale=1.0, gamestate=None, flatten_dict_observations=True, wrapper_kwargs=None, env_kwargs=None, logger_dir=None): +def make_env(env_id, env_type, mpi_rank=0, subrank=0, seed=None, reward_scale=1.0, gamestate=None, flatten_dict_observations=True, wrapper_kwargs=None, env_kwargs=None, logger_dir=None, initializer=None): + if initializer is not None: + initializer(mpi_rank=mpi_rank, subrank=subrank) + wrapper_kwargs = wrapper_kwargs or {} env_kwargs = env_kwargs or {} if ':' in env_id: From 07536451eee9ecb9d019156272ff139e1ee1284f Mon Sep 17 00:00:00 2001 From: Karl Cobbe Date: Tue, 23 Apr 2019 16:33:30 -0700 Subject: [PATCH 12/14] Procgen fixes (#352) * tweak * documentation * rely on log_comm, remove mpi averaging from wrappers * pass comm for ppo2 initialization * ppo2 logging * experiment tweaks * auto launch tensorboard when using local backend * graph tweaks * pass caller to config * configure logger and tensorboard * make parent dir if necessary * parentdir tweak --- baselines/ppo2/model.py | 2 +- baselines/ppo2/ppo2.py | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/baselines/ppo2/model.py b/baselines/ppo2/model.py index 35a883d..3d56bc9 100644 --- a/baselines/ppo2/model.py +++ b/baselines/ppo2/model.py @@ -128,7 +128,7 @@ class Model(object): initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") if MPI is not None: - sync_from_root(sess, global_variables) #pylint: disable=E1101 + sync_from_root(sess, global_variables, comm=comm) #pylint: disable=E1101 def train(self, lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None): # Here we calculate advantage A(s,a) = R + yV(s') - V(s) diff --git a/baselines/ppo2/ppo2.py b/baselines/ppo2/ppo2.py index f3a69d8..d307e9b 100644 --- a/baselines/ppo2/ppo2.py +++ b/baselines/ppo2/ppo2.py @@ -97,6 +97,7 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2 # Calculate the batch_size nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches + is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0) # Instantiate the model object (that creates act_model and train_model) if model_fn is None: @@ -134,11 +135,16 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2 lrnow = lr(frac) # Calculate the cliprange cliprangenow = 
cliprange(frac) + + if update % log_interval == 0 and is_mpi_root: logger.info('Stepping environment...') + # Get minibatch obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run() #pylint: disable=E0632 if eval_env is not None: eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run() #pylint: disable=E0632 + if update % log_interval == 0 and is_mpi_root: logger.info('Done.') + epinfobuf.extend(epinfos) if eval_env is not None: eval_epinfobuf.extend(eval_epinfos) @@ -202,7 +208,7 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2 logger.logkv('loss/' + lossname, lossval) logger.dumpkvs() - if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and (MPI is None or MPI.COMM_WORLD.Get_rank() == 0): + if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and is_mpi_root: checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i'%update) From 1fa6ac38f1a01a43c7911697712826f29d223df4 Mon Sep 17 00:00:00 2001 From: pzhokhov Date: Wed, 24 Apr 2019 17:04:36 -0700 Subject: [PATCH 13/14] JRL PPO test with delayed identity env (#355) * add a custom delay to identity_env * min reward 0.8 in delayed identity test * seed the tests, perfect score on delayed_identity_test * delay=1 in delayed_identity_test * flake8 complaints * increased number of steps in fixed_seq_test * seed identity tests to ensure reproducibility * docstrings --- baselines/common/tests/envs/identity_env.py | 55 ++++++++++--------- .../common/tests/envs/identity_env_test.py | 36 ++++++++++++ 2 files changed, 66 insertions(+), 25 deletions(-) create mode 100644 baselines/common/tests/envs/identity_env_test.py diff --git a/baselines/common/tests/envs/identity_env.py b/baselines/common/tests/envs/identity_env.py index 79e6c48..fb2dca6 100644 --- a/baselines/common/tests/envs/identity_env.py +++ b/baselines/common/tests/envs/identity_env.py @@ -2,43 +2,45 @@ import numpy as np from abc import abstractmethod from gym import Env from gym.spaces import MultiDiscrete, Discrete, Box - +from collections import deque class IdentityEnv(Env): def __init__( self, - episode_len=None + episode_len=None, + delay=0, + zero_first_rewards=True ): self.observation_space = self.action_space self.episode_len = episode_len self.time = 0 - self.reset() + self.delay = delay + self.zero_first_rewards = zero_first_rewards + self.q = deque(maxlen=delay+1) def reset(self): - self._choose_next_state() + self.q.clear() + for _ in range(self.delay + 1): + self.q.append(self.action_space.sample()) self.time = 0 - return self.state + return self.q[-1] def step(self, actions): - rew = self._get_reward(actions) - self._choose_next_state() - done = False - if self.episode_len and self.time >= self.episode_len: - done = True - - return self.state, rew, done, {} + rew = self._get_reward(self.q.popleft(), actions) + if self.zero_first_rewards and self.time < self.delay: + rew = 0 + self.q.append(self.action_space.sample()) + self.time += 1 + done = self.episode_len is not None and self.time >= self.episode_len + return self.q[-1], rew, done, {} def seed(self, seed=None): self.action_space.seed(seed) - def _choose_next_state(self): - self.state = self.action_space.sample() - self.time += 1 - @abstractmethod - def _get_reward(self, actions): + def _get_reward(self, state, actions): raise NotImplementedError @@ -47,26 +49,29 
@@ class DiscreteIdentityEnv(IdentityEnv): self, dim, episode_len=None, + delay=0, + zero_first_rewards=True ): self.action_space = Discrete(dim) - super().__init__(episode_len=episode_len) + super().__init__(episode_len=episode_len, delay=delay, zero_first_rewards=zero_first_rewards) - def _get_reward(self, actions): - return 1 if self.state == actions else 0 + def _get_reward(self, state, actions): + return 1 if state == actions else 0 class MultiDiscreteIdentityEnv(IdentityEnv): def __init__( self, dims, episode_len=None, + delay=0, ): self.action_space = MultiDiscrete(dims) - super().__init__(episode_len=episode_len) + super().__init__(episode_len=episode_len, delay=delay) - def _get_reward(self, actions): - return 1 if all(self.state == actions) else 0 + def _get_reward(self, state, actions): + return 1 if all(state == actions) else 0 class BoxIdentityEnv(IdentityEnv): @@ -79,7 +84,7 @@ class BoxIdentityEnv(IdentityEnv): self.action_space = Box(low=-1.0, high=1.0, shape=shape, dtype=np.float32) super().__init__(episode_len=episode_len) - def _get_reward(self, actions): - diff = actions - self.state + def _get_reward(self, state, actions): + diff = actions - state diff = diff[:] return -0.5 * np.dot(diff, diff) diff --git a/baselines/common/tests/envs/identity_env_test.py b/baselines/common/tests/envs/identity_env_test.py new file mode 100644 index 0000000..c73ee57 --- /dev/null +++ b/baselines/common/tests/envs/identity_env_test.py @@ -0,0 +1,36 @@ +from baselines.common.tests.envs.identity_env import DiscreteIdentityEnv + + +def test_discrete_nodelay(): + nsteps = 100 + eplen = 50 + env = DiscreteIdentityEnv(10, episode_len=eplen) + ob = env.reset() + for t in range(nsteps): + action = env.action_space.sample() + next_ob, rew, done, info = env.step(action) + assert rew == (1 if action == ob else 0) + if (t + 1) % eplen == 0: + assert done + next_ob = env.reset() + else: + assert not done + ob = next_ob + +def test_discrete_delay1(): + eplen = 50 + env = DiscreteIdentityEnv(10, episode_len=eplen, delay=1) + ob = env.reset() + prev_ob = None + for t in range(eplen): + action = env.action_space.sample() + next_ob, rew, done, info = env.step(action) + if t > 0: + assert rew == (1 if action == prev_ob else 0) + else: + assert rew == 0 + prev_ob = ob + ob = next_ob + if t < eplen - 1: + assert not done + assert done From ef7ac116cb7a75ce7bf44c421aea499fc1945697 Mon Sep 17 00:00:00 2001 From: John Schulman Date: Fri, 26 Apr 2019 17:42:43 -0700 Subject: [PATCH 14/14] (onp, np) -> (np, jp), switch jax code to use mark_slow decorator (#363) switch to mark_slow decorator --- .travis.yml | 2 +- baselines/common/tests/__init__.py | 2 ++ baselines/common/tests/test_cartpole.py | 3 ++- baselines/common/tests/test_fetchreach.py | 3 ++- baselines/common/tests/test_fixed_sequence.py | 4 +++- baselines/common/tests/test_identity.py | 7 ++++--- baselines/common/tests/test_mnist.py | 4 ++-- 7 files changed, 16 insertions(+), 9 deletions(-) diff --git a/.travis.yml b/.travis.yml index 712fc84..c68bfc1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,4 +11,4 @@ install: script: - flake8 . --show-source --statistics - - docker run baselines-test pytest -v . + - docker run -e RUNSLOW=1 baselines-test pytest -v . 
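The .travis.yml hunk above passes RUNSLOW=1 into the CI container, and the new baselines/common/tests/__init__.py below defines the mark_slow decorator that the test modules switch to. A minimal sketch of the same gating pattern, assuming only what this patch adds (the example test function is hypothetical and is shown purely to illustrate usage):

import os
import pytest

# Skip the decorated test unless the RUNSLOW environment variable is set
# (same line this patch adds to baselines/common/tests/__init__.py).
mark_slow = pytest.mark.skipif(not os.getenv('RUNSLOW'), reason='slow')

@mark_slow
def test_hypothetical_slow_case():
    # Stand-in body; the patched tests below gate full learn-function smoke runs this way.
    assert sum(range(4)) == 6

With the gate in place, the slow tests run locally via RUNSLOW=1 pytest -v . and are skipped otherwise.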
diff --git a/baselines/common/tests/__init__.py b/baselines/common/tests/__init__.py index e69de29..a6561a2 100644 --- a/baselines/common/tests/__init__.py +++ b/baselines/common/tests/__init__.py @@ -0,0 +1,2 @@ +import os, pytest +mark_slow = pytest.mark.skipif(not os.getenv('RUNSLOW'), reason='slow') \ No newline at end of file diff --git a/baselines/common/tests/test_cartpole.py b/baselines/common/tests/test_cartpole.py index 475ad1d..f9d5ac6 100644 --- a/baselines/common/tests/test_cartpole.py +++ b/baselines/common/tests/test_cartpole.py @@ -3,6 +3,7 @@ import gym from baselines.run import get_learn_function from baselines.common.tests.util import reward_per_episode_test +from baselines.common.tests import mark_slow common_kwargs = dict( total_timesteps=30000, @@ -20,7 +21,7 @@ learn_kwargs = { 'trpo_mpi': {} } -@pytest.mark.slow +@mark_slow @pytest.mark.parametrize("alg", learn_kwargs.keys()) def test_cartpole(alg): ''' diff --git a/baselines/common/tests/test_fetchreach.py b/baselines/common/tests/test_fetchreach.py index be73663..8bcd32b 100644 --- a/baselines/common/tests/test_fetchreach.py +++ b/baselines/common/tests/test_fetchreach.py @@ -3,6 +3,7 @@ import gym from baselines.run import get_learn_function from baselines.common.tests.util import reward_per_episode_test +from baselines.common.tests import mark_slow pytest.importorskip('mujoco_py') @@ -15,7 +16,7 @@ learn_kwargs = { 'her': dict(total_timesteps=2000) } -@pytest.mark.slow +@mark_slow @pytest.mark.parametrize("alg", learn_kwargs.keys()) def test_fetchreach(alg): ''' diff --git a/baselines/common/tests/test_fixed_sequence.py b/baselines/common/tests/test_fixed_sequence.py index 061c375..68ee8d3 100644 --- a/baselines/common/tests/test_fixed_sequence.py +++ b/baselines/common/tests/test_fixed_sequence.py @@ -3,6 +3,8 @@ from baselines.common.tests.envs.fixed_sequence_env import FixedSequenceEnv from baselines.common.tests.util import simple_test from baselines.run import get_learn_function +from baselines.common.tests import mark_slow + common_kwargs = dict( seed=0, @@ -21,7 +23,7 @@ learn_kwargs = { alg_list = learn_kwargs.keys() rnn_list = ['lstm'] -@pytest.mark.slow +@mark_slow @pytest.mark.parametrize("alg", alg_list) @pytest.mark.parametrize("rnn", rnn_list) def test_fixed_sequence(alg, rnn): diff --git a/baselines/common/tests/test_identity.py b/baselines/common/tests/test_identity.py index c950e5a..6b66a66 100644 --- a/baselines/common/tests/test_identity.py +++ b/baselines/common/tests/test_identity.py @@ -2,6 +2,7 @@ import pytest from baselines.common.tests.envs.identity_env import DiscreteIdentityEnv, BoxIdentityEnv, MultiDiscreteIdentityEnv from baselines.run import get_learn_function from baselines.common.tests.util import simple_test +from baselines.common.tests import mark_slow common_kwargs = dict( total_timesteps=30000, @@ -24,7 +25,7 @@ algos_disc = ['a2c', 'acktr', 'deepq', 'ppo2', 'trpo_mpi'] algos_multidisc = ['a2c', 'acktr', 'ppo2', 'trpo_mpi'] algos_cont = ['a2c', 'acktr', 'ddpg', 'ppo2', 'trpo_mpi'] -@pytest.mark.slow +@mark_slow @pytest.mark.parametrize("alg", algos_disc) def test_discrete_identity(alg): ''' @@ -39,7 +40,7 @@ def test_discrete_identity(alg): env_fn = lambda: DiscreteIdentityEnv(10, episode_len=100) simple_test(env_fn, learn_fn, 0.9) -@pytest.mark.slow +@mark_slow @pytest.mark.parametrize("alg", algos_multidisc) def test_multidiscrete_identity(alg): ''' @@ -54,7 +55,7 @@ def test_multidiscrete_identity(alg): env_fn = lambda: MultiDiscreteIdentityEnv((3,3), episode_len=100) 
     simple_test(env_fn, learn_fn, 0.9)
 
-@pytest.mark.slow
+@mark_slow
 @pytest.mark.parametrize("alg", algos_cont)
 def test_continuous_identity(alg):
     '''
diff --git a/baselines/common/tests/test_mnist.py b/baselines/common/tests/test_mnist.py
index bacc914..06a4e2b 100644
--- a/baselines/common/tests/test_mnist.py
+++ b/baselines/common/tests/test_mnist.py
@@ -4,7 +4,7 @@ import pytest
 from baselines.common.tests.envs.mnist_env import MnistEnv
 from baselines.common.tests.util import simple_test
 from baselines.run import get_learn_function
-
+from baselines.common.tests import mark_slow
 
 # TODO investigate a2c and ppo2 failures - is it due to bad hyperparameters for this problem?
 # GitHub issue https://github.com/openai/baselines/issues/189
@@ -28,7 +28,7 @@ learn_args = {
 #tests pass, but are too slow on travis. Same algorithms are covered
 # by other tests with less compute-hungry nn's and by benchmarks
 @pytest.mark.skip
-@pytest.mark.slow
+@mark_slow
 @pytest.mark.parametrize("alg", learn_args.keys())
 def test_mnist(alg):
     '''