From 97e039127f068a1475988d4921169d002cca4156 Mon Sep 17 00:00:00 2001
From: pzhokhov <peterzhokhoff@gmail.com>
Date: Mon, 26 Nov 2018 17:56:41 -0800
Subject: [PATCH 1/3] Fix ppo2 with MPI bug, other minor fixes (#735)

* joshim5 changes (width and height to WarpFrame wrapper)

* match network output with action distribution via a linear layer only if necessary (#167)

* support color vs. grayscale option in WarpFrame wrapper (#166)

* support color vs. grayscale option in WarpFrame wrapper

* Support color in other wrappers

* Updated per Peters suggestions

* fixing test failures

* ppo2 with microbatches (#168)

* pass microbatch_size to the model during construction

* microbatch fixes and test (#169)

* microbatch fixes and test

* tiny cleanup

* added assertions to the test

* vpg-related fix

* Peterz joshim5 subclass ppo2 model (#170)

* microbatch fixes and test

* tiny cleanup

* added assertions to the test

* vpg-related fix

* subclassing the model to make microbatched version of model WIP

* made microbatched model a subclass of ppo2 Model

* flake8 complaint

* mpi-less ppo2 (resolving merge conflict)

* flake8 and mpi4py imports in ppo2/model.py

* more un-mpying

* merge master

* updates to the benchmark viewer code + autopep8 (#184)

* viz docs and syntactic sugar wip

* update viewer yaml to use persistent volume claims

* move plot_util to baselines.common, update links

* use 1Tb hard drive for results viewer

* small updates to benchmark vizualizer code

* autopep8

* autopep8

* any folder can be a benchmark

* massage games image a little bit

* fixed --preload option in app.py

* remove preload from run_viewer.sh

* remove pdb breakpoints

* update bench-viewer.yaml

* fixed bug (#185)

* fixed bug

it's wrong to do the else statement, because no other nodes would start.

* changed the fix slightly
---
 baselines/ppo2/model.py      |  7 +++----
 baselines/results_plotter.py | 25 ++++++++++++-------------
 2 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/baselines/ppo2/model.py b/baselines/ppo2/model.py
index 2ce6344..2326b46 100644
--- a/baselines/ppo2/model.py
+++ b/baselines/ppo2/model.py
@@ -122,10 +122,9 @@ class Model(object):
         self.save = functools.partial(save_variables, sess=sess)
         self.load = functools.partial(load_variables, sess=sess)
 
-        if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
-            initialize()
-        else:
-            global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")
+        initialize()
+        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")
+        if MPI is not None:
             sync_from_root(sess, global_variables) #pylint: disable=E1101
 
     def train(self, lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
diff --git a/baselines/results_plotter.py b/baselines/results_plotter.py
index 057f946..66f09bd 100644
--- a/baselines/results_plotter.py
+++ b/baselines/results_plotter.py
@@ -5,7 +5,7 @@ matplotlib.use('TkAgg') # Can change to 'Agg' for non-interactive mode
 import matplotlib.pyplot as plt
 plt.rcParams['svg.fonttype'] = 'none'
 
-from baselines.bench.monitor import load_results
+from baselines.common import plot_util
 
 X_TIMESTEPS = 'timesteps'
 X_EPISODES = 'episodes'
@@ -16,7 +16,7 @@ POSSIBLE_X_AXES = [X_TIMESTEPS, X_EPISODES, X_WALLTIME]
 EPISODES_WINDOW = 100
 COLORS = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink',
         'brown', 'orange', 'teal', 'coral', 'lightblue', 'lime', 'lavender', 'turquoise',
-        'darkgreen', 'tan', 'salmon', 'gold', 'lightpurple', 'darkred', 'darkblue']
+        'darkgreen', 'tan', 'salmon', 'gold', 'darkred', 'darkblue']
 
 def rolling_window(a, window):
     shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
@@ -50,7 +50,7 @@ def plot_curves(xy_list, xaxis, yaxis, title):
     maxx = max(xy[0][-1] for xy in xy_list)
     minx = 0
     for (i, (x, y)) in enumerate(xy_list):
-        color = COLORS[i]
+        color = COLORS[i % len(COLORS)]
         plt.scatter(x, y, s=2)
         x, y_mean = window_func(x, y, EPISODES_WINDOW, np.mean) #So returns average of last EPISODE_WINDOW episodes
         plt.plot(x, y_mean, color=color)
@@ -62,19 +62,18 @@ def plot_curves(xy_list, xaxis, yaxis, title):
     fig.canvas.mpl_connect('resize_event', lambda event: plt.tight_layout())
     plt.grid(True)
 
-def plot_results(dirs, num_timesteps, xaxis, yaxis, task_name):
-    tslist = []
-    for dir in dirs:
-        ts = load_results(dir)
-        ts = ts[ts.l.cumsum() <= num_timesteps]
-        tslist.append(ts)
-    xy_list = [ts2xy(ts, xaxis, yaxis) for ts in tslist]
-    plot_curves(xy_list, xaxis, yaxis, task_name)
+
+def split_by_task(taskpath):
+    return taskpath['dirname'].split('/')[-1].split('-')[0]
+
+def plot_results(dirs, num_timesteps=10e6, xaxis=X_TIMESTEPS, yaxis=Y_REWARD, title='', split_fn=split_by_task):
+    results = plot_util.load_results(dirs)
+    plot_util.plot_results(results, xy_fn=lambda r: ts2xy(r['monitor'], xaxis, yaxis), split_fn=split_fn, average_group=True, resample=int(1e6))
 
 # Example usage in jupyter-notebook
-# from baselines import results_plotter
+# from baselines.results_plotter import plot_results
 # %matplotlib inline
-# results_plotter.plot_results(["./log"], 10e6, results_plotter.X_TIMESTEPS, "Breakout")
+# plot_results("./log")
 # Here ./log is a directory containing the monitor.csv files
 
 def main():

From f3a5abaeeb1c1c9136a01c9dbfebc173dc311fef Mon Sep 17 00:00:00 2001
From: pzhokhov <peterzhokhoff@gmail.com>
Date: Mon, 26 Nov 2018 17:57:25 -0800
Subject: [PATCH 2/3] added smoke tests of ddpg (#734)

---
 baselines/ddpg/test_smoke.py | 17 +++++++++++++++++
 baselines/run.py             |  8 +++++---
 2 files changed, 22 insertions(+), 3 deletions(-)
 create mode 100644 baselines/ddpg/test_smoke.py

diff --git a/baselines/ddpg/test_smoke.py b/baselines/ddpg/test_smoke.py
new file mode 100644
index 0000000..b5bf866
--- /dev/null
+++ b/baselines/ddpg/test_smoke.py
@@ -0,0 +1,17 @@
+from baselines.run import main as M
+
+def _run(argstr):
+    M(('--alg=ddpg --env=Pendulum-v0 --num_timesteps=0 ' + argstr).split(' '))
+
+def test_popart():
+    _run('--normalize_returns=True --popart=True')
+
+def test_noise_normal():
+    _run('--noise_type=normal_0.1')
+
+def test_noise_ou():
+    _run('--noise_type=ou_0.1')
+
+def test_noise_adaptive():
+    _run('--noise_type=adaptive-param_0.2,normal_0.1')
+
diff --git a/baselines/run.py b/baselines/run.py
index c0298f3..609de6e 100644
--- a/baselines/run.py
+++ b/baselines/run.py
@@ -181,11 +181,11 @@ def parse_cmdline_kwargs(args):
 
 
 
-def main():
+def main(args):
     # configure logger, disable logging in child MPI processes (with rank > 0)
 
     arg_parser = common_arg_parser()
-    args, unknown_args = arg_parser.parse_known_args()
+    args, unknown_args = arg_parser.parse_known_args(args)
     extra_args = parse_cmdline_kwargs(unknown_args)
 
     if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
@@ -220,5 +220,7 @@ def main():
 
         env.close()
 
+    return model
+
 if __name__ == '__main__':
-    main()
+    main(sys.argv)

From 146bbf886ba533fe08b07e01d1c0356aaf7fcc80 Mon Sep 17 00:00:00 2001
From: Timothy Lee <45348789+timeous@users.noreply.github.com>
Date: Thu, 29 Nov 2018 20:28:09 -0500
Subject: [PATCH 3/3] Removed code that prevented changes to actor loss when
 training with demos (#740)

---
 baselines/her/ddpg.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/baselines/her/ddpg.py b/baselines/her/ddpg.py
index 96384da..91e91f3 100644
--- a/baselines/her/ddpg.py
+++ b/baselines/her/ddpg.py
@@ -367,8 +367,6 @@ class DDPG(object):
             self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf)
             self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u))
 
-        self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf)
-        self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u))
         Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q'))
         pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi'))
         assert len(self._vars('main/Q')) == len(Q_grads_tf)