Compare commits


13 Commits

Author | SHA1 | Message | Date
Peter Zhokhov | 58801032fc | install mpi4py in mpi dockerfile | 2018-10-31 11:34:10 -07:00
Peter Zhokhov | b4a149a75f | fix .travis.yml | 2018-10-31 11:32:03 -07:00
Peter Zhokhov | c248bf9a46 | CI dockerfiles with and without mpi | 2018-10-31 11:27:45 -07:00
Peter Zhokhov | d1f7d12743 | mpiless ddpg | 2018-10-31 09:48:41 -07:00
Peter Zhokhov | f0d49fb67d | add assertion to test in mpi_adam; fix trpo_mpi failure without MPI on cartpole | 2018-10-30 14:45:20 -07:00
Peter Zhokhov | ef2e7246c9 | autopep8 | 2018-10-30 14:11:38 -07:00
Peter Zhokhov | 3e3e2b7998 | MpiAdam becomes regular Adam if Mpi not present | 2018-10-30 14:04:30 -07:00
Peter Zhokhov | d00f3bce34 | syntax and flake8 | 2018-10-30 09:47:39 -07:00
Peter Zhokhov | 72aa2f1251 | more MPI removal | 2018-10-29 15:43:56 -07:00
Peter Zhokhov | ea7a52b652 | further removing MPI references where unnecessary | 2018-10-29 15:38:16 -07:00
Peter Zhokhov | 064c45fa76 | Merge branch 'master' of github.com:openai/baselines into peterz_mpiless | 2018-10-29 15:31:37 -07:00
Peter Zhokhov | 6f148fdb0d | squash-merged latest master | 2018-10-29 15:28:59 -07:00
Peter Zhokhov | d96e20ff27 | make baselines run without mpi wip | 2018-10-19 17:00:41 -07:00
36 changed files with 313 additions and 1817 deletions
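The thread running through these commits is making mpi4py an optional dependency: modules guard their MPI imports and fall back to single-process code paths when the import fails. The guard, as it appears in the ppo2 hunks below:

```python
# optional-MPI import guard used throughout this branch: when mpi4py is not
# installed, MPI is bound to None and callers take the non-MPI code path
try:
    from baselines.common.mpi_adam_optimizer import MpiAdamOptimizer
    from mpi4py import MPI
    from baselines.common.mpi_util import sync_from_root
except ImportError:
    MPI = None
```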

View File

@@ -5,10 +5,14 @@ python:
services:
- docker
env:
- DOCKER_SUFFIX=py36-nompi
- DOCKER_SUFFIX=py36-mpi
install:
- pip install flake8
- docker build . -t baselines-test
- pip install flake8
- docker build -f test.dockerfile.${DOCKER_SUFFIX} -t baselines-test .
script:
- flake8 . --show-source --statistics
- docker run baselines-test pytest -v --forked .
- flake8 . --show-source --statistics
- docker run baselines-test pytest -v .
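To reproduce either CI leg locally, build the matching image and run the same steps, e.g. `docker build -f test.dockerfile.py36-nompi -t baselines-test .` followed by `docker run baselines-test pytest -v .` (the suffix mirrors the DOCKER_SUFFIX values in the matrix above; the mpi variant's dockerfile appears at the end of this comparison).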

View File

@@ -1,5 +1,3 @@
**Status:** Active (under active development, breaking changes may occur)
<img src="data/logo.jpg" width=25% align="right" /> [![Build status](https://travis-ci.org/openai/baselines.svg?branch=master)](https://travis-ci.org/openai/baselines)
# Baselines
@@ -111,9 +109,17 @@ python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num_timesteps=0 --
*NOTE:* At the moment, Mujoco training uses the VecNormalize wrapper for the environment, which is not saved correctly; loading models trained on Mujoco will therefore not work well if the environment is recreated. If necessary, you can work around that by replacing RunningMeanStd with TfRunningMeanStd in [baselines/common/vec_env/vec_normalize.py](baselines/common/vec_env/vec_normalize.py#L12). This way, the mean and std of the environment-normalizing wrapper will be saved in tensorflow variables and included in the model file; however, training is slower that way, hence it is not enabled by default.
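A minimal sketch of that workaround, assuming TfRunningMeanStd is exported by baselines.common.running_mean_std (as in the baselines repo at this time); the TfVecNormalize wrapper name is hypothetical, not part of baselines:

```python
from baselines.common.running_mean_std import TfRunningMeanStd
from baselines.common.vec_env.vec_normalize import VecNormalize

class TfVecNormalize(VecNormalize):
    """VecNormalize whose normalization statistics live in tf variables,
    so they are saved and restored with the model (hypothetical helper)."""
    def __init__(self, venv, ob=True, ret=True, **kwargs):
        super().__init__(venv, ob=ob, ret=ret, **kwargs)
        # replace the numpy-backed RunningMeanStd instances set up by the parent
        if ob:
            self.ob_rms = TfRunningMeanStd(shape=self.observation_space.shape, scope='ob_rms')
        if ret:
            self.ret_rms = TfRunningMeanStd(shape=(), scope='ret_rms')
```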
## Loading and visualizing learning curves and other training metrics
See [here](docs/viz/viz.ipynb) for instructions on how to load and display the training data.
## Using baselines with TensorBoard
Baselines logger can save data in the TensorBoard format. To do so, set environment variables `OPENAI_LOG_FORMAT` and `OPENAI_LOGDIR`:
```bash
export OPENAI_LOG_FORMAT='stdout,log,csv,tensorboard' # formats are comma-separated, but for tensorboard you only really need the last one
export OPENAI_LOGDIR=path/to/tensorboard/data
```
And you can now start TensorBoard with:
```bash
tensorboard --logdir=$OPENAI_LOGDIR
```
## Subpackages
- [A2C](baselines/a2c)

View File

@@ -37,6 +37,9 @@ class Runner(AbstractEnvRunner):
obs, rewards, dones, _ = self.env.step(actions)
self.states = states
self.dones = dones
for n, done in enumerate(dones):
if done:
self.obs[n] = self.obs[n]*0
self.obs = obs
mb_rewards.append(rewards)
mb_dones.append(self.dones)

View File

@@ -72,8 +72,8 @@ class EpisodicLifeEnv(gym.Wrapper):
# then update lives to handle bonus lives
lives = self.env.unwrapped.ale.lives()
if lives < self.lives and lives > 0:
# for Qbert sometimes we stay in lives == 0 condition for a few frames
# so it's important to keep lives > 0, so that we only reset once
# for Qbert sometimes we stay in lives == 0 condtion for a few frames
# so its important to keep lives > 0, so that we only reset once
# the environment advertises done.
done = True
self.lives = lives
@@ -129,26 +129,18 @@ class ClipRewardEnv(gym.RewardWrapper):
return np.sign(reward)
class WarpFrame(gym.ObservationWrapper):
def __init__(self, env, width=84, height=84, grayscale=True):
def __init__(self, env):
"""Warp frames to 84x84 as done in the Nature paper and later work."""
gym.ObservationWrapper.__init__(self, env)
self.width = width
self.height = height
self.grayscale = grayscale
if self.grayscale:
self.observation_space = spaces.Box(low=0, high=255,
shape=(self.height, self.width, 1), dtype=np.uint8)
else:
self.observation_space = spaces.Box(low=0, high=255,
shape=(self.height, self.width, 3), dtype=np.uint8)
self.width = 84
self.height = 84
self.observation_space = spaces.Box(low=0, high=255,
shape=(self.height, self.width, 1), dtype=np.uint8)
def observation(self, frame):
if self.grayscale:
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA)
if self.grayscale:
frame = np.expand_dims(frame, -1)
return frame
return frame[:, :, None]
class FrameStack(gym.Wrapper):
def __init__(self, env, k):
@@ -164,7 +156,7 @@ class FrameStack(gym.Wrapper):
self.k = k
self.frames = deque([], maxlen=k)
shp = env.observation_space.shape
self.observation_space = spaces.Box(low=0, high=255, shape=(shp[:-1] + (shp[-1] * k,)), dtype=env.observation_space.dtype)
self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=env.observation_space.dtype)
def reset(self):
ob = self.env.reset()
@@ -205,7 +197,7 @@ class LazyFrames(object):
def _force(self):
if self._out is None:
self._out = np.concatenate(self._frames, axis=-1)
self._out = np.concatenate(self._frames, axis=2)
self._frames = None
return self._out
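For reference, the parametrized WarpFrame signature shown in the hunk above composes with FrameStack as follows (a usage sketch; PongNoFrameskip-v4 is just an example env):

```python
import gym
from baselines.common.atari_wrappers import WarpFrame, FrameStack

env = gym.make('PongNoFrameskip-v4')
env = WarpFrame(env, width=84, height=84, grayscale=True)  # observations become (84, 84, 1) uint8
env = FrameStack(env, 4)                                   # stacked along the last axis: (84, 84, 4)
obs = env.reset()
```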

View File

@@ -60,14 +60,12 @@ def make_env(env_id, env_type, subrank=0, seed=None, reward_scale=1.0, gamestate
allow_early_resets=True)
if env_type == 'atari':
env = wrap_deepmind(env, **wrapper_kwargs)
elif env_type == 'retro':
env = retro_wrappers.wrap_deepmind_retro(env, **wrapper_kwargs)
return wrap_deepmind(env, **wrapper_kwargs)
elif reward_scale != 1:
return retro_wrappers.RewardScaler(env, reward_scale)
else:
return env
if reward_scale != 1:
env = retro_wrappers.RewardScaler(env, reward_scale)
return env
def make_mujoco_env(env_id, seed, reward_scale=1.0):
@@ -131,8 +129,6 @@ def common_arg_parser():
parser.add_argument('--num_env', help='Number of environment copies being run in parallel. When not specified, set to number of cpus for Atari, and to 1 for Mujoco', default=None, type=int)
parser.add_argument('--reward_scale', help='Reward scale factor. Default: 1.0', default=1.0, type=float)
parser.add_argument('--save_path', help='Path to save trained model to', default=None, type=str)
parser.add_argument('--save_video_interval', help='Save video every x steps (0 = disabled)', default=0, type=int)
parser.add_argument('--save_video_length', help='Length of recorded video. Default: 200', default=200, type=int)
parser.add_argument('--play', default=False, action='store_true')
return parser
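These two flags pair with the VecVideoRecorder wiring in run.py further down in this comparison; when present, a typical invocation is `python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --save_video_interval=10000 --save_video_length=200`, recording a 200-step clip every 10000 steps.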

View File

@@ -62,7 +62,7 @@ class CategoricalPdType(PdType):
def pdclass(self):
return CategoricalPd
def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
pdparam = _matching_fc(latent_vector, 'pi', self.ncat, init_scale=init_scale, init_bias=init_bias)
pdparam = fc(latent_vector, 'pi', self.ncat, init_scale=init_scale, init_bias=init_bias)
return self.pdfromflat(pdparam), pdparam
def param_shape(self):
@@ -82,7 +82,7 @@ class MultiCategoricalPdType(PdType):
return MultiCategoricalPd(self.ncats, flat)
def pdfromlatent(self, latent, init_scale=1.0, init_bias=0.0):
pdparam = _matching_fc(latent, 'pi', self.ncats.sum(), init_scale=init_scale, init_bias=init_bias)
pdparam = fc(latent, 'pi', self.ncats.sum(), init_scale=init_scale, init_bias=init_bias)
return self.pdfromflat(pdparam), pdparam
def param_shape(self):
@@ -99,7 +99,7 @@ class DiagGaussianPdType(PdType):
return DiagGaussianPd
def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
mean = _matching_fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias)
mean = fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias)
logstd = tf.get_variable(name='pi/logstd', shape=[1, self.size], initializer=tf.zeros_initializer())
pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
return self.pdfromflat(pdparam), mean
@@ -123,7 +123,7 @@ class BernoulliPdType(PdType):
def sample_dtype(self):
return tf.int32
def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
pdparam = _matching_fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias)
pdparam = fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias)
return self.pdfromflat(pdparam), pdparam
# WRONG SECOND DERIVATIVES
@@ -345,9 +345,3 @@ def validate_probtype(probtype, pdparam):
assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr # within 3 sigmas
print('ok on', probtype, pdparam)
def _matching_fc(tensor, name, size, init_scale, init_bias):
if tensor.shape[-1] == size:
return tensor
else:
return fc(tensor, name, size, init_scale=init_scale, init_bias=init_bias)
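The _matching_fc helper added above skips the extra projection when the latent width already matches the number of distribution parameters; a small illustration (placeholder shapes are hypothetical):

```python
import tensorflow as tf
from baselines.common.distributions import _matching_fc

latent_6 = tf.placeholder(tf.float32, [None, 6])
latent_64 = tf.placeholder(tf.float32, [None, 64])

out_a = _matching_fc(latent_6, 'pi', 6, init_scale=0.01, init_bias=0.0)   # pass-through: out_a is latent_6
out_b = _matching_fc(latent_64, 'pi', 6, init_scale=0.01, init_bias=0.0)  # inserts an fc layer 64 -> 6
```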

View File

@@ -1,401 +0,0 @@
import matplotlib.pyplot as plt
import os.path as osp
import json
import os
import numpy as np
import pandas
from collections import defaultdict, namedtuple
from baselines.bench import monitor
from baselines.logger import read_json, read_csv
def smooth(y, radius, mode='two_sided', valid_only=False):
'''
Smooth signal y, where radius determines the size of the window
mode='two_sided':
average over the window [max(index - radius, 0), min(index + radius, len(y)-1)]
mode='causal':
average over the window [max(index - radius, 0), index]
valid_only: put nan in entries where the full-sized window is not available
'''
assert mode in ('two_sided', 'causal')
if len(y) < 2*radius+1:
return np.ones_like(y) * y.mean()
elif mode == 'two_sided':
convkernel = np.ones(2 * radius+1)
out = np.convolve(y, convkernel,mode='same') / np.convolve(np.ones_like(y), convkernel, mode='same')
if valid_only:
out[:radius] = out[-radius:] = np.nan
elif mode == 'causal':
convkernel = np.ones(radius)
out = np.convolve(y, convkernel,mode='full') / np.convolve(np.ones_like(y), convkernel, mode='full')
out = out[:-radius+1]
if valid_only:
out[:radius] = np.nan
return out
def one_sided_ema(xolds, yolds, low=None, high=None, n=512, decay_steps=1., low_counts_threshold=1e-8):
'''
perform one-sided (causal) EMA (exponential moving average)
smoothing and resampling to an even grid with n points.
Does not do extrapolation, so we assume
xolds[0] <= low && high <= xolds[-1]
Arguments:
xolds: array or list - x values of data. Needs to be sorted in ascending order
yolds: array or list - y values of data. Has to have the same length as xolds
low: float - min value of the new x grid. By default equals to xolds[0]
high: float - max value of the new x grid. By default equals to xolds[-1]
n: int - number of points in new x grid
decay_steps: float - EMA decay factor, expressed in new x grid steps.
low_counts_threshold: float or int
- y values with counts less than this value will be set to NaN
Returns:
tuple xs, ys, count_ys where
xs - array with new x grid
ys - array of EMA of y at each point of the new x grid
count_ys - array of EMA of y counts at each point of the new x grid
'''
low = xolds[0] if low is None else low
high = xolds[-1] if high is None else high
assert xolds[0] <= low, 'low = {} < xolds[0] = {} - extrapolation not permitted!'.format(low, xolds[0])
assert xolds[-1] >= high, 'high = {} > xolds[-1] = {} - extrapolation not permitted!'.format(high, xolds[-1])
assert len(xolds) == len(yolds), 'length of xolds ({}) and yolds ({}) do not match!'.format(len(xolds), len(yolds))
xolds = xolds.astype('float64')
yolds = yolds.astype('float64')
luoi = 0 # last unused old index
sum_y = 0.
count_y = 0.
xnews = np.linspace(low, high, n)
decay_period = (high - low) / (n - 1) * decay_steps
interstep_decay = np.exp(- 1. / decay_steps)
sum_ys = np.zeros_like(xnews)
count_ys = np.zeros_like(xnews)
for i in range(n):
xnew = xnews[i]
sum_y *= interstep_decay
count_y *= interstep_decay
while True:
xold = xolds[luoi]
if xold <= xnew:
decay = np.exp(- (xnew - xold) / decay_period)
sum_y += decay * yolds[luoi]
count_y += decay
luoi += 1
else:
break
if luoi >= len(xolds):
break
sum_ys[i] = sum_y
count_ys[i] = count_y
ys = sum_ys / count_ys
ys[count_ys < low_counts_threshold] = np.nan
return xnews, ys, count_ys
def symmetric_ema(xolds, yolds, low=None, high=None, n=512, decay_steps=1., low_counts_threshold=1e-8):
'''
perform symmetric EMA (exponential moving average)
smoothing and resampling to an even grid with n points.
Does not do extrapolation, so we assume
xolds[0] <= low && high <= xolds[-1]
Arguments:
xolds: array or list - x values of data. Needs to be sorted in ascending order
yolds: array or list - y values of data. Has to have the same length as xolds
low: float - min value of the new x grid. By default equals to xolds[0]
high: float - max value of the new x grid. By default equals to xolds[-1]
n: int - number of points in new x grid
decay_steps: float - EMA decay factor, expressed in new x grid steps.
low_counts_threshold: float or int
- y values with counts less than this value will be set to NaN
Returns:
tuple xs, ys, count_ys where
xs - array with new x grid
ys - array of EMA of y at each point of the new x grid
count_ys - array of EMA of y counts at each point of the new x grid
'''
xs, ys1, count_ys1 = one_sided_ema(xolds, yolds, low, high, n, decay_steps, low_counts_threshold=0)
_, ys2, count_ys2 = one_sided_ema(-xolds[::-1], yolds[::-1], -high, -low, n, decay_steps, low_counts_threshold=0)
ys2 = ys2[::-1]
count_ys2 = count_ys2[::-1]
count_ys = count_ys1 + count_ys2
ys = (ys1 * count_ys1 + ys2 * count_ys2) / count_ys
ys[count_ys < low_counts_threshold] = np.nan
return xs, ys, count_ys
Result = namedtuple('Result', 'monitor progress dirname metadata')
Result.__new__.__defaults__ = (None,) * len(Result._fields)
def load_results(root_dir_or_dirs, enable_progress=True, enable_monitor=True, verbose=False):
'''
load summaries of runs from a list of directories (including subdirectories)
Arguments:
enable_progress: bool - if True, will attempt to load data from progress.csv files (data saved by logger). Default: True
enable_monitor: bool - if True, will attempt to load data from monitor.csv files (data saved by Monitor environment wrapper). Default: True
verbose: bool - if True, will print out list of directories from which the data is loaded. Default: False
Returns:
List of Result objects with the following fields:
- dirname - path to the directory data was loaded from
- metadata - run metadata (such as command-line arguments and anything else in the metadata.json file)
- monitor - if enable_monitor is True, this field contains pandas dataframe with loaded monitor.csv file (or aggregate of all *.monitor.csv files in the directory)
- progress - if enable_progress is True, this field contains pandas dataframe with loaded progress.csv file
'''
if isinstance(root_dir_or_dirs, str):
rootdirs = [osp.expanduser(root_dir_or_dirs)]
else:
rootdirs = [osp.expanduser(d) for d in root_dir_or_dirs]
allresults = []
for rootdir in rootdirs:
assert osp.exists(rootdir), "%s doesn't exist"%rootdir
for dirname, dirs, files in os.walk(rootdir):
if '-proc' in dirname:
files[:] = []
continue
if set(['metadata.json', 'monitor.json', 'monitor.csv', 'progress.json', 'progress.csv']).intersection(files):
# used to be uncommented, which means do not go deeper than current directory if any of the data files
# are found
# dirs[:] = []
result = {'dirname' : dirname}
if "metadata.json" in files:
with open(osp.join(dirname, "metadata.json"), "r") as fh:
result['metadata'] = json.load(fh)
progjson = osp.join(dirname, "progress.json")
progcsv = osp.join(dirname, "progress.csv")
if enable_progress:
if osp.exists(progjson):
result['progress'] = pandas.DataFrame(read_json(progjson))
elif osp.exists(progcsv):
try:
result['progress'] = read_csv(progcsv)
except pandas.errors.EmptyDataError:
print('skipping progress file in ', dirname, 'empty data')
else:
if verbose: print('skipping %s: no progress file'%dirname)
if enable_monitor:
try:
result['monitor'] = pandas.DataFrame(monitor.load_results(dirname))
except monitor.LoadMonitorResultsError:
print('skipping %s: no monitor files'%dirname)
except Exception as e:
print('exception loading monitor file in %s: %s'%(dirname, e))
if result.get('monitor') is not None or result.get('progress') is not None:
allresults.append(Result(**result))
if verbose:
print('successfully loaded %s'%dirname)
if verbose: print('loaded %i results'%len(allresults))
return allresults
COLORS = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink',
'brown', 'orange', 'teal', 'lightblue', 'lime', 'lavender', 'turquoise',
'darkgreen', 'tan', 'salmon', 'gold', 'darkred', 'darkblue']
def default_xy_fn(r):
x = np.cumsum(r.monitor.l)
y = smooth(r.monitor.r, radius=10)
return x,y
def default_split_fn(r):
import re
# match name between slash and -<digits> at the end of the string
# (slash in the beginning or -<digits> in the end or either may be missing)
match = re.search(r'[^/-]+(?=(-\d+)?\Z)', r.dirname)
if match:
return match.group(0)
def plot_results(
allresults, *,
xy_fn=default_xy_fn,
split_fn=default_split_fn,
group_fn=default_split_fn,
average_group=False,
shaded_std=True,
shaded_err=True,
figsize=None,
legend_outside=False,
resample=0,
smooth_step=1.0,
):
'''
Plot multiple Results objects
xy_fn: function Result -> x,y - function that converts results objects into tuple of x and y values.
By default, x is cumsum of episode lengths, and y is episode rewards
split_fn: function Result -> hashable - function that converts results objects into keys to split curves into sub-panels by.
That is, the results r for which split_fn(r) is different will be put on different sub-panels.
By default, the portion of r.dirname between last / and -<digits> is returned. The sub-panels are
stacked vertically in the figure.
group_fn: function Result -> hashable - function that converts results objects into keys to group curves by.
That is, the results r for which group_fn(r) is the same will be put into the same group.
Curves in the same group have the same color (if average_group is False), or averaged over
(if average_group is True). The default value is the same as default value for split_fn
average_group: bool - if True, will average the curves in the same group and plot the mean. Enables resampling
(if resample = 0, will use 512 steps)
shaded_std: bool - if True (default), the shaded region corresponding to standard deviation of the group of curves will be
shown (only applicable if average_group = True)
shaded_err: bool - if True (default), the shaded region corresponding to error in mean estimate of the group of curves
(that is, standard deviation divided by square root of number of curves) will be
shown (only applicable if average_group = True)
figsize: tuple or None - size of the resulting figure (including sub-panels). By default, width is 6 and height is 6 times number of
sub-panels.
legend_outside: bool - if True, will place the legend outside of the sub-panels.
resample: int - if not zero, size of the uniform grid in x direction to resample onto. Resampling is performed via symmetric
EMA smoothing (see the docstring for symmetric_ema).
Default is zero (no resampling). Note that if average_group is True, resampling is necessary; in that case, default
value is 512.
smooth_step: float - when resampling (i.e. when resample > 0 or average_group is True), use this EMA decay parameter (in units of the new grid step).
See docstrings for decay_steps in symmetric_ema or one_sided_ema functions.
'''
if split_fn is None: split_fn = lambda _ : ''
if group_fn is None: group_fn = lambda _ : ''
sk2r = defaultdict(list) # splitkey2results
for result in allresults:
splitkey = split_fn(result)
sk2r[splitkey].append(result)
assert len(sk2r) > 0
assert isinstance(resample, int), "0: don't resample. <integer>: that many samples"
nrows = len(sk2r)
ncols = 1
figsize = figsize or (6, 6 * nrows)
f, axarr = plt.subplots(nrows, ncols, sharex=False, squeeze=False, figsize=figsize)
groups = list(set(group_fn(result) for result in allresults))
default_samples = 512
if average_group:
resample = resample or default_samples
for (isplit, sk) in enumerate(sorted(sk2r.keys())):
g2l = {}
g2c = defaultdict(int)
sresults = sk2r[sk]
gresults = defaultdict(list)
ax = axarr[isplit][0]
for result in sresults:
group = group_fn(result)
g2c[group] += 1
x, y = xy_fn(result)
if x is None: x = np.arange(len(y))
x, y = map(np.asarray, (x, y))
if average_group:
gresults[group].append((x,y))
else:
if resample:
x, y, counts = symmetric_ema(x, y, x[0], x[-1], resample, decay_steps=smooth_step)
l, = ax.plot(x, y, color=COLORS[groups.index(group) % len(COLORS)])
g2l[group] = l
if average_group:
for group in sorted(groups):
xys = gresults[group]
if not any(xys):
continue
color = COLORS[groups.index(group) % len(COLORS)]
origxs = [xy[0] for xy in xys]
minxlen = min(map(len, origxs))
def allequal(qs):
return all((q==qs[0]).all() for q in qs[1:])
if resample:
low = max(x[0] for x in origxs)
high = min(x[-1] for x in origxs)
usex = np.linspace(low, high, resample)
ys = []
for (x, y) in xys:
ys.append(symmetric_ema(x, y, low, high, resample, decay_steps=smooth_step)[1])
else:
assert allequal([x[:minxlen] for x in origxs]),\
'If you want to average unevenly sampled data, set resample=<number of samples you want>'
usex = origxs[0]
ys = [xy[1][:minxlen] for xy in xys]
ymean = np.mean(ys, axis=0)
ystd = np.std(ys, axis=0)
ystderr = ystd / np.sqrt(len(ys))
l, = axarr[isplit][0].plot(usex, ymean, color=color)
g2l[group] = l
if shaded_err:
ax.fill_between(usex, ymean - ystderr, ymean + ystderr, color=color, alpha=.4)
if shaded_std:
ax.fill_between(usex, ymean - ystd, ymean + ystd, color=color, alpha=.2)
# https://matplotlib.org/users/legend_guide.html
plt.tight_layout()
if any(g2l.keys()):
ax.legend(
g2l.values(),
['%s (%i)'%(g, g2c[g]) for g in g2l] if average_group else g2l.keys(),
loc=2 if legend_outside else None,
bbox_to_anchor=(1,1) if legend_outside else None)
ax.set_title(sk)
return f, axarr
def regression_analysis(df):
xcols = list(df.columns.copy())
xcols.remove('score')
ycols = ['score']
import statsmodels.api as sm
mod = sm.OLS(df[ycols], sm.add_constant(df[xcols]), hasconst=False)
res = mod.fit()
print(res.summary())
def test_smooth():
norig = 100
nup = 300
ndown = 30
xs = np.cumsum(np.random.rand(norig) * 10 / norig)
yclean = np.sin(xs)
ys = yclean + .1 * np.random.randn(yclean.size)
xup, yup, _ = symmetric_ema(xs, ys, xs.min(), xs.max(), nup, decay_steps=nup/ndown)
xdown, ydown, _ = symmetric_ema(xs, ys, xs.min(), xs.max(), ndown, decay_steps=ndown/ndown)
xsame, ysame, _ = symmetric_ema(xs, ys, xs.min(), xs.max(), norig, decay_steps=norig/ndown)
plt.plot(xs, ys, label='orig', marker='x')
plt.plot(xup, yup, label='up', marker='x')
plt.plot(xdown, ydown, label='down', marker='x')
plt.plot(xsame, ysame, label='same', marker='x')
plt.plot(xs, yclean, label='clean', marker='x')
plt.legend()
plt.show()
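Taken together, load_results and plot_results above support a short workflow like the following (a sketch; ~/logs/experiment stands in for a directory tree containing monitor.csv / progress.csv files):

```python
from baselines.common import plot_util as pu

# recursively load every run found under the directory tree
results = pu.load_results('~/logs/experiment')

# one sub-panel per split key; curves in a group are averaged on a resampled grid
f, axarr = pu.plot_results(results, average_group=True, shaded_std=True, resample=512)
```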

View File

@@ -132,8 +132,10 @@ class MovieRecord(gym.Wrapper):
self.epcount = 0
def reset(self):
if self.epcount % self.k == 0:
print('saving movie this episode', self.savedir)
self.env.unwrapped.movie_path = self.savedir
else:
print('not saving this episode')
self.env.unwrapped.movie_path = None
self.env.unwrapped.movie = None
self.epcount += 1

View File

@@ -103,9 +103,9 @@ def test_coexistence(learn_fn, network_fn):
kwargs.update(learn_kwargs[learn_fn])
learn = partial(learn, env=env, network=network_fn, total_timesteps=0, **kwargs)
make_session(make_default=True, graph=tf.Graph())
make_session(make_default=True, graph=tf.Graph());
model1 = learn(seed=1)
make_session(make_default=True, graph=tf.Graph())
make_session(make_default=True, graph=tf.Graph());
model2 = learn(seed=2)
model1.step(env.observation_space.sample())

View File

@@ -165,10 +165,6 @@ def function(inputs, outputs, updates=None, givens=None):
outputs: [tf.Variable] or tf.Variable
list of outputs or a single output to be returned from function. Returned
value will also have the same shape.
updates: [tf.Operation] or tf.Operation
list of update functions or single update function that will be run whenever
the function is called. The return is ignored.
"""
if isinstance(outputs, list):
return _Function(inputs, outputs, updates, givens=givens)

View File

@@ -32,11 +32,6 @@ class VecEnv(ABC):
"""
closed = False
viewer = None
metadata = {
'render.modes': ['human', 'rgb_array']
}
def __init__(self, num_envs, observation_space, action_space):
self.num_envs = num_envs
self.observation_space = observation_space

View File

@@ -20,6 +20,9 @@ class DummyVecEnv(VecEnv):
env = self.envs[0]
VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space)
obs_space = env.observation_space
if isinstance(obs_space, spaces.MultiDiscrete):
obs_space.shape = obs_space.shape[0]
self.keys, shapes, dtypes = obs_space_info(obs_space)
self.buf_obs = { k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys }
@@ -76,6 +79,6 @@ class DummyVecEnv(VecEnv):
def render(self, mode='human'):
if self.num_envs == 1:
return self.envs[0].render(mode=mode)
self.envs[0].render(mode=mode)
else:
return super().render(mode=mode)
super().render(mode=mode)

View File

@@ -1,49 +0,0 @@
"""
Tests for asynchronous vectorized environments.
"""
import gym
import pytest
import os
import glob
import tempfile
from .dummy_vec_env import DummyVecEnv
from .shmem_vec_env import ShmemVecEnv
from .subproc_vec_env import SubprocVecEnv
from .vec_video_recorder import VecVideoRecorder
@pytest.mark.parametrize('klass', (DummyVecEnv, ShmemVecEnv, SubprocVecEnv))
@pytest.mark.parametrize('num_envs', (1, 4))
@pytest.mark.parametrize('video_length', (10, 100))
@pytest.mark.parametrize('video_interval', (1, 50))
def test_video_recorder(klass, num_envs, video_length, video_interval):
"""
Wrap an existing VecEnv with VecVideoRecorder,
make (video_interval + video_length + 1) steps,
then check that the file is present
"""
def make_fn():
env = gym.make('PongNoFrameskip-v4')
return env
fns = [make_fn for _ in range(num_envs)]
env = klass(fns)
with tempfile.TemporaryDirectory() as video_path:
env = VecVideoRecorder(env, video_path, record_video_trigger=lambda x: x % video_interval == 0, video_length=video_length)
env.reset()
for _ in range(video_interval + video_length + 1):
env.step([0] * num_envs)
env.close()
recorded_video = glob.glob(os.path.join(video_path, "*.mp4"))
# first and second step
assert len(recorded_video) == 2
# Files are not empty
assert all(os.stat(p).st_size != 0 for p in recorded_video)

View File

@@ -1,89 +0,0 @@
import os
from baselines import logger
from baselines.common.vec_env import VecEnvWrapper
from gym.wrappers.monitoring import video_recorder
class VecVideoRecorder(VecEnvWrapper):
"""
Wrap VecEnv to record rendered image as mp4 video.
"""
def __init__(self, venv, directory, record_video_trigger, video_length=200):
"""
# Arguments
venv: VecEnv to wrap
directory: Where to save videos
record_video_trigger:
Function that defines when to start recording.
The function takes the current number of step,
and returns whether we should start recording or not.
video_length: Length of recorded video
"""
VecEnvWrapper.__init__(self, venv)
self.record_video_trigger = record_video_trigger
self.video_recorder = None
self.directory = os.path.abspath(directory)
if not os.path.exists(self.directory): os.mkdir(self.directory)
self.file_prefix = "vecenv"
self.file_infix = '{}'.format(os.getpid())
self.step_id = 0
self.video_length = video_length
self.recording = False
self.recorded_frames = 0
def reset(self):
obs = self.venv.reset()
self.start_video_recorder()
return obs
def start_video_recorder(self):
self.close_video_recorder()
base_path = os.path.join(self.directory, '{}.video.{}.video{:06}'.format(self.file_prefix, self.file_infix, self.step_id))
self.video_recorder = video_recorder.VideoRecorder(
env=self.venv,
base_path=base_path,
metadata={'step_id': self.step_id}
)
self.video_recorder.capture_frame()
self.recorded_frames = 1
self.recording = True
def _video_enabled(self):
return self.record_video_trigger(self.step_id)
def step_wait(self):
obs, rews, dones, infos = self.venv.step_wait()
self.step_id += 1
if self.recording:
self.video_recorder.capture_frame()
self.recorded_frames += 1
if self.recorded_frames > self.video_length:
logger.info("Saving video to ", self.video_recorder.path)
self.close_video_recorder()
elif self._video_enabled():
self.start_video_recorder()
return obs, rews, dones, infos
def close_video_recorder(self):
if self.recording:
self.video_recorder.close()
self.recording = False
self.recorded_frames = 0
def close(self):
VecEnvWrapper.close(self)
self.close_video_recorder()
def __del__(self):
self.close()
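A usage sketch of the wrapper above (grounded in the deleted test earlier in this comparison):

```python
import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env.vec_video_recorder import VecVideoRecorder

venv = DummyVecEnv([lambda: gym.make('PongNoFrameskip-v4')])
# start a 200-frame recording every 10000 steps, saved as mp4 under ./videos
venv = VecVideoRecorder(venv, 'videos', record_video_trigger=lambda step: step % 10000 == 0, video_length=200)
obs = venv.reset()
```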

View File

@@ -66,6 +66,7 @@ def learn(network, env,
action_noise = None
param_noise = None
nb_actions = env.action_space.shape[-1]
if noise_type is not None:
for current_noise_type in noise_type.split(','):
current_noise_type = current_noise_type.strip()

View File

@@ -67,6 +67,7 @@ class DDPG(object):
def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None,
gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True,
batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf),
adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1,
critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.):
# Inputs.
self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0')
@@ -185,7 +186,7 @@ class DDPG(object):
normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1])
self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
if self.critic_l2_reg > 0.:
critic_reg_vars = [var for var in self.critic.trainable_vars if var.name.endswith('/w:0') and 'output' not in var.name]
critic_reg_vars = [var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name]
for var in critic_reg_vars:
logger.info(' regularizing: {}'.format(var.name))
logger.info(' applying l2 regularization with {}'.format(self.critic_l2_reg))
@@ -270,7 +271,7 @@ class DDPG(object):
if self.action_noise is not None and apply_noise:
noise = self.action_noise()
assert noise.shape == action[0].shape
assert noise.shape == action.shape
action += noise
action = np.clip(action, self.action_range[0], self.action_range[1])

View File

@@ -42,7 +42,7 @@ class Critic(Model):
with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
x = tf.concat([obs, action], axis=-1) # this assumes observation and action can be concatenated
x = self.network_builder(x)
x = tf.layers.dense(x, 1, kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3), name='output')
x = tf.layers.dense(x, 1, kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3))
return x
@property

View File

@@ -1,17 +0,0 @@
from baselines.run import main as M
def _run(argstr):
M(('--alg=ddpg --env=Pendulum-v0 --num_timesteps=0 ' + argstr).split(' '))
def test_popart():
_run('--normalize_returns=True --popart=True')
def test_noise_normal():
_run('--noise_type=normal_0.1')
def test_noise_ou():
_run('--noise_type=ou_0.1')
def test_noise_adaptive():
_run('--noise_type=adaptive-param_0.2,normal_0.1')
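Each of these smoke tests drives the full CLI path, so they are equivalent to invocations such as `python -m baselines.run --alg=ddpg --env=Pendulum-v0 --num_timesteps=0 --noise_type=adaptive-param_0.2,normal_0.1`; the noise_type string is parsed by the comma-splitting loop shown in the ddpg learn hunk above.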

View File

@@ -5,4 +5,4 @@ from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer
def wrap_atari_dqn(env):
from baselines.common.atari_wrappers import wrap_deepmind
return wrap_deepmind(env, frame_stack=True, scale=False)
return wrap_deepmind(env, frame_stack=True, scale=True)

View File

@@ -33,7 +33,7 @@ The functions in this file can be used to create the following functions:
stochastic: bool
if set to False all the actions are always deterministic (default False)
update_eps_ph: float
update epsilon to a new value, if negative no update happens
update epsilon a new value, if negative not update happens
(default: no update)
reset_ph: bool
reset the perturbed policy by sampling a new perturbation

View File

@@ -2,9 +2,9 @@ import tensorflow as tf
import tensorflow.contrib.layers as layers
def _mlp(hiddens, input_, num_actions, scope, reuse=False, layer_norm=False):
def _mlp(hiddens, inpt, num_actions, scope, reuse=False, layer_norm=False):
with tf.variable_scope(scope, reuse=reuse):
out = input_
out = inpt
for hidden in hiddens:
out = layers.fully_connected(out, num_outputs=hidden, activation_fn=None)
if layer_norm:
@@ -21,9 +21,6 @@ def mlp(hiddens=[], layer_norm=False):
----------
hiddens: [int]
list of sizes of hidden layers
layer_norm: bool
if true applies layer normalization for every layer
as described in https://arxiv.org/abs/1607.06450
Returns
-------
@@ -33,9 +30,9 @@ def mlp(hiddens=[], layer_norm=False):
return lambda *args, **kwargs: _mlp(hiddens, layer_norm=layer_norm, *args, **kwargs)
def _cnn_to_mlp(convs, hiddens, dueling, input_, num_actions, scope, reuse=False, layer_norm=False):
def _cnn_to_mlp(convs, hiddens, dueling, inpt, num_actions, scope, reuse=False, layer_norm=False):
with tf.variable_scope(scope, reuse=reuse):
out = input_
out = inpt
with tf.variable_scope("convnet"):
for num_outputs, kernel_size, stride in convs:
out = layers.convolution2d(out,
@@ -75,7 +72,7 @@ def cnn_to_mlp(convs, hiddens, dueling=False, layer_norm=False):
Parameters
----------
convs: [(int, int, int)]
convs: [(int, int int)]
list of convolutional layers in form of
(num_outputs, kernel_size, stride)
hiddens: [int]
@@ -83,9 +80,6 @@ def cnn_to_mlp(convs, hiddens, dueling=False, layer_norm=False):
dueling: bool
if true double the output MLP to compute a baseline
for action scores
layer_norm: bool
if true applies layer normalization for every layer
as described in https://arxiv.org/abs/1607.06450
Returns
-------
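As a usage sketch of the model builders documented above (the conv spec mirrors the standard DQN architecture; all values are illustrative):

```python
from baselines.deepq.models import cnn_to_mlp, mlp

# convolutional tower of (num_outputs, kernel_size, stride) layers followed by a
# 256-unit hidden layer; dueling=True doubles the head to compute a state-value baseline
q_func = cnn_to_mlp(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], hiddens=[256], dueling=True)

# or a plain fully-connected network for low-dimensional observations
q_func_mlp = mlp(hiddens=[64, 64])
```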

View File

@@ -367,6 +367,8 @@ class DDPG(object):
self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf)
self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u))
self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf)
self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u))
Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q'))
pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi'))
assert len(self._vars('main/Q')) == len(Q_grads_tf)

View File

@@ -54,7 +54,7 @@ class HumanOutputFormat(KVWriter, SeqWriter):
# Write out the data
dashes = '-' * (keywidth + valwidth + 7)
lines = [dashes]
for (key, val) in sorted(key2str.items(), key=lambda kv: kv[0].lower()):
for (key, val) in sorted(key2str.items()):
lines.append('| %s%s | %s%s |' % (
key,
' ' * (keywidth - len(key)),

View File

@@ -97,7 +97,7 @@ def learn(env, policy_fn, *,
ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return
lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule
clip_param = clip_param * lrmult # Annealed clipping parameter epsilon
clip_param = clip_param * lrmult # Annealed cliping parameter epislon
ob = U.get_placeholder_cached(name="ob")
ac = pi.pdtype.sample_placeholder([None])

View File

@@ -20,6 +20,3 @@ def atari():
lr=lambda f : f * 2.5e-4,
cliprange=lambda f : f * 0.1,
)
def retro():
return atari()

View File

@@ -1,76 +0,0 @@
import tensorflow as tf
import numpy as np
from baselines.ppo2.model import Model
class MicrobatchedModel(Model):
"""
Model that does training one microbatch at a time - when gradient computation
on the entire minibatch causes some overflow
"""
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
nsteps, ent_coef, vf_coef, max_grad_norm, microbatch_size):
self.nmicrobatches = nbatch_train // microbatch_size
self.microbatch_size = microbatch_size
assert nbatch_train % microbatch_size == 0, 'microbatch_size ({}) should divide nbatch_train ({}) evenly'.format(microbatch_size, nbatch_train)
super().__init__(
policy=policy,
ob_space=ob_space,
ac_space=ac_space,
nbatch_act=nbatch_act,
nbatch_train=microbatch_size,
nsteps=nsteps,
ent_coef=ent_coef,
vf_coef=vf_coef,
max_grad_norm=max_grad_norm)
self.grads_ph = [tf.placeholder(dtype=g.dtype, shape=g.shape) for g in self.grads]
grads_ph_and_vars = list(zip(self.grads_ph, self.var))
self._apply_gradients_op = self.trainer.apply_gradients(grads_ph_and_vars)
def train(self, lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
assert states is None, "microbatches with recurrent models are not supported yet"
# Here we calculate advantage A(s,a) = R + yV(s') - V(s)
# Returns = R + yV(s')
advs = returns - values
# Normalize the advantages
advs = (advs - advs.mean()) / (advs.std() + 1e-8)
# Initialize empty list for per-microbatch stats like pg_loss, vf_loss, entropy, approxkl (whatever is in self.stats_list)
stats_vs = []
for microbatch_idx in range(self.nmicrobatches):
_sli = range(microbatch_idx * self.microbatch_size, (microbatch_idx+1) * self.microbatch_size)
td_map = {
self.train_model.X: obs[_sli],
self.A:actions[_sli],
self.ADV:advs[_sli],
self.R:returns[_sli],
self.CLIPRANGE:cliprange,
self.OLDNEGLOGPAC:neglogpacs[_sli],
self.OLDVPRED:values[_sli]
}
# Compute gradient on a microbatch (note that variables do not change here) ...
grad_v, stats_v = self.sess.run([self.grads, self.stats_list], td_map)
if microbatch_idx == 0:
sum_grad_v = grad_v
else:
# .. and add to the total of the gradients
for i, g in enumerate(grad_v):
sum_grad_v[i] += g
stats_vs.append(stats_v)
feed_dict = {ph: sum_g / self.nmicrobatches for ph, sum_g in zip(self.grads_ph, sum_grad_v)}
feed_dict[self.LR] = lr
# Update variables using average of the gradients
self.sess.run(self._apply_gradients_op, feed_dict)
# Return average of the stats
return np.mean(np.array(stats_vs), axis=0).tolist()
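The deleted test later in this comparison exercises this class through ppo2.learn's model_fn hook; in short (a sketch mirroring that test):

```python
from functools import partial
import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.ppo2.ppo2 import learn
from baselines.ppo2.microbatched_model import MicrobatchedModel

env = DummyVecEnv([lambda: gym.make('CartPole-v0')])
# compute gradients on microbatches of 2 samples, average, then apply once
learn(network='mlp', env=env, nsteps=32, total_timesteps=32,
      model_fn=partial(MicrobatchedModel, microbatch_size=2))
```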

View File

@@ -1,156 +0,0 @@
import tensorflow as tf
import functools
from baselines.common.tf_util import get_session, save_variables, load_variables
from baselines.common.tf_util import initialize
try:
from baselines.common.mpi_adam_optimizer import MpiAdamOptimizer
from mpi4py import MPI
from baselines.common.mpi_util import sync_from_root
except ImportError:
MPI = None
class Model(object):
"""
We use this object to :
__init__:
- Creates the step_model
- Creates the train_model
train():
- Make the training part (feedforward and retropropagation of gradients)
save/load():
- Save load the model
"""
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
nsteps, ent_coef, vf_coef, max_grad_norm, microbatch_size=None):
self.sess = sess = get_session()
with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
# CREATE OUR TWO MODELS
# act_model that is used for sampling
act_model = policy(nbatch_act, 1, sess)
# Train model for training
if microbatch_size is None:
train_model = policy(nbatch_train, nsteps, sess)
else:
train_model = policy(microbatch_size, nsteps, sess)
# CREATE THE PLACEHOLDERS
self.A = A = train_model.pdtype.sample_placeholder([None])
self.ADV = ADV = tf.placeholder(tf.float32, [None])
self.R = R = tf.placeholder(tf.float32, [None])
# Keep track of old actor
self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
# Keep track of old critic
self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
self.LR = LR = tf.placeholder(tf.float32, [])
# Cliprange
self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])
neglogpac = train_model.pd.neglogp(A)
# Calculate the entropy
# Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
entropy = tf.reduce_mean(train_model.pd.entropy())
# CALCULATE THE LOSS
# Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss
# Clip the value to reduce variability during Critic training
# Get the predicted value
vpred = train_model.vf
vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE)
# Unclipped value
vf_losses1 = tf.square(vpred - R)
# Clipped value
vf_losses2 = tf.square(vpredclipped - R)
vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
# Calculate ratio (pi current policy / pi old policy)
ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
# Defining Loss = - J is equivalent to max J
pg_losses = -ADV * ratio
pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)
# Final PG loss
pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
# Total loss
loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
# UPDATE THE PARAMETERS USING LOSS
# 1. Get the model parameters
params = tf.trainable_variables('ppo2_model')
# 2. Build our trainer
if MPI is not None:
self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5)
else:
self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
# 3. Calculate the gradients
grads_and_var = self.trainer.compute_gradients(loss, params)
grads, var = zip(*grads_and_var)
if max_grad_norm is not None:
# Clip the gradients (normalize)
grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
grads_and_var = list(zip(grads, var))
# zip aggregate each gradient with parameters associated
# For instance zip(ABCD, xyza) => Ax, By, Cz, Da
self.grads = grads
self.var = var
self._train_op = self.trainer.apply_gradients(grads_and_var)
self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]
self.train_model = train_model
self.act_model = act_model
self.step = act_model.step
self.value = act_model.value
self.initial_state = act_model.initial_state
self.save = functools.partial(save_variables, sess=sess)
self.load = functools.partial(load_variables, sess=sess)
initialize()
global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")
if MPI is not None:
sync_from_root(sess, global_variables) #pylint: disable=E1101
def train(self, lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
# Here we calculate advantage A(s,a) = R + yV(s') - V(s)
# Returns = R + yV(s')
advs = returns - values
# Normalize the advantages
advs = (advs - advs.mean()) / (advs.std() + 1e-8)
td_map = {
self.train_model.X : obs,
self.A : actions,
self.ADV : advs,
self.R : returns,
self.LR : lr,
self.CLIPRANGE : cliprange,
self.OLDNEGLOGPAC : neglogpacs,
self.OLDVPRED : values
}
if states is not None:
td_map[self.train_model.S] = states
td_map[self.train_model.M] = masks
return self.sess.run(
self.stats_list + [self._train_op],
td_map
)[:-1]
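For reference, the pg_loss assembled above is the PPO clipped surrogate objective; in the code's notation, with ratio r_t = exp(OLDNEGLOGPAC - neglogpac), i.e. pi_new/pi_old, and clip range epsilon = CLIPRANGE:

```latex
L^{PG} = \mathbb{E}_t\left[ \max\left( -\hat{A}_t\, r_t,\; -\hat{A}_t\, \operatorname{clip}(r_t,\, 1-\epsilon,\, 1+\epsilon) \right) \right]
```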

View File

@@ -1,17 +1,226 @@
import os
import time
import functools
import numpy as np
import os.path as osp
import tensorflow as tf
from baselines import logger
from collections import deque
from baselines.common import explained_variance, set_global_seeds
from baselines.common.policies import build_policy
from baselines.common.runners import AbstractEnvRunner
from baselines.common.tf_util import get_session, save_variables, load_variables
try:
from baselines.common.mpi_adam_optimizer import MpiAdamOptimizer
from mpi4py import MPI
from baselines.common.mpi_util import sync_from_root
except ImportError:
MPI = None
from baselines.ppo2.runner import Runner
from baselines.common.tf_util import initialize
class Model(object):
"""
We use this object to :
__init__:
- Creates the step_model
- Creates the train_model
train():
- Make the training part (feedforward and retropropagation of gradients)
save/load():
- Save load the model
"""
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
nsteps, ent_coef, vf_coef, max_grad_norm):
sess = get_session()
with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
# CREATE OUR TWO MODELS
# act_model that is used for sampling
act_model = policy(nbatch_act, 1, sess)
# Train model for training
train_model = policy(nbatch_train, nsteps, sess)
# CREATE THE PLACEHOLDERS
A = train_model.pdtype.sample_placeholder([None])
ADV = tf.placeholder(tf.float32, [None])
R = tf.placeholder(tf.float32, [None])
# Keep track of old actor
OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
# Keep track of old critic
OLDVPRED = tf.placeholder(tf.float32, [None])
LR = tf.placeholder(tf.float32, [])
# Cliprange
CLIPRANGE = tf.placeholder(tf.float32, [])
neglogpac = train_model.pd.neglogp(A)
# Calculate the entropy
# Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
entropy = tf.reduce_mean(train_model.pd.entropy())
# CALCULATE THE LOSS
# Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss
# Clip the value to reduce variability during Critic training
# Get the predicted value
vpred = train_model.vf
vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE)
# Unclipped value
vf_losses1 = tf.square(vpred - R)
# Clipped value
vf_losses2 = tf.square(vpredclipped - R)
vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
# Calculate ratio (pi current policy / pi old policy)
ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
# Defining Loss = - J is equivalent to max J
pg_losses = -ADV * ratio
pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)
# Final PG loss
pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
# Total loss
loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
# UPDATE THE PARAMETERS USING LOSS
# 1. Get the model parameters
params = tf.trainable_variables('ppo2_model')
# 2. Build our trainer
if MPI is not None:
trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5)
else:
trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
# 3. Calculate the gradients
grads_and_var = trainer.compute_gradients(loss, params)
grads, var = zip(*grads_and_var)
if max_grad_norm is not None:
# Clip the gradients (normalize)
grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
grads_and_var = list(zip(grads, var))
# zip aggregate each gradient with parameters associated
# For instance zip(ABCD, xyza) => Ax, By, Cz, Da
_train = trainer.apply_gradients(grads_and_var)
def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
# Here we calculate advantage A(s,a) = R + yV(s') - V(s)
# Returns = R + yV(s')
advs = returns - values
# Normalize the advantages
advs = (advs - advs.mean()) / (advs.std() + 1e-8)
td_map = {train_model.X:obs, A:actions, ADV:advs, R:returns, LR:lr,
CLIPRANGE:cliprange, OLDNEGLOGPAC:neglogpacs, OLDVPRED:values}
if states is not None:
td_map[train_model.S] = states
td_map[train_model.M] = masks
return sess.run(
[pg_loss, vf_loss, entropy, approxkl, clipfrac, _train],
td_map
)[:-1]
self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
self.train = train
self.train_model = train_model
self.act_model = act_model
self.step = act_model.step
self.value = act_model.value
self.initial_state = act_model.initial_state
self.save = functools.partial(save_variables, sess=sess)
self.load = functools.partial(load_variables, sess=sess)
if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
initialize()
global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")
if MPI is not None:
sync_from_root(sess, global_variables) #pylint: disable=E1101
class Runner(AbstractEnvRunner):
"""
We use this object to make a mini batch of experiences
__init__:
- Initialize the runner
run():
- Make a mini batch
"""
def __init__(self, *, env, model, nsteps, gamma, lam):
super().__init__(env=env, model=model, nsteps=nsteps)
# Lambda used in GAE (General Advantage Estimation)
self.lam = lam
# Discount rate
self.gamma = gamma
def run(self):
# Here, we init the lists that will contain the mb of experiences
mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [],[],[],[],[],[]
mb_states = self.states
epinfos = []
# For n in range number of steps
for _ in range(self.nsteps):
# Given observations, get action value and neglopacs
# We already have self.obs because Runner superclass run self.obs[:] = env.reset() on init
actions, values, self.states, neglogpacs = self.model.step(self.obs, S=self.states, M=self.dones)
mb_obs.append(self.obs.copy())
mb_actions.append(actions)
mb_values.append(values)
mb_neglogpacs.append(neglogpacs)
mb_dones.append(self.dones)
# Take actions in env and look the results
# Infos contains a ton of useful informations
self.obs[:], rewards, self.dones, infos = self.env.step(actions)
for info in infos:
maybeepinfo = info.get('episode')
if maybeepinfo: epinfos.append(maybeepinfo)
mb_rewards.append(rewards)
#batch of steps to batch of rollouts
mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
mb_actions = np.asarray(mb_actions)
mb_values = np.asarray(mb_values, dtype=np.float32)
mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
mb_dones = np.asarray(mb_dones, dtype=np.bool)
last_values = self.model.value(self.obs, S=self.states, M=self.dones)
# discount/bootstrap off value fn
mb_returns = np.zeros_like(mb_rewards)
mb_advs = np.zeros_like(mb_rewards)
lastgaelam = 0
for t in reversed(range(self.nsteps)):
if t == self.nsteps - 1:
nextnonterminal = 1.0 - self.dones
nextvalues = last_values
else:
nextnonterminal = 1.0 - mb_dones[t+1]
nextvalues = mb_values[t+1]
delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t]
mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam
mb_returns = mb_advs + mb_values
return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs)),
mb_states, epinfos)
# obs, returns, masks, actions, values, neglogpacs, states = runner.run()
def sf01(arr):
"""
swap and then flatten axes 0 and 1
"""
s = arr.shape
return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])
def constfn(val):
def f(_):
@@ -21,7 +230,7 @@ def constfn(val):
def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4,
vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95,
log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
save_interval=0, load_path=None, model_fn=None, **network_kwargs):
save_interval=0, load_path=None, **network_kwargs):
'''
Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)
@@ -99,14 +308,10 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2
nbatch_train = nbatch // nminibatches
# Instantiate the model object (that creates act_model and train_model)
if model_fn is None:
from baselines.ppo2.model import Model
model_fn = Model
model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
make_model = lambda : Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
max_grad_norm=max_grad_norm)
model = make_model()
if load_path is not None:
model.load(load_path)
# Instantiate the runner object
@@ -114,6 +319,8 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2
if eval_env is not None:
eval_runner = Runner(env = eval_env, model = model, nsteps = nsteps, gamma = gamma, lam= lam)
epinfobuf = deque(maxlen=100)
if eval_env is not None:
eval_epinfobuf = deque(maxlen=100)

View File

@@ -1,76 +0,0 @@
import numpy as np
from baselines.common.runners import AbstractEnvRunner
class Runner(AbstractEnvRunner):
"""
We use this object to make a mini batch of experiences
__init__:
- Initialize the runner
run():
- Make a mini batch
"""
def __init__(self, *, env, model, nsteps, gamma, lam):
super().__init__(env=env, model=model, nsteps=nsteps)
# Lambda used in GAE (General Advantage Estimation)
self.lam = lam
# Discount rate
self.gamma = gamma
def run(self):
# Here, we init the lists that will contain the mb of experiences
mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [],[],[],[],[],[]
mb_states = self.states
epinfos = []
# For n in range number of steps
for _ in range(self.nsteps):
# Given observations, get action value and neglopacs
# We already have self.obs because Runner superclass run self.obs[:] = env.reset() on init
actions, values, self.states, neglogpacs = self.model.step(self.obs, S=self.states, M=self.dones)
mb_obs.append(self.obs.copy())
mb_actions.append(actions)
mb_values.append(values)
mb_neglogpacs.append(neglogpacs)
mb_dones.append(self.dones)
# Take actions in env and look the results
# Infos contains a ton of useful informations
self.obs[:], rewards, self.dones, infos = self.env.step(actions)
for info in infos:
maybeepinfo = info.get('episode')
if maybeepinfo: epinfos.append(maybeepinfo)
mb_rewards.append(rewards)
#batch of steps to batch of rollouts
mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
mb_actions = np.asarray(mb_actions)
mb_values = np.asarray(mb_values, dtype=np.float32)
mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
mb_dones = np.asarray(mb_dones, dtype=np.bool)
last_values = self.model.value(self.obs, S=self.states, M=self.dones)
# discount/bootstrap off value fn
mb_returns = np.zeros_like(mb_rewards)
mb_advs = np.zeros_like(mb_rewards)
lastgaelam = 0
for t in reversed(range(self.nsteps)):
if t == self.nsteps - 1:
nextnonterminal = 1.0 - self.dones
nextvalues = last_values
else:
nextnonterminal = 1.0 - mb_dones[t+1]
nextvalues = mb_values[t+1]
delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t]
mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam
mb_returns = mb_advs + mb_values
return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs)),
mb_states, epinfos)
# obs, returns, masks, actions, values, neglogpacs, states = runner.run()
def sf01(arr):
"""
swap and then flatten axes 0 and 1
"""
s = arr.shape
return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])
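For reference, the backward loop in run() above implements generalized advantage estimation (GAE); writing d_{t+1} for the done flag (so nextnonterminal = 1 - d_{t+1}):

```latex
\delta_t = r_t + \gamma\, V(s_{t+1})\,(1 - d_{t+1}) - V(s_t), \qquad
\hat{A}_t = \delta_t + \gamma \lambda\, (1 - d_{t+1})\, \hat{A}_{t+1}, \qquad
R_t = \hat{A}_t + V(s_t)
```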

View File

@@ -1,34 +0,0 @@
import gym
import tensorflow as tf
import numpy as np
from functools import partial
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.tf_util import make_session
from baselines.ppo2.ppo2 import learn
from baselines.ppo2.microbatched_model import MicrobatchedModel
def test_microbatches():
def env_fn():
env = gym.make('CartPole-v0')
env.seed(0)
return env
learn_fn = partial(learn, network='mlp', nsteps=32, total_timesteps=32, seed=0)
env_ref = DummyVecEnv([env_fn])
sess_ref = make_session(make_default=True, graph=tf.Graph())
learn_fn(env=env_ref)
vars_ref = {v.name: sess_ref.run(v) for v in tf.trainable_variables()}
env_test = DummyVecEnv([env_fn])
sess_test = make_session(make_default=True, graph=tf.Graph())
learn_fn(env=env_test, model_fn=partial(MicrobatchedModel, microbatch_size=2))
vars_test = {v.name: sess_test.run(v) for v in tf.trainable_variables()}
for v in vars_ref:
np.testing.assert_allclose(vars_ref[v], vars_test[v], atol=1e-3)
if __name__ == '__main__':
test_microbatches()

View File

@@ -5,7 +5,7 @@ matplotlib.use('TkAgg') # Can change to 'Agg' for non-interactive mode
import matplotlib.pyplot as plt
plt.rcParams['svg.fonttype'] = 'none'
from baselines.common import plot_util
from baselines.bench.monitor import load_results
X_TIMESTEPS = 'timesteps'
X_EPISODES = 'episodes'
@@ -16,7 +16,7 @@ POSSIBLE_X_AXES = [X_TIMESTEPS, X_EPISODES, X_WALLTIME]
EPISODES_WINDOW = 100
COLORS = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink',
'brown', 'orange', 'teal', 'coral', 'lightblue', 'lime', 'lavender', 'turquoise',
'darkgreen', 'tan', 'salmon', 'gold', 'darkred', 'darkblue']
'darkgreen', 'tan', 'salmon', 'gold', 'lightpurple', 'darkred', 'darkblue']
def rolling_window(a, window):
shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
@@ -50,7 +50,7 @@ def plot_curves(xy_list, xaxis, yaxis, title):
maxx = max(xy[0][-1] for xy in xy_list)
minx = 0
for (i, (x, y)) in enumerate(xy_list):
color = COLORS[i % len(COLORS)]
color = COLORS[i]
plt.scatter(x, y, s=2)
x, y_mean = window_func(x, y, EPISODES_WINDOW, np.mean) #So returns average of last EPISODE_WINDOW episodes
plt.plot(x, y_mean, color=color)
@@ -62,18 +62,19 @@ def plot_curves(xy_list, xaxis, yaxis, title):
fig.canvas.mpl_connect('resize_event', lambda event: plt.tight_layout())
plt.grid(True)
def split_by_task(taskpath):
return taskpath['dirname'].split('/')[-1].split('-')[0]
def plot_results(dirs, num_timesteps=10e6, xaxis=X_TIMESTEPS, yaxis=Y_REWARD, title='', split_fn=split_by_task):
results = plot_util.load_results(dirs)
plot_util.plot_results(results, xy_fn=lambda r: ts2xy(r['monitor'], xaxis, yaxis), split_fn=split_fn, average_group=True, resample=int(1e6))
def plot_results(dirs, num_timesteps, xaxis, yaxis, task_name):
tslist = []
for dir in dirs:
ts = load_results(dir)
ts = ts[ts.l.cumsum() <= num_timesteps]
tslist.append(ts)
xy_list = [ts2xy(ts, xaxis, yaxis) for ts in tslist]
plot_curves(xy_list, xaxis, yaxis, task_name)
# Example usage in jupyter-notebook
# from baselines.results_plotter import plot_results
# from baselines import log_viewer
# %matplotlib inline
# plot_results("./log")
# log_viewer.plot_results(["./log"], 10e6, log_viewer.X_TIMESTEPS, "Breakout")
# Here ./log is a directory containing the monitor.csv files
def main():

View File

@@ -6,7 +6,6 @@ from collections import defaultdict
import tensorflow as tf
import numpy as np
from baselines.common.vec_env.vec_video_recorder import VecVideoRecorder
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
from baselines.common.cmd_util import common_arg_parser, parse_unknown_args, make_vec_env, make_env
from baselines.common.tf_util import get_session
@@ -63,8 +62,6 @@ def train(args, extra_args):
alg_kwargs.update(extra_args)
env = build_env(args)
if args.save_video_interval != 0:
env = VecVideoRecorder(env, osp.join(logger.Logger.CURRENT.dir, "videos"), record_video_trigger=lambda x: x % args.save_video_interval == 0, video_length=args.save_video_length)
if args.network:
alg_kwargs['network'] = args.network
@@ -181,11 +178,11 @@ def parse_cmdline_kwargs(args):
def main(args):
def main():
# configure logger, disable logging in child MPI processes (with rank > 0)
arg_parser = common_arg_parser()
args, unknown_args = arg_parser.parse_known_args(args)
args, unknown_args = arg_parser.parse_known_args()
extra_args = parse_cmdline_kwargs(unknown_args)
if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
@@ -220,7 +217,5 @@ def main(args):
env.close()
return model
if __name__ == '__main__':
main(sys.argv)
main()

File diff suppressed because one or more lines are too long

View File

@@ -11,7 +11,6 @@ extras = {
'test': [
'filelock',
'pytest',
'pytest-forked',
'atari-py'
],
'bullet': [

test.dockerfile.py36-mpi (new file, 16 lines)
View File

@@ -0,0 +1,16 @@
FROM python:3.6
RUN apt-get -y update && apt-get -y install ffmpeg libopenmpi-dev
ENV CODE_DIR /root/code
COPY . $CODE_DIR/baselines
WORKDIR $CODE_DIR/baselines
# Clean up pycache and pyc files
RUN rm -rf __pycache__ && \
find . -name "*.pyc" -delete && \
pip install tensorflow && \
pip install -e .[test,mpi]
CMD /bin/bash

View File

@@ -1,8 +1,6 @@
FROM python:3.6
RUN apt-get -y update && apt-get -y install ffmpeg
# RUN apt-get -y update && apt-get -y install git wget python-dev python3-dev libopenmpi-dev python-pip zlib1g-dev cmake python-opencv
ENV CODE_DIR /root/code
COPY . $CODE_DIR/baselines