Compare commits: peterz_viz...aray-extra (26 commits)
Author | SHA1 | Date
---|---|---
 | 4d0746b957 |
 | 5115707ce9 |
 | 6c44fb28fe |
 | 146bbf886b |
 | f3a5abaeeb |
 | 97e039127f |
 | 25ecb64821 |
 | 7dc6bc7c70 |
 | 7139a66d33 |
 | 8607dca99e |
 | 9f9835fe38 |
 | d3fed181b5 |
 | 339d5640b9 |
 | a75bc37a40 |
 | 87b3a04a38 |
 | c5b1a1b643 |
 | c59a10947d |
 | 5cd66010dc |
 | 0a13da8dfe |
 | 18b6390be6 |
 | 52255beda5 |
 | d80acbb4d1 |
 | 556b198454 |
 | cc88804042 |
 | c14d307834 |
 | 0b71d4c6c4 |
@@ -1,3 +1,5 @@
+**Status:** Active (under active development, breaking changes may occur)
+
 <img src="data/logo.jpg" width=25% align="right" /> [](https://travis-ci.org/openai/baselines)

 # Baselines

@@ -110,7 +112,7 @@ python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num_timesteps=0 --

 *NOTE:* At the moment Mujoco training uses VecNormalize wrapper for the environment which is not being saved correctly; so loading the models trained on Mujoco will not work well if the environment is recreated. If necessary, you can work around that by replacing RunningMeanStd by TfRunningMeanStd in [baselines/common/vec_env/vec_normalize.py](baselines/common/vec_env/vec_normalize.py#L12). This way, mean and std of environment normalizing wrapper will be saved in tensorflow variables and included in the model file; however, training is slower that way - hence not including it by default
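A minimal sketch of the workaround described in the note above. It assumes both classes live in `baselines.common.running_mean_std` and that `TfRunningMeanStd` mirrors the `RunningMeanStd(shape=...)` constructor with an extra `scope` keyword; the helper function itself is hypothetical and only illustrates the swap:

```python
# Hypothetical helper illustrating the workaround: back the normalization
# statistics with tf variables so they are saved together with the model.
from baselines.common.running_mean_std import RunningMeanStd, TfRunningMeanStd

def make_vec_normalize_stats(ob_shape, use_tf_stats=False):
    """Return (ob_rms, ret_rms) statistics trackers for VecNormalize."""
    if use_tf_stats:
        # tf-variable backed: included in the saved model file, but slower to update
        return (TfRunningMeanStd(shape=ob_shape, scope='ob_rms'),
                TfRunningMeanStd(shape=(), scope='ret_rms'))
    # default: plain numpy statistics, not captured when the model is saved
    return RunningMeanStd(shape=ob_shape), RunningMeanStd(shape=())
```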

 ## Loading and vizualizing learning curves and other training metrics

-See [here](docs/viz/viz.md) for instructions on how to load and display the training data.
+See [here](docs/viz/viz.ipynb) for instructions on how to load and display the training data.

 ## Subpackages
@@ -156,9 +156,10 @@ register_benchmark({

 # HER DDPG

+_fetch_tasks = ['FetchReach-v1', 'FetchPush-v1', 'FetchSlide-v1']
 register_benchmark({
-    'name': 'HerDdpg',
-    'description': 'Smoke-test only benchmark of HER',
-    'tasks': [{'trials': 1, 'env_id': 'FetchReach-v1'}]
+    'name': 'Fetch1M',
+    'description': 'Fetch* benchmarks for 1M timesteps',
+    'tasks': [{'trials': 6, 'env_id': env_id, 'num_timesteps': int(1e6)} for env_id in _fetch_tasks]
 })
@@ -72,8 +72,8 @@ class EpisodicLifeEnv(gym.Wrapper):
         # then update lives to handle bonus lives
         lives = self.env.unwrapped.ale.lives()
         if lives < self.lives and lives > 0:
-            # for Qbert sometimes we stay in lives == 0 condtion for a few frames
-            # so its important to keep lives > 0, so that we only reset once
+            # for Qbert sometimes we stay in lives == 0 condition for a few frames
+            # so it's important to keep lives > 0, so that we only reset once
             # the environment advertises done.
             done = True
         self.lives = lives

@@ -129,18 +129,26 @@ class ClipRewardEnv(gym.RewardWrapper):
         return np.sign(reward)


 class WarpFrame(gym.ObservationWrapper):
-    def __init__(self, env):
+    def __init__(self, env, width=84, height=84, grayscale=True):
         """Warp frames to 84x84 as done in the Nature paper and later work."""
         gym.ObservationWrapper.__init__(self, env)
-        self.width = 84
-        self.height = 84
-        self.observation_space = spaces.Box(low=0, high=255,
-            shape=(self.height, self.width, 1), dtype=np.uint8)
+        self.width = width
+        self.height = height
+        self.grayscale = grayscale
+        if self.grayscale:
+            self.observation_space = spaces.Box(low=0, high=255,
+                shape=(self.height, self.width, 1), dtype=np.uint8)
+        else:
+            self.observation_space = spaces.Box(low=0, high=255,
+                shape=(self.height, self.width, 3), dtype=np.uint8)

     def observation(self, frame):
-        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
+        if self.grayscale:
+            frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
         frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA)
-        return frame[:, :, None]
+        if self.grayscale:
+            frame = np.expand_dims(frame, -1)
+        return frame

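As a usage note for the widened `WarpFrame` constructor above, a short sketch (the environment id and sizes are only examples, and Atari ROMs must be installed):

```python
# Example use of the new WarpFrame arguments (sketch only).
import gym
from baselines.common.atari_wrappers import WarpFrame

# default behaviour: 84x84 grayscale, observation shape (84, 84, 1)
gray = WarpFrame(gym.make('PongNoFrameskip-v4'))
# new options: keep color at a different resolution, shape (96, 96, 3)
color = WarpFrame(gym.make('PongNoFrameskip-v4'), width=96, height=96, grayscale=False)
print(gray.observation_space.shape, color.observation_space.shape)
```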
 class FrameStack(gym.Wrapper):
     def __init__(self, env, k):

@@ -156,7 +164,7 @@ class FrameStack(gym.Wrapper):
         self.k = k
         self.frames = deque([], maxlen=k)
         shp = env.observation_space.shape
-        self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=env.observation_space.dtype)
+        self.observation_space = spaces.Box(low=0, high=255, shape=(shp[:-1] + (shp[-1] * k,)), dtype=env.observation_space.dtype)

     def reset(self):
         ob = self.env.reset()

@@ -197,7 +205,7 @@ class LazyFrames(object):

     def _force(self):
         if self._out is None:
-            self._out = np.concatenate(self._frames, axis=2)
+            self._out = np.concatenate(self._frames, axis=-1)
             self._frames = None
         return self._out

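The `shp[:-1] + (shp[-1] * k,)` expression above (together with concatenating on `axis=-1` in `LazyFrames`) generalizes the stacked observation shape beyond the hard-coded `(H, W, C * k)` Atari case; a tiny illustration of the shape arithmetic:

```python
# Shape arithmetic used by the updated FrameStack (illustration, not library code).
def stacked_shape(shp, k):
    # stack k frames along the last (channel) axis
    return shp[:-1] + (shp[-1] * k,)

assert stacked_shape((84, 84, 1), 4) == (84, 84, 4)    # grayscale Atari
assert stacked_shape((96, 96, 3), 4) == (96, 96, 12)   # color frames
assert stacked_shape((11,), 4) == (44,)                # flat observations
```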
@@ -18,11 +18,16 @@ from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
 from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
 from baselines.common import retro_wrappers

-def make_vec_env(env_id, env_type, num_env, seed, wrapper_kwargs=None, start_index=0, reward_scale=1.0, gamestate=None):
+def make_vec_env(env_id, env_type, num_env, seed,
+                 wrapper_kwargs=None,
+                 start_index=0,
+                 reward_scale=1.0,
+                 flatten_dict_observations=True,
+                 gamestate=None):
     """
     Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo.
     """
-    if wrapper_kwargs is None: wrapper_kwargs = {}
+    wrapper_kwargs = wrapper_kwargs or {}
     mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
     seed = seed + 10000 * mpi_rank if seed is not None else None
     def make_thunk(rank):

@@ -33,6 +38,7 @@ def make_vec_env(env_id, env_type, num_env, seed, wrapper_kwargs=None, start_ind
             seed=seed,
             reward_scale=reward_scale,
             gamestate=gamestate,
+            flatten_dict_observations=flatten_dict_observations,
             wrapper_kwargs=wrapper_kwargs
         )

@@ -43,8 +49,9 @@ def make_vec_env(env_id, env_type, num_env, seed, wrapper_kwargs=None, start_ind
         return DummyVecEnv([make_thunk(start_index)])


-def make_env(env_id, env_type, subrank=0, seed=None, reward_scale=1.0, gamestate=None, wrapper_kwargs={}):
+def make_env(env_id, env_type, subrank=0, seed=None, reward_scale=1.0, gamestate=None, flatten_dict_observations=True, wrapper_kwargs=None):
     mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
+    wrapper_kwargs = wrapper_kwargs or {}
     if env_type == 'atari':
         env = make_atari(env_id)
     elif env_type == 'retro':

@@ -54,6 +61,10 @@ def make_env(env_id, env_type, subrank=0, seed=None, reward_scale=1.0, gamestate
     else:
         env = gym.make(env_id)

+    if flatten_dict_observations and isinstance(env.observation_space, gym.spaces.Dict):
+        keys = env.observation_space.spaces.keys()
+        env = gym.wrappers.FlattenDictWrapper(env, dict_keys=list(keys))
+
     env.seed(seed + subrank if seed is not None else None)
     env = Monitor(env,
                   logger.get_dir() and os.path.join(logger.get_dir(), str(mpi_rank) + '.' + str(subrank)),

@@ -134,6 +145,7 @@ def common_arg_parser():
     parser.add_argument('--save_video_interval', help='Save video every x steps (0 = disabled)', default=0, type=int)
     parser.add_argument('--save_video_length', help='Length of recorded video. Default: 200', default=200, type=int)
     parser.add_argument('--play', default=False, action='store_true')
+    parser.add_argument('--extra_import', help='Extra module to import to access external environments', type=str, default=None)
     return parser

 def robotics_arg_parser():
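A hedged usage sketch of the new `flatten_dict_observations` flag, assuming `make_vec_env` is exposed from `baselines.common.cmd_util`; the environment id, the env_type string and all values are only examples (goal-based envs such as the Fetch tasks expose `gym.spaces.Dict` observations, which HER needs unflattened):

```python
# Sketch: build a vectorized env while keeping dict observations intact.
# Requires mujoco_py for the Fetch environments; values are illustrative.
from baselines.common.cmd_util import make_vec_env

venv = make_vec_env('FetchReach-v1', 'robotics', num_env=2, seed=0,
                    flatten_dict_observations=False)  # keep Dict obs for HER
print(venv.observation_space)  # Dict with observation/achieved_goal/desired_goal
```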
@@ -62,7 +62,7 @@ class CategoricalPdType(PdType):
     def pdclass(self):
         return CategoricalPd
     def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
-        pdparam = fc(latent_vector, 'pi', self.ncat, init_scale=init_scale, init_bias=init_bias)
+        pdparam = _matching_fc(latent_vector, 'pi', self.ncat, init_scale=init_scale, init_bias=init_bias)
         return self.pdfromflat(pdparam), pdparam

     def param_shape(self):

@@ -82,7 +82,7 @@ class MultiCategoricalPdType(PdType):
         return MultiCategoricalPd(self.ncats, flat)

     def pdfromlatent(self, latent, init_scale=1.0, init_bias=0.0):
-        pdparam = fc(latent, 'pi', self.ncats.sum(), init_scale=init_scale, init_bias=init_bias)
+        pdparam = _matching_fc(latent, 'pi', self.ncats.sum(), init_scale=init_scale, init_bias=init_bias)
         return self.pdfromflat(pdparam), pdparam

     def param_shape(self):

@@ -99,7 +99,7 @@ class DiagGaussianPdType(PdType):
         return DiagGaussianPd

     def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
-        mean = fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias)
+        mean = _matching_fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias)
         logstd = tf.get_variable(name='pi/logstd', shape=[1, self.size], initializer=tf.zeros_initializer())
         pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
         return self.pdfromflat(pdparam), mean

@@ -123,7 +123,7 @@ class BernoulliPdType(PdType):
     def sample_dtype(self):
         return tf.int32
     def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
-        pdparam = fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias)
+        pdparam = _matching_fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias)
         return self.pdfromflat(pdparam), pdparam

 # WRONG SECOND DERIVATIVES

@@ -345,3 +345,9 @@ def validate_probtype(probtype, pdparam):
     assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr # within 3 sigmas
     print('ok on', probtype, pdparam)

+
+def _matching_fc(tensor, name, size, init_scale, init_bias):
+    if tensor.shape[-1] == size:
+        return tensor
+    else:
+        return fc(tensor, name, size, init_scale=init_scale, init_bias=init_bias)
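The new `_matching_fc` helper above skips the extra projection whenever the latent width already equals the requested parameter size. A standalone illustration of that logic (not the library helper itself; shapes and the dense-layer call are examples):

```python
# Illustration of the _matching_fc behaviour (TF1-style API, shapes are examples).
import tensorflow as tf

def matching_dense(tensor, size, name):
    if tensor.shape[-1] == size:
        return tensor                       # latent already has the right width
    return tf.layers.dense(tensor, size, name=name)  # otherwise add a linear layer

x64 = tf.placeholder(tf.float32, [None, 64])
x6 = tf.placeholder(tf.float32, [None, 6])
assert matching_dense(x6, 6, 'pi') is x6    # pass-through, no new variables
print(matching_dense(x64, 6, 'pi').shape)   # (?, 6) via a new dense layer
```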
@@ -168,6 +168,7 @@ def load_results(root_dir_or_dirs, enable_progress=True, enable_monitor=True, ve
         - monitor - if enable_monitor is True, this field contains pandas dataframe with loaded monitor.csv file (or aggregate of all *.monitor.csv files in the directory)
         - progress - if enable_progress is True, this field contains pandas dataframe with loaded progress.csv file
     '''
+    import re
     if isinstance(root_dir_or_dirs, str):
         rootdirs = [osp.expanduser(root_dir_or_dirs)]
     else:

@@ -179,7 +180,9 @@ def load_results(root_dir_or_dirs, enable_progress=True, enable_monitor=True, ve
             if '-proc' in dirname:
                 files[:] = []
                 continue
-            if set(['metadata.json', 'monitor.json', 'monitor.csv', 'progress.json', 'progress.csv']).intersection(files):
+            monitor_re = re.compile(r'(\d+\.)?(\d+\.)?monitor\.csv')
+            if set(['metadata.json', 'monitor.json', 'progress.json', 'progress.csv']).intersection(files) or \
+                any([f for f in files if monitor_re.match(f)]): # also match monitor files like 0.1.monitor.csv
                 # used to be uncommented, which means do not go deeper than current directory if any of the data files
                 # are found
                 # dirs[:] = []

@@ -332,7 +335,7 @@ def plot_results(
         xys = gresults[group]
         if not any(xys):
             continue
-        color = COLORS[groups.index(group)]
+        color = COLORS[groups.index(group) % len(COLORS)]
         origxs = [xy[0] for xy in xys]
         minxlen = min(map(len, origxs))
         def allequal(qs):
@@ -132,10 +132,8 @@ class MovieRecord(gym.Wrapper):
         self.epcount = 0
     def reset(self):
         if self.epcount % self.k == 0:
-            print('saving movie this episode', self.savedir)
             self.env.unwrapped.movie_path = self.savedir
         else:
-            print('not saving this episode')
             self.env.unwrapped.movie_path = None
             self.env.unwrapped.movie = None
         self.epcount += 1
baselines/common/tests/test_fetchreach.py (new file, 39 lines)
@@ -0,0 +1,39 @@
import pytest
import gym

from baselines.run import get_learn_function
from baselines.common.tests.util import reward_per_episode_test

pytest.importorskip('mujoco_py')

common_kwargs = dict(
    network='mlp',
    seed=0,
)

learn_kwargs = {
    'her': dict(total_timesteps=2000)
}

@pytest.mark.slow
@pytest.mark.parametrize("alg", learn_kwargs.keys())
def test_fetchreach(alg):
    '''
    Test if the algorithm (with an mlp policy)
    can learn the FetchReach task
    '''

    kwargs = common_kwargs.copy()
    kwargs.update(learn_kwargs[alg])

    learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs)
    def env_fn():

        env = gym.make('FetchReach-v1')
        env.seed(0)
        return env

    reward_per_episode_test(env_fn, learn_fn, -15)

if __name__ == '__main__':
    test_fetchreach('her')
@@ -103,9 +103,9 @@ def test_coexistence(learn_fn, network_fn):
     kwargs.update(learn_kwargs[learn_fn])

     learn = partial(learn, env=env, network=network_fn, total_timesteps=0, **kwargs)
-    make_session(make_default=True, graph=tf.Graph());
+    make_session(make_default=True, graph=tf.Graph())
     model1 = learn(seed=1)
-    make_session(make_default=True, graph=tf.Graph());
+    make_session(make_default=True, graph=tf.Graph())
     model2 = learn(seed=2)

     model1.step(env.observation_space.sample())
@@ -63,7 +63,7 @@ def rollout(env, model, n_trials):

     for i in range(n_trials):
         obs = env.reset()
-        state = model.initial_state
+        state = model.initial_state if hasattr(model, 'initial_state') else None
         episode_rew = []
         episode_actions = []
         episode_obs = []
@@ -165,6 +165,10 @@ def function(inputs, outputs, updates=None, givens=None):
     outputs: [tf.Variable] or tf.Variable
         list of outputs or a single output to be returned from function. Returned
         value will also have the same shape.
+    updates: [tf.Operation] or tf.Operation
+        list of update functions or single update function that will be run whenever
+        the function is called. The return is ignored.
+
     """
     if isinstance(outputs, list):
         return _Function(inputs, outputs, updates, givens=givens)

@@ -333,7 +337,7 @@ def save_state(fname, sess=None):

 def save_variables(save_path, variables=None, sess=None):
     sess = sess or get_session()
-    variables = variables or tf.trainable_variables()
+    variables = variables or tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)

     ps = sess.run(variables)
     save_dict = {v.name: value for v, value in zip(variables, ps)}

@@ -344,7 +348,7 @@ def save_variables(save_path, variables=None, sess=None):

 def load_variables(load_path, variables=None, sess=None):
     sess = sess or get_session()
-    variables = variables or tf.trainable_variables()
+    variables = variables or tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)

     loaded_params = joblib.load(os.path.expanduser(load_path))
     restores = []
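The switch above from `tf.trainable_variables()` to the GLOBAL_VARIABLES collection makes `save_variables`/`load_variables` also cover non-trainable state such as running mean/std statistics. A small TF1-style illustration (variable names are made up):

```python
# Illustration of why the default changed: non-trainable variables are only in
# the GLOBAL_VARIABLES collection, so the old default silently skipped them.
import tensorflow as tf

w = tf.get_variable('w', shape=[4, 2])                              # trainable weight
obs_mean = tf.get_variable('obs_mean', shape=[4], trainable=False)  # running statistic

trainable = tf.trainable_variables()                                # contains w only
everything = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)       # contains w and obs_mean
assert obs_mean not in trainable and obs_mean in everything
```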
@@ -27,6 +27,7 @@ class DummyVecEnv(VecEnv):
         self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32)
         self.buf_infos = [{} for _ in range(self.num_envs)]
         self.actions = None
+        self.specs = [e.spec for e in self.envs]

     def step_async(self, actions):
         listify = True
@@ -54,6 +54,7 @@ class ShmemVecEnv(VecEnv):
             proc.start()
             child_pipe.close()
         self.waiting_step = False
+        self.specs = [f().spec for f in env_fns]
         self.viewer = None

     def reset(self):
@@ -57,6 +57,7 @@ class SubprocVecEnv(VecEnv):
         self.remotes[0].send(('get_spaces', None))
         observation_space, action_space = self.remotes[0].recv()
         self.viewer = None
+        self.specs = [f().spec for f in env_fns]
         VecEnv.__init__(self, len(env_fns), observation_space, action_space)

     def step_async(self, actions):

@@ -70,13 +71,13 @@ class SubprocVecEnv(VecEnv):
         results = [remote.recv() for remote in self.remotes]
         self.waiting = False
         obs, rews, dones, infos = zip(*results)
-        return np.stack(obs), np.stack(rews), np.stack(dones), infos
+        return _flatten_obs(obs), np.stack(rews), np.stack(dones), infos

     def reset(self):
         self._assert_not_closed()
         for remote in self.remotes:
             remote.send(('reset', None))
-        return np.stack([remote.recv() for remote in self.remotes])
+        return _flatten_obs([remote.recv() for remote in self.remotes])

     def close_extras(self):
         self.closed = True

@@ -97,3 +98,17 @@ class SubprocVecEnv(VecEnv):

     def _assert_not_closed(self):
         assert not self.closed, "Trying to operate on a SubprocVecEnv after calling close()"
+
+
+def _flatten_obs(obs):
+    assert isinstance(obs, list) or isinstance(obs, tuple)
+    assert len(obs) > 0
+
+    if isinstance(obs[0], dict):
+        import collections
+        assert isinstance(obs, collections.OrderedDict)
+        keys = obs[0].keys()
+        return {k: np.stack([o[k] for o in obs]) for k in keys}
+    else:
+        return np.stack(obs)
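An illustration of what the new `_flatten_obs` helper produces for dict (goal-based) observations gathered from the worker processes; the arrays below are made up:

```python
# Batching dict observations from two workers into arrays keyed by field name.
import numpy as np

per_env_obs = [  # one observation dict per worker
    {'observation': np.zeros(10), 'desired_goal': np.zeros(3)},
    {'observation': np.ones(10), 'desired_goal': np.ones(3)},
]
batched = {k: np.stack([o[k] for o in per_env_obs]) for k in per_env_obs[0].keys()}
assert batched['observation'].shape == (2, 10)
assert batched['desired_goal'].shape == (2, 3)
```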
@@ -67,7 +67,6 @@ class DDPG(object):
     def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None,
         gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True,
         batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf),
-        adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1,
         critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.):
         # Inputs.
         self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0')

@@ -186,7 +185,7 @@ class DDPG(object):
         normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1])
         self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
         if self.critic_l2_reg > 0.:
-            critic_reg_vars = [var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name]
+            critic_reg_vars = [var for var in self.critic.trainable_vars if var.name.endswith('/w:0') and 'output' not in var.name]
             for var in critic_reg_vars:
                 logger.info(' regularizing: {}'.format(var.name))
             logger.info(' applying l2 regularization with {}'.format(self.critic_l2_reg))

@@ -42,7 +42,7 @@ class Critic(Model):
         with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
             x = tf.concat([obs, action], axis=-1) # this assumes observation and action can be concatenated
             x = self.network_builder(x)
-            x = tf.layers.dense(x, 1, kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3))
+            x = tf.layers.dense(x, 1, kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3), name='output')
             return x

     @property
baselines/ddpg/test_smoke.py (new file, 17 lines)
@@ -0,0 +1,17 @@
from baselines.run import main as M

def _run(argstr):
    M(('--alg=ddpg --env=Pendulum-v0 --num_timesteps=0 ' + argstr).split(' '))

def test_popart():
    _run('--normalize_returns=True --popart=True')

def test_noise_normal():
    _run('--noise_type=normal_0.1')

def test_noise_ou():
    _run('--noise_type=ou_0.1')

def test_noise_adaptive():
    _run('--noise_type=adaptive-param_0.2,normal_0.1')
@@ -5,4 +5,4 @@ from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer

 def wrap_atari_dqn(env):
     from baselines.common.atari_wrappers import wrap_deepmind
-    return wrap_deepmind(env, frame_stack=True, scale=True)
+    return wrap_deepmind(env, frame_stack=True, scale=False)
@@ -33,7 +33,7 @@ The functions in this file can are used to create the following functions:
     stochastic: bool
         if set to False all the actions are always deterministic (default False)
     update_eps_ph: float
-        update epsilon a new value, if negative not update happens
+        update epsilon to a new value, if negative no update happens
         (default: no update)
     reset_ph: bool
         reset the perturbed policy by sampling a new perturbation
@@ -2,9 +2,9 @@ import tensorflow as tf
 import tensorflow.contrib.layers as layers


-def _mlp(hiddens, inpt, num_actions, scope, reuse=False, layer_norm=False):
+def _mlp(hiddens, input_, num_actions, scope, reuse=False, layer_norm=False):
     with tf.variable_scope(scope, reuse=reuse):
-        out = inpt
+        out = input_
         for hidden in hiddens:
             out = layers.fully_connected(out, num_outputs=hidden, activation_fn=None)
             if layer_norm:

@@ -21,6 +21,9 @@ def mlp(hiddens=[], layer_norm=False):
     ----------
     hiddens: [int]
         list of sizes of hidden layers
+    layer_norm: bool
+        if true applies layer normalization for every layer
+        as described in https://arxiv.org/abs/1607.06450

     Returns
     -------

@@ -30,9 +33,9 @@ def mlp(hiddens=[], layer_norm=False):
     return lambda *args, **kwargs: _mlp(hiddens, layer_norm=layer_norm, *args, **kwargs)


-def _cnn_to_mlp(convs, hiddens, dueling, inpt, num_actions, scope, reuse=False, layer_norm=False):
+def _cnn_to_mlp(convs, hiddens, dueling, input_, num_actions, scope, reuse=False, layer_norm=False):
     with tf.variable_scope(scope, reuse=reuse):
-        out = inpt
+        out = input_
         with tf.variable_scope("convnet"):
             for num_outputs, kernel_size, stride in convs:
                 out = layers.convolution2d(out,

@@ -72,7 +75,7 @@ def cnn_to_mlp(convs, hiddens, dueling=False, layer_norm=False):

     Parameters
     ----------
-    convs: [(int, int int)]
+    convs: [(int, int, int)]
         list of convolutional layers in form of
         (num_outputs, kernel_size, stride)
     hiddens: [int]

@@ -80,6 +83,9 @@ def cnn_to_mlp(convs, hiddens, dueling=False, layer_norm=False):
     dueling: bool
         if true double the output MLP to compute a baseline
         for action scores
+    layer_norm: bool
+        if true applies layer normalization for every layer
+        as described in https://arxiv.org/abs/1607.06450

     Returns
     -------
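A brief usage sketch for the `cnn_to_mlp` parameters documented above; the layer sizes are the common Nature-DQN values and are only an example:

```python
# Building a deepq model with the documented parameters
# (each conv entry is a (num_outputs, kernel_size, stride) triple).
from baselines.deepq.models import cnn_to_mlp

q_func = cnn_to_mlp(
    convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],  # three convolutional layers
    hiddens=[256],                               # one fully connected layer
    dueling=True,                                # add the dueling baseline head
)
```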
@@ -6,26 +6,29 @@ For details on Hindsight Experience Replay (HER), please read the [paper](https:
 ### Getting started
 Training an agent is very simple:
 ```bash
-python -m baselines.her.experiment.train
+python -m baselines.run --alg=her --env=FetchReach-v1 --num_timesteps=5000
 ```
 This will train a DDPG+HER agent on the `FetchReach` environment.
 You should see the success rate go up quickly to `1.0`, which means that the agent achieves the
-desired goal in 100% of the cases.
-The training script logs other diagnostics as well and pickles the best policy so far (w.r.t. to its test success rate),
-the latest policy, and, if enabled, a history of policies every K epochs.
-
-To inspect what the agent has learned, use the play script:
+desired goal in 100% of the cases (note how HER can solve it in <5k steps - try doing that with PPO by replacing her with ppo2 :))
+The training script logs other diagnostics as well. Policy at the end of the training can be saved using `--save_path` flag, for instance:
 ```bash
-python -m baselines.her.experiment.play /path/to/an/experiment/policy_best.pkl
+python -m baselines.run --alg=her --env=FetchReach-v1 --num_timesteps=5000 --save_path=~/policies/her/fetchreach5k
 ```
-You can try it right now with the results of the training step (the script prints out the path for you).
-This should visualize the current policy for 10 episodes and will also print statistics.
+
+To inspect what the agent has learned, use the `--play` flag:
+```bash
+python -m baselines.run --alg=her --env=FetchReach-v1 --num_timesteps=5000 --play
+```
+(note `--play` can be combined with `--load_path`, which lets one load trained policies, for more results see [README.md](../../README.md))
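The commands above can also be invoked programmatically through `baselines.run.main`, the same entry point the new `baselines/ddpg/test_smoke.py` uses; a minimal sketch reusing only the README's own argument values:

```python
# Programmatic equivalent of the getting-started command above.
from baselines.run import main

main('--alg=her --env=FetchReach-v1 --num_timesteps=5000 --play'.split(' '))
```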

 ### Reproducing results
-In order to reproduce the results from [Plappert et al. (2018)](https://arxiv.org/abs/1802.09464), run the following command:
+In [Plappert et al. (2018)](https://arxiv.org/abs/1802.09464), 38 trajectories were generated in parallel
+(19 MPI processes, each generating and computing gradients from 2 trajectories and aggregating).
+To reproduce that behaviour, use
 ```bash
-python -m baselines.her.experiment.train --num_cpu 19
+mpirun -np 19 python -m baselines.run --num_env=2 --alg=her ...
 ```
 This will require a machine with sufficient amount of physical CPU cores. In our experiments,
 we used [Azure's D15v2 instances](https://docs.microsoft.com/en-us/azure/virtual-machines/linux/sizes),

@@ -45,6 +48,13 @@ python experiment/data_generation/fetch_data_generation.py
 ```
 This outputs ```data_fetch_random_100.npz``` file which is our data file.

+To launch training with demonstrations (more technically, with behaviour cloning loss as an auxiliary loss), run the following
+```bash
+python -m baselines.run --alg=her --env=FetchPickAndPlace-v1 --num_timesteps=2.5e6 --demo_file=/Path/to/demo_file.npz
+```
+This will train a DDPG+HER agent on the `FetchPickAndPlace` environment by using previously generated demonstration data.
+To inspect what the agent has learned, use the `--play` flag as described above.
+
 #### Configuration
 The provided configuration is for training an agent with HER without demonstrations, we need to change a few paramters for the HER algorithm to learn through demonstrations, to do that, set:

@@ -62,13 +72,7 @@ Apart from these changes the reported results also have the following configurat
 * random_eps: 0.1 - percentage of time a random action is taken
 * noise_eps: 0.1 - std of gaussian noise added to not-completely-random actions

-Now training an agent with pre-recorded demonstrations:
-```bash
-python -m baselines.her.experiment.train --env=FetchPickAndPlace-v0 --n_epochs=1000 --demo_file=/Path/to/demo_file.npz --num_cpu=1
-```
-
-This will train a DDPG+HER agent on the `FetchPickAndPlace` environment by using previously generated demonstration data.
-To inspect what the agent has learned, use the play script as described above.
+These parameters can be changed either in [experiment/config.py](experiment/config.py) or passed to the command line as `--param=value`)

 ### Results
 Training with demonstrations helps overcome the exploration problem and achieves a faster and better convergence. The following graphs contrast the difference between training with and without demonstration data, We report the mean Q values vs Epoch and the Success Rate vs Epoch:

@@ -78,3 +82,4 @@ Training with demonstrations helps overcome the exploration problem and achieves
 <center><img src="../../data/fetchPickAndPlaceContrast.png"></center>
 <div class="thecap" align="middle"><b>Training results for Fetch Pick and Place task constrasting between training with and without demonstration data.</b></div>
 </div>
@@ -10,13 +10,14 @@ from baselines.her.util import (
|
||||
from baselines.her.normalizer import Normalizer
|
||||
from baselines.her.replay_buffer import ReplayBuffer
|
||||
from baselines.common.mpi_adam import MpiAdam
|
||||
from baselines.common import tf_util
|
||||
|
||||
|
||||
def dims_to_shapes(input_dims):
|
||||
return {key: tuple([val]) if val > 0 else tuple() for key, val in input_dims.items()}
|
||||
|
||||
|
||||
global demoBuffer #buffer for demonstrations
|
||||
global DEMO_BUFFER #buffer for demonstrations
|
||||
|
||||
class DDPG(object):
|
||||
@store_args
|
||||
@@ -94,16 +95,16 @@ class DDPG(object):
|
||||
self._create_network(reuse=reuse)
|
||||
|
||||
# Configure the replay buffer.
|
||||
buffer_shapes = {key: (self.T if key != 'o' else self.T+1, *input_shapes[key])
|
||||
buffer_shapes = {key: (self.T-1 if key != 'o' else self.T, *input_shapes[key])
|
||||
for key, val in input_shapes.items()}
|
||||
buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
|
||||
buffer_shapes['ag'] = (self.T+1, self.dimg)
|
||||
buffer_shapes['ag'] = (self.T, self.dimg)
|
||||
|
||||
buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size
|
||||
self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions)
|
||||
|
||||
global demoBuffer
|
||||
demoBuffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions) #initialize the demo buffer; in the same way as the primary data buffer
|
||||
global DEMO_BUFFER
|
||||
DEMO_BUFFER = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions) #initialize the demo buffer; in the same way as the primary data buffer
|
||||
|
||||
def _random_action(self, n):
|
||||
return np.random.uniform(low=-self.max_u, high=self.max_u, size=(n, self.dimu))
|
||||
@@ -119,6 +120,11 @@ class DDPG(object):
|
||||
g = np.clip(g, -self.clip_obs, self.clip_obs)
|
||||
return o, g
|
||||
|
||||
def step(self, obs):
|
||||
actions = self.get_actions(obs['observation'], obs['achieved_goal'], obs['desired_goal'])
|
||||
return actions, None, None, None
|
||||
|
||||
|
||||
def get_actions(self, o, ag, g, noise_eps=0., random_eps=0., use_target_net=False,
|
||||
compute_Q=False):
|
||||
o, g = self._preprocess_og(o, ag, g)
|
||||
@@ -151,25 +157,30 @@ class DDPG(object):
|
||||
else:
|
||||
return ret
|
||||
|
||||
def initDemoBuffer(self, demoDataFile, update_stats=True): #function that initializes the demo buffer
|
||||
def init_demo_buffer(self, demoDataFile, update_stats=True): #function that initializes the demo buffer
|
||||
|
||||
demoData = np.load(demoDataFile) #load the demonstration data from data file
|
||||
info_keys = [key.replace('info_', '') for key in self.input_dims.keys() if key.startswith('info_')]
|
||||
info_values = [np.empty((self.T, 1, self.input_dims['info_' + key]), np.float32) for key in info_keys]
|
||||
info_values = [np.empty((self.T - 1, 1, self.input_dims['info_' + key]), np.float32) for key in info_keys]
|
||||
|
||||
demo_data_obs = demoData['obs']
|
||||
demo_data_acs = demoData['acs']
|
||||
demo_data_info = demoData['info']
|
||||
|
||||
for epsd in range(self.num_demo): # we initialize the whole demo buffer at the start of the training
|
||||
obs, acts, goals, achieved_goals = [], [] ,[] ,[]
|
||||
i = 0
|
||||
for transition in range(self.T):
|
||||
obs.append([demoData['obs'][epsd ][transition].get('observation')])
|
||||
acts.append([demoData['acs'][epsd][transition]])
|
||||
goals.append([demoData['obs'][epsd][transition].get('desired_goal')])
|
||||
achieved_goals.append([demoData['obs'][epsd][transition].get('achieved_goal')])
|
||||
for transition in range(self.T - 1):
|
||||
obs.append([demo_data_obs[epsd][transition].get('observation')])
|
||||
acts.append([demo_data_acs[epsd][transition]])
|
||||
goals.append([demo_data_obs[epsd][transition].get('desired_goal')])
|
||||
achieved_goals.append([demo_data_obs[epsd][transition].get('achieved_goal')])
|
||||
for idx, key in enumerate(info_keys):
|
||||
info_values[idx][transition, i] = demoData['info'][epsd][transition][key]
|
||||
info_values[idx][transition, i] = demo_data_info[epsd][transition][key]
|
||||
|
||||
obs.append([demoData['obs'][epsd][self.T].get('observation')])
|
||||
achieved_goals.append([demoData['obs'][epsd][self.T].get('achieved_goal')])
|
||||
|
||||
obs.append([demo_data_obs[epsd][self.T - 1].get('observation')])
|
||||
achieved_goals.append([demo_data_obs[epsd][self.T - 1].get('achieved_goal')])
|
||||
|
||||
episode = dict(o=obs,
|
||||
u=acts,
|
||||
@@ -179,10 +190,9 @@ class DDPG(object):
|
||||
episode['info_{}'.format(key)] = value
|
||||
|
||||
episode = convert_episode_to_batch_major(episode)
|
||||
global demoBuffer
|
||||
demoBuffer.store_episode(episode) # create the observation dict and append them into the demonstration buffer
|
||||
|
||||
print("Demo buffer size currently ", demoBuffer.get_current_size()) #print out the demonstration buffer size
|
||||
global DEMO_BUFFER
|
||||
DEMO_BUFFER.store_episode(episode) # create the observation dict and append them into the demonstration buffer
|
||||
logger.debug("Demo buffer size currently ", DEMO_BUFFER.get_current_size()) #print out the demonstration buffer size
|
||||
|
||||
if update_stats:
|
||||
# add transitions to normalizer to normalize the demo data as well
|
||||
@@ -191,7 +201,7 @@ class DDPG(object):
|
||||
num_normalizing_transitions = transitions_in_episode_batch(episode)
|
||||
transitions = self.sample_transitions(episode, num_normalizing_transitions)
|
||||
|
||||
o, o_2, g, ag = transitions['o'], transitions['o_2'], transitions['g'], transitions['ag']
|
||||
o, g, ag = transitions['o'], transitions['g'], transitions['ag']
|
||||
transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g)
|
||||
# No need to preprocess the o_2 and g_2 since this is only used for stats
|
||||
|
||||
@@ -202,6 +212,8 @@ class DDPG(object):
|
||||
self.g_stats.recompute_stats()
|
||||
episode.clear()
|
||||
|
||||
logger.info("Demo buffer size: ", DEMO_BUFFER.get_current_size()) #print out the demonstration buffer size
|
||||
|
||||
def store_episode(self, episode_batch, update_stats=True):
|
||||
"""
|
||||
episode_batch: array of batch_size x (T or T+1) x dim_key
|
||||
@@ -217,7 +229,7 @@ class DDPG(object):
|
||||
num_normalizing_transitions = transitions_in_episode_batch(episode_batch)
|
||||
transitions = self.sample_transitions(episode_batch, num_normalizing_transitions)
|
||||
|
||||
o, o_2, g, ag = transitions['o'], transitions['o_2'], transitions['g'], transitions['ag']
|
||||
o, g, ag = transitions['o'], transitions['g'], transitions['ag']
|
||||
transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g)
|
||||
# No need to preprocess the o_2 and g_2 since this is only used for stats
|
||||
|
||||
@@ -251,9 +263,9 @@ class DDPG(object):
|
||||
def sample_batch(self):
|
||||
if self.bc_loss: #use demonstration buffer to sample as well if bc_loss flag is set TRUE
|
||||
transitions = self.buffer.sample(self.batch_size - self.demo_batch_size)
|
||||
global demoBuffer
|
||||
transitionsDemo = demoBuffer.sample(self.demo_batch_size) #sample from the demo buffer
|
||||
for k, values in transitionsDemo.items():
|
||||
global DEMO_BUFFER
|
||||
transitions_demo = DEMO_BUFFER.sample(self.demo_batch_size) #sample from the demo buffer
|
||||
for k, values in transitions_demo.items():
|
||||
rolloutV = transitions[k].tolist()
|
||||
for v in values:
|
||||
rolloutV.append(v.tolist())
|
||||
@@ -302,10 +314,7 @@ class DDPG(object):
|
||||
|
||||
def _create_network(self, reuse=False):
|
||||
logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u))
|
||||
|
||||
self.sess = tf.get_default_session()
|
||||
if self.sess is None:
|
||||
self.sess = tf.InteractiveSession()
|
||||
self.sess = tf_util.get_session()
|
||||
|
||||
# running averages
|
||||
with tf.variable_scope('o_stats') as vs:
|
||||
@@ -367,8 +376,6 @@ class DDPG(object):
|
||||
self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf)
|
||||
self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u))
|
||||
|
||||
self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf)
|
||||
self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u))
|
||||
Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q'))
|
||||
pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi'))
|
||||
assert len(self._vars('main/Q')) == len(Q_grads_tf)
|
||||
@@ -435,3 +442,7 @@ class DDPG(object):
|
||||
assert(len(vars) == len(state["tf"]))
|
||||
node = [tf.assign(var, val) for var, val in zip(vars, state["tf"])]
|
||||
self.sess.run(node)
|
||||
|
||||
def save(self, save_path):
|
||||
tf_util.save_variables(save_path)
|
||||
|
||||
|
@@ -1,10 +1,11 @@
|
||||
import os
|
||||
import numpy as np
|
||||
import gym
|
||||
|
||||
from baselines import logger
|
||||
from baselines.her.ddpg import DDPG
|
||||
from baselines.her.her import make_sample_her_transitions
|
||||
|
||||
from baselines.her.her_sampler import make_sample_her_transitions
|
||||
from baselines.bench.monitor import Monitor
|
||||
|
||||
DEFAULT_ENV_PARAMS = {
|
||||
'FetchReach-v1': {
|
||||
@@ -72,16 +73,32 @@ def cached_make_env(make_env):
|
||||
def prepare_params(kwargs):
|
||||
# DDPG params
|
||||
ddpg_params = dict()
|
||||
|
||||
env_name = kwargs['env_name']
|
||||
|
||||
def make_env():
|
||||
return gym.make(env_name)
|
||||
def make_env(subrank=None):
|
||||
env = gym.make(env_name)
|
||||
if subrank is not None and logger.get_dir() is not None:
|
||||
try:
|
||||
from mpi4py import MPI
|
||||
mpi_rank = MPI.COMM_WORLD.Get_rank()
|
||||
except ImportError:
|
||||
MPI = None
|
||||
mpi_rank = 0
|
||||
logger.warn('Running with a single MPI process. This should work, but the results may differ from the ones published in Plappert et al.')
|
||||
|
||||
max_episode_steps = env._max_episode_steps
|
||||
env = Monitor(env,
|
||||
os.path.join(logger.get_dir(), str(mpi_rank) + '.' + str(subrank)),
|
||||
allow_early_resets=True)
|
||||
# hack to re-expose _max_episode_steps (ideally should replace reliance on it downstream)
|
||||
env = gym.wrappers.TimeLimit(env, max_episode_steps=max_episode_steps)
|
||||
return env
|
||||
|
||||
kwargs['make_env'] = make_env
|
||||
tmp_env = cached_make_env(kwargs['make_env'])
|
||||
assert hasattr(tmp_env, '_max_episode_steps')
|
||||
kwargs['T'] = tmp_env._max_episode_steps
|
||||
tmp_env.reset()
|
||||
|
||||
kwargs['max_u'] = np.array(kwargs['max_u']) if isinstance(kwargs['max_u'], list) else kwargs['max_u']
|
||||
kwargs['gamma'] = 1. - 1. / kwargs['T']
|
||||
if 'lr' in kwargs:
|
||||
|
@@ -1,18 +1,5 @@
|
||||
import gym
|
||||
import time
|
||||
import random
|
||||
import numpy as np
|
||||
import rospy
|
||||
import roslaunch
|
||||
|
||||
from random import randint
|
||||
from std_srvs.srv import Empty
|
||||
from sensor_msgs.msg import JointState
|
||||
from geometry_msgs.msg import PoseStamped
|
||||
from geometry_msgs.msg import Pose
|
||||
from std_msgs.msg import Float64
|
||||
from controller_manager_msgs.srv import SwitchController
|
||||
from gym.utils import seeding
|
||||
|
||||
|
||||
"""Data generation for the case of a single block pick and place in Fetch Env"""
|
||||
@@ -22,7 +9,7 @@ observations = []
|
||||
infos = []
|
||||
|
||||
def main():
|
||||
env = gym.make('FetchPickAndPlace-v0')
|
||||
env = gym.make('FetchPickAndPlace-v1')
|
||||
numItr = 100
|
||||
initStateSpace = "random"
|
||||
env.reset()
|
||||
@@ -31,21 +18,19 @@ def main():
|
||||
obs = env.reset()
|
||||
print("ITERATION NUMBER ", len(actions))
|
||||
goToGoal(env, obs)
|
||||
|
||||
|
||||
|
||||
fileName = "data_fetch"
|
||||
fileName += "_" + initStateSpace
|
||||
fileName += "_" + str(numItr)
|
||||
fileName += ".npz"
|
||||
|
||||
|
||||
np.savez_compressed(fileName, acs=actions, obs=observations, info=infos) # save the file
|
||||
|
||||
def goToGoal(env, lastObs):
|
||||
|
||||
goal = lastObs['desired_goal']
|
||||
objectPos = lastObs['observation'][3:6]
|
||||
gripperPos = lastObs['observation'][:3]
|
||||
gripperState = lastObs['observation'][9:11]
|
||||
object_rel_pos = lastObs['observation'][6:9]
|
||||
episodeAcs = []
|
||||
episodeObs = []
|
||||
@@ -53,7 +38,7 @@ def goToGoal(env, lastObs):
|
||||
|
||||
object_oriented_goal = object_rel_pos.copy()
|
||||
object_oriented_goal[2] += 0.03 # first make the gripper go slightly above the object
|
||||
|
||||
|
||||
timeStep = 0 #count the total number of timesteps
|
||||
episodeObs.append(lastObs)
|
||||
|
||||
@@ -76,8 +61,6 @@ def goToGoal(env, lastObs):
|
||||
episodeObs.append(obsDataNew)
|
||||
|
||||
objectPos = obsDataNew['observation'][3:6]
|
||||
gripperPos = obsDataNew['observation'][:3]
|
||||
gripperState = obsDataNew['observation'][9:11]
|
||||
object_rel_pos = obsDataNew['observation'][6:9]
|
||||
|
||||
while np.linalg.norm(object_rel_pos) >= 0.005 and timeStep <= env._max_episode_steps :
|
||||
@@ -96,8 +79,6 @@ def goToGoal(env, lastObs):
|
||||
episodeObs.append(obsDataNew)
|
||||
|
||||
objectPos = obsDataNew['observation'][3:6]
|
||||
gripperPos = obsDataNew['observation'][:3]
|
||||
gripperState = obsDataNew['observation'][9:11]
|
||||
object_rel_pos = obsDataNew['observation'][6:9]
|
||||
|
||||
|
||||
@@ -117,8 +98,6 @@ def goToGoal(env, lastObs):
|
||||
episodeObs.append(obsDataNew)
|
||||
|
||||
objectPos = obsDataNew['observation'][3:6]
|
||||
gripperPos = obsDataNew['observation'][:3]
|
||||
gripperState = obsDataNew['observation'][9:11]
|
||||
object_rel_pos = obsDataNew['observation'][6:9]
|
||||
|
||||
while True: #limit the number of timesteps in the episode to a fixed duration
|
||||
@@ -134,8 +113,6 @@ def goToGoal(env, lastObs):
|
||||
episodeObs.append(obsDataNew)
|
||||
|
||||
objectPos = obsDataNew['observation'][3:6]
|
||||
gripperPos = obsDataNew['observation'][:3]
|
||||
gripperState = obsDataNew['observation'][9:11]
|
||||
object_rel_pos = obsDataNew['observation'][6:9]
|
||||
|
||||
if timeStep >= env._max_episode_steps: break
|
||||
|
@@ -1,3 +1,4 @@
+# DEPRECATED, use --play flag to baselines.run instead
 import click
 import numpy as np
 import pickle
@@ -1,3 +1,5 @@
+# DEPRECATED, use baselines.common.plot_util instead
+
 import os
 import matplotlib.pyplot as plt
 import numpy as np
@@ -1,194 +0,0 @@
|
||||
import os
|
||||
import sys
|
||||
|
||||
import click
|
||||
import numpy as np
|
||||
import json
|
||||
from mpi4py import MPI
|
||||
|
||||
from baselines import logger
|
||||
from baselines.common import set_global_seeds
|
||||
from baselines.common.mpi_moments import mpi_moments
|
||||
import baselines.her.experiment.config as config
|
||||
from baselines.her.rollout import RolloutWorker
|
||||
from baselines.her.util import mpi_fork
|
||||
|
||||
from subprocess import CalledProcessError
|
||||
|
||||
|
||||
def mpi_average(value):
|
||||
if value == []:
|
||||
value = [0.]
|
||||
if not isinstance(value, list):
|
||||
value = [value]
|
||||
return mpi_moments(np.array(value))[0]
|
||||
|
||||
|
||||
def train(policy, rollout_worker, evaluator,
|
||||
n_epochs, n_test_rollouts, n_cycles, n_batches, policy_save_interval,
|
||||
save_policies, demo_file, **kwargs):
|
||||
rank = MPI.COMM_WORLD.Get_rank()
|
||||
|
||||
latest_policy_path = os.path.join(logger.get_dir(), 'policy_latest.pkl')
|
||||
best_policy_path = os.path.join(logger.get_dir(), 'policy_best.pkl')
|
||||
periodic_policy_path = os.path.join(logger.get_dir(), 'policy_{}.pkl')
|
||||
|
||||
logger.info("Training...")
|
||||
best_success_rate = -1
|
||||
|
||||
if policy.bc_loss == 1: policy.initDemoBuffer(demo_file) #initialize demo buffer if training with demonstrations
|
||||
for epoch in range(n_epochs):
|
||||
# train
|
||||
rollout_worker.clear_history()
|
||||
for _ in range(n_cycles):
|
||||
episode = rollout_worker.generate_rollouts()
|
||||
policy.store_episode(episode)
|
||||
for _ in range(n_batches):
|
||||
policy.train()
|
||||
policy.update_target_net()
|
||||
|
||||
# test
|
||||
evaluator.clear_history()
|
||||
for _ in range(n_test_rollouts):
|
||||
evaluator.generate_rollouts()
|
||||
|
||||
# record logs
|
||||
logger.record_tabular('epoch', epoch)
|
||||
for key, val in evaluator.logs('test'):
|
||||
logger.record_tabular(key, mpi_average(val))
|
||||
for key, val in rollout_worker.logs('train'):
|
||||
logger.record_tabular(key, mpi_average(val))
|
||||
for key, val in policy.logs():
|
||||
logger.record_tabular(key, mpi_average(val))
|
||||
|
||||
if rank == 0:
|
||||
logger.dump_tabular()
|
||||
|
||||
# save the policy if it's better than the previous ones
|
||||
success_rate = mpi_average(evaluator.current_success_rate())
|
||||
if rank == 0 and success_rate >= best_success_rate and save_policies:
|
||||
best_success_rate = success_rate
|
||||
logger.info('New best success rate: {}. Saving policy to {} ...'.format(best_success_rate, best_policy_path))
|
||||
evaluator.save_policy(best_policy_path)
|
||||
evaluator.save_policy(latest_policy_path)
|
||||
if rank == 0 and policy_save_interval > 0 and epoch % policy_save_interval == 0 and save_policies:
|
||||
policy_path = periodic_policy_path.format(epoch)
|
||||
logger.info('Saving periodic policy to {} ...'.format(policy_path))
|
||||
evaluator.save_policy(policy_path)
|
||||
|
||||
# make sure that different threads have different seeds
|
||||
local_uniform = np.random.uniform(size=(1,))
|
||||
root_uniform = local_uniform.copy()
|
||||
MPI.COMM_WORLD.Bcast(root_uniform, root=0)
|
||||
if rank != 0:
|
||||
assert local_uniform[0] != root_uniform[0]
|
||||
|
||||
|
||||
def launch(
|
||||
env, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return,
|
||||
demo_file, override_params={}, save_policies=True
|
||||
):
|
||||
# Fork for multi-CPU MPI implementation.
|
||||
if num_cpu > 1:
|
||||
try:
|
||||
whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
|
||||
except CalledProcessError:
|
||||
# fancy version of mpi call failed, try simple version
|
||||
whoami = mpi_fork(num_cpu)
|
||||
|
||||
if whoami == 'parent':
|
||||
sys.exit(0)
|
||||
import baselines.common.tf_util as U
|
||||
U.single_threaded_session().__enter__()
|
||||
rank = MPI.COMM_WORLD.Get_rank()
|
||||
|
||||
# Configure logging
|
||||
if rank == 0:
|
||||
if logdir or logger.get_dir() is None:
|
||||
logger.configure(dir=logdir)
|
||||
else:
|
||||
logger.configure()
|
||||
logdir = logger.get_dir()
|
||||
assert logdir is not None
|
||||
os.makedirs(logdir, exist_ok=True)
|
||||
|
||||
# Seed everything.
|
||||
rank_seed = seed + 1000000 * rank
|
||||
set_global_seeds(rank_seed)
|
||||
|
||||
# Prepare params.
|
||||
params = config.DEFAULT_PARAMS
|
||||
params['env_name'] = env
|
||||
params['replay_strategy'] = replay_strategy
|
||||
if env in config.DEFAULT_ENV_PARAMS:
|
||||
params.update(config.DEFAULT_ENV_PARAMS[env]) # merge env-specific parameters in
|
||||
params.update(**override_params) # makes it possible to override any parameter
|
||||
with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
|
||||
json.dump(params, f)
|
||||
params = config.prepare_params(params)
|
||||
config.log_params(params, logger=logger)
|
||||
|
||||
if num_cpu == 1:
|
||||
logger.warn()
|
||||
logger.warn('*** Warning ***')
|
||||
logger.warn(
|
||||
'You are running HER with just a single MPI worker. This will work, but the ' +
|
||||
'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
|
||||
'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
|
||||
'are looking to reproduce those results, be aware of this. Please also refer to ' +
|
||||
'https://github.com/openai/baselines/issues/314 for further details.')
|
||||
logger.warn('****************')
|
||||
logger.warn()
|
||||
|
||||
dims = config.configure_dims(params)
|
||||
policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)
|
||||
|
||||
rollout_params = {
|
||||
'exploit': False,
|
||||
'use_target_net': False,
|
||||
'use_demo_states': True,
|
||||
'compute_Q': False,
|
||||
'T': params['T'],
|
||||
}
|
||||
|
||||
eval_params = {
|
||||
'exploit': True,
|
||||
'use_target_net': params['test_with_polyak'],
|
||||
'use_demo_states': False,
|
||||
'compute_Q': True,
|
||||
'T': params['T'],
|
||||
}
|
||||
|
||||
for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
|
||||
rollout_params[name] = params[name]
|
||||
eval_params[name] = params[name]
|
||||
|
||||
rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
|
||||
rollout_worker.seed(rank_seed)
|
||||
|
||||
evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
|
||||
evaluator.seed(rank_seed)
|
||||
|
||||
train(
|
||||
logdir=logdir, policy=policy, rollout_worker=rollout_worker,
|
||||
evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
|
||||
n_cycles=params['n_cycles'], n_batches=params['n_batches'],
|
||||
policy_save_interval=policy_save_interval, save_policies=save_policies, demo_file=demo_file)
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--env', type=str, default='FetchReach-v1', help='the name of the OpenAI Gym environment that you want to train on')
|
||||
@click.option('--logdir', type=str, default=None, help='the path to where logs and policy pickles should go. If not specified, creates a folder in /tmp/')
|
||||
@click.option('--n_epochs', type=int, default=50, help='the number of training epochs to run')
|
||||
@click.option('--num_cpu', type=int, default=1, help='the number of CPU cores to use (using MPI)')
|
||||
@click.option('--seed', type=int, default=0, help='the random seed used to seed both the environment and the training code')
|
||||
@click.option('--policy_save_interval', type=int, default=5, help='the interval with which policy pickles are saved. If set to 0, only the best and latest policy will be pickled.')
|
||||
@click.option('--replay_strategy', type=click.Choice(['future', 'none']), default='future', help='the HER replay strategy to be used. "future" uses HER, "none" disables HER.')
|
||||
@click.option('--clip_return', type=int, default=1, help='whether or not returns should be clipped')
|
||||
@click.option('--demo_file', type=str, default = 'PATH/TO/DEMO/DATA/FILE.npz', help='demo data file path')
|
||||
def main(**kwargs):
|
||||
launch(**kwargs)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
@@ -1,63 +1,193 @@
|
||||
import os
|
||||
|
||||
import click
|
||||
import numpy as np
|
||||
import json
|
||||
from mpi4py import MPI
|
||||
|
||||
from baselines import logger
|
||||
from baselines.common import set_global_seeds, tf_util
|
||||
from baselines.common.mpi_moments import mpi_moments
|
||||
import baselines.her.experiment.config as config
|
||||
from baselines.her.rollout import RolloutWorker
|
||||
|
||||
def mpi_average(value):
|
||||
if not isinstance(value, list):
|
||||
value = [value]
|
||||
if not any(value):
|
||||
value = [0.]
|
||||
return mpi_moments(np.array(value))[0]
|
||||
|
||||
|
||||
def make_sample_her_transitions(replay_strategy, replay_k, reward_fun):
|
||||
"""Creates a sample function that can be used for HER experience replay.
|
||||
def train(*, policy, rollout_worker, evaluator,
|
||||
n_epochs, n_test_rollouts, n_cycles, n_batches, policy_save_interval,
|
||||
save_path, demo_file, **kwargs):
|
||||
rank = MPI.COMM_WORLD.Get_rank()
|
||||
|
||||
Args:
|
||||
replay_strategy (in ['future', 'none']): the HER replay strategy; if set to 'none',
|
||||
regular DDPG experience replay is used
|
||||
replay_k (int): the ratio between HER replays and regular replays (e.g. k = 4 -> 4 times
|
||||
as many HER replays as regular replays are used)
|
||||
reward_fun (function): function to re-compute the reward with substituted goals
|
||||
"""
|
||||
if replay_strategy == 'future':
|
||||
future_p = 1 - (1. / (1 + replay_k))
|
||||
else: # 'replay_strategy' == 'none'
|
||||
future_p = 0
|
||||
if save_path:
|
||||
latest_policy_path = os.path.join(save_path, 'policy_latest.pkl')
|
||||
best_policy_path = os.path.join(save_path, 'policy_best.pkl')
|
||||
periodic_policy_path = os.path.join(save_path, 'policy_{}.pkl')
|
||||
|
||||
def _sample_her_transitions(episode_batch, batch_size_in_transitions):
|
||||
"""episode_batch is {key: array(buffer_size x T x dim_key)}
|
||||
"""
|
||||
    logger.info("Training...")
    best_success_rate = -1

    if policy.bc_loss == 1: policy.init_demo_buffer(demo_file) #initialize demo buffer if training with demonstrations

    # num_timesteps = n_epochs * n_cycles * rollout_length * number of rollout workers
    for epoch in range(n_epochs):
        # train
        rollout_worker.clear_history()
        for _ in range(n_cycles):
            episode = rollout_worker.generate_rollouts()
            policy.store_episode(episode)
            for _ in range(n_batches):
                policy.train()
            policy.update_target_net()

        # test
        evaluator.clear_history()
        for _ in range(n_test_rollouts):
            evaluator.generate_rollouts()

        # record logs
        logger.record_tabular('epoch', epoch)
        for key, val in evaluator.logs('test'):
            logger.record_tabular(key, mpi_average(val))
        for key, val in rollout_worker.logs('train'):
            logger.record_tabular(key, mpi_average(val))
        for key, val in policy.logs():
            logger.record_tabular(key, mpi_average(val))

        if rank == 0:
            logger.dump_tabular()

        # save the policy if it's better than the previous ones
        success_rate = mpi_average(evaluator.current_success_rate())
        if rank == 0 and success_rate >= best_success_rate and save_path:
            best_success_rate = success_rate
            logger.info('New best success rate: {}. Saving policy to {} ...'.format(best_success_rate, best_policy_path))
            evaluator.save_policy(best_policy_path)
            evaluator.save_policy(latest_policy_path)
        if rank == 0 and policy_save_interval > 0 and epoch % policy_save_interval == 0 and save_path:
            policy_path = periodic_policy_path.format(epoch)
            logger.info('Saving periodic policy to {} ...'.format(policy_path))
            evaluator.save_policy(policy_path)

        # make sure that different threads have different seeds
        local_uniform = np.random.uniform(size=(1,))
        root_uniform = local_uniform.copy()
        MPI.COMM_WORLD.Bcast(root_uniform, root=0)
        if rank != 0:
            assert local_uniform[0] != root_uniform[0]

    return policy

def learn(*, network, env, total_timesteps,
|
||||
seed=None,
|
||||
eval_env=None,
|
||||
replay_strategy='future',
|
||||
policy_save_interval=5,
|
||||
clip_return=True,
|
||||
demo_file=None,
|
||||
override_params=None,
|
||||
load_path=None,
|
||||
save_path=None,
|
||||
**kwargs
|
||||
):
|
||||
|
||||
override_params = override_params or {}
|
||||
if MPI is not None:
|
||||
rank = MPI.COMM_WORLD.Get_rank()
|
||||
num_cpu = MPI.COMM_WORLD.Get_size()
|
||||
|
||||
# Seed everything.
|
||||
rank_seed = seed + 1000000 * rank if seed is not None else None
|
||||
set_global_seeds(rank_seed)
|
||||
|
||||
# Prepare params.
|
||||
params = config.DEFAULT_PARAMS
|
||||
env_name = env.specs[0].id
|
||||
params['env_name'] = env_name
|
||||
params['replay_strategy'] = replay_strategy
|
||||
if env_name in config.DEFAULT_ENV_PARAMS:
|
||||
params.update(config.DEFAULT_ENV_PARAMS[env_name]) # merge env-specific parameters in
|
||||
params.update(**override_params) # makes it possible to override any parameter
|
||||
with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
|
||||
json.dump(params, f)
|
||||
params = config.prepare_params(params)
|
||||
params['rollout_batch_size'] = env.num_envs
|
||||
|
||||
if demo_file is not None:
|
||||
params['bc_loss'] = 1
|
||||
params.update(kwargs)
|
||||
|
||||
config.log_params(params, logger=logger)
|
||||
|
||||
if num_cpu == 1:
|
||||
logger.warn()
|
||||
logger.warn('*** Warning ***')
|
||||
logger.warn(
|
||||
'You are running HER with just a single MPI worker. This will work, but the ' +
|
||||
'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
|
||||
'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
|
||||
'are looking to reproduce those results, be aware of this. Please also refer to ' +
|
||||
'https://github.com/openai/baselines/issues/314 for further details.')
|
||||
logger.warn('****************')
|
||||
logger.warn()
|
||||
|
||||
dims = config.configure_dims(params)
|
||||
policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)
|
||||
if load_path is not None:
|
||||
tf_util.load_variables(load_path)
|
||||
|
||||
rollout_params = {
|
||||
'exploit': False,
|
||||
'use_target_net': False,
|
||||
'use_demo_states': True,
|
||||
'compute_Q': False,
|
||||
'T': params['T'],
|
||||
}
|
||||
|
||||
eval_params = {
|
||||
'exploit': True,
|
||||
'use_target_net': params['test_with_polyak'],
|
||||
'use_demo_states': False,
|
||||
'compute_Q': True,
|
||||
'T': params['T'],
|
||||
}
|
||||
|
||||
for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
|
||||
rollout_params[name] = params[name]
|
||||
eval_params[name] = params[name]
|
||||
|
||||
eval_env = eval_env or env
|
||||
|
||||
rollout_worker = RolloutWorker(env, policy, dims, logger, monitor=True, **rollout_params)
|
||||
evaluator = RolloutWorker(eval_env, policy, dims, logger, **eval_params)
|
||||
|
||||
n_cycles = params['n_cycles']
|
||||
n_epochs = total_timesteps // n_cycles // rollout_worker.T // rollout_worker.rollout_batch_size
|
||||
|
||||
return train(
|
||||
save_path=save_path, policy=policy, rollout_worker=rollout_worker,
|
||||
evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
|
||||
n_cycles=params['n_cycles'], n_batches=params['n_batches'],
|
||||
policy_save_interval=policy_save_interval, demo_file=demo_file)
|
||||
|
||||
|
||||
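As a rough illustration of the `n_epochs` arithmetic above (the `num_timesteps = n_epochs * n_cycles * rollout_length * number of rollout workers` relation noted in `train`), here is a back-of-the-envelope sketch. The `n_cycles`, `T`, and `rollout_batch_size` values are assumed typical Fetch-style defaults, not values read from this changeset:

```python
# Hypothetical illustration only: n_cycles, T and rollout_batch_size below are
# assumed Fetch-style defaults, not values read from this changeset.
total_timesteps = int(5e5)   # the --total_timesteps CLI default above
n_cycles = 50                # assumed cycles per epoch (config.DEFAULT_PARAMS)
T = 50                       # assumed episode horizon for Fetch tasks
rollout_batch_size = 1       # env.num_envs for a single environment

n_epochs = total_timesteps // n_cycles // T // rollout_batch_size
print(n_epochs)  # -> 200 epochs under these assumptions
```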
@click.command()
|
||||
@click.option('--env', type=str, default='FetchReach-v1', help='the name of the OpenAI Gym environment that you want to train on')
|
||||
@click.option('--total_timesteps', type=int, default=int(5e5), help='the number of timesteps to run')
|
||||
@click.option('--seed', type=int, default=0, help='the random seed used to seed both the environment and the training code')
|
||||
@click.option('--policy_save_interval', type=int, default=5, help='the interval with which policy pickles are saved. If set to 0, only the best and latest policy will be pickled.')
|
||||
@click.option('--replay_strategy', type=click.Choice(['future', 'none']), default='future', help='the HER replay strategy to be used. "future" uses HER, "none" disables HER.')
|
||||
@click.option('--clip_return', type=int, default=1, help='whether or not returns should be clipped')
|
||||
@click.option('--demo_file', type=str, default = 'PATH/TO/DEMO/DATA/FILE.npz', help='demo data file path')
|
||||
def main(**kwargs):
|
||||
learn(**kwargs)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
baselines/her/her_sampler.py (new file, 63 lines)
@@ -0,0 +1,63 @@
|
||||
import numpy as np
|
||||
|
||||
|
||||
def make_sample_her_transitions(replay_strategy, replay_k, reward_fun):
|
||||
"""Creates a sample function that can be used for HER experience replay.
|
||||
|
||||
Args:
|
||||
replay_strategy (in ['future', 'none']): the HER replay strategy; if set to 'none',
|
||||
regular DDPG experience replay is used
|
||||
replay_k (int): the ratio between HER replays and regular replays (e.g. k = 4 -> 4 times
|
||||
as many HER replays as regular replays are used)
|
||||
reward_fun (function): function to re-compute the reward with substituted goals
|
||||
"""
|
||||
if replay_strategy == 'future':
|
||||
future_p = 1 - (1. / (1 + replay_k))
|
||||
else: # 'replay_strategy' == 'none'
|
||||
future_p = 0
|
||||
|
||||
def _sample_her_transitions(episode_batch, batch_size_in_transitions):
|
||||
"""episode_batch is {key: array(buffer_size x T x dim_key)}
|
||||
"""
|
||||
T = episode_batch['u'].shape[1]
|
||||
rollout_batch_size = episode_batch['u'].shape[0]
|
||||
batch_size = batch_size_in_transitions
|
||||
|
||||
# Select which episodes and time steps to use.
|
||||
episode_idxs = np.random.randint(0, rollout_batch_size, batch_size)
|
||||
t_samples = np.random.randint(T, size=batch_size)
|
||||
transitions = {key: episode_batch[key][episode_idxs, t_samples].copy()
|
||||
for key in episode_batch.keys()}
|
||||
|
||||
# Select future time indexes proportional with probability future_p. These
|
||||
# will be used for HER replay by substituting in future goals.
|
||||
her_indexes = np.where(np.random.uniform(size=batch_size) < future_p)
|
||||
future_offset = np.random.uniform(size=batch_size) * (T - t_samples)
|
||||
future_offset = future_offset.astype(int)
|
||||
future_t = (t_samples + 1 + future_offset)[her_indexes]
|
||||
|
||||
# Replace goal with achieved goal but only for the previously-selected
|
||||
# HER transitions (as defined by her_indexes). For the other transitions,
|
||||
# keep the original goal.
|
||||
future_ag = episode_batch['ag'][episode_idxs[her_indexes], future_t]
|
||||
transitions['g'][her_indexes] = future_ag
|
||||
|
||||
# Reconstruct info dictionary for reward computation.
|
||||
info = {}
|
||||
for key, value in transitions.items():
|
||||
if key.startswith('info_'):
|
||||
info[key.replace('info_', '')] = value
|
||||
|
||||
# Re-compute reward since we may have substituted the goal.
|
||||
reward_params = {k: transitions[k] for k in ['ag_2', 'g']}
|
||||
reward_params['info'] = info
|
||||
transitions['r'] = reward_fun(**reward_params)
|
||||
|
||||
transitions = {k: transitions[k].reshape(batch_size, *transitions[k].shape[1:])
|
||||
for k in transitions.keys()}
|
||||
|
||||
assert(transitions['u'].shape[0] == batch_size_in_transitions)
|
||||
|
||||
return transitions
|
||||
|
||||
return _sample_her_transitions
|
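To make the `future_p` formula and the goal-relabeling indices above concrete, here is a small standalone sketch; the numbers are arbitrary and only the index relationships mirror the sampler code:

```python
import numpy as np

# With replay_k = 4, future_p = 1 - 1/(1 + 4) = 0.8, i.e. roughly 4 out of every
# 5 sampled transitions get their goal relabeled with a future achieved goal.
replay_k = 4
future_p = 1 - (1. / (1 + replay_k))
assert abs(future_p - 0.8) < 1e-12

# Toy index bookkeeping for a batch of 10 transitions from episodes of length T = 50.
T, batch_size = 50, 10
t_samples = np.random.randint(T, size=batch_size)
her_indexes = np.where(np.random.uniform(size=batch_size) < future_p)
future_offset = (np.random.uniform(size=batch_size) * (T - t_samples)).astype(int)
future_t = (t_samples + 1 + future_offset)[her_indexes]

# The relabeled goal always comes from a strictly later step of the same episode
# and never runs past the end of the achieved-goal array (which has T + 1 entries).
assert (future_t >= t_samples[her_indexes] + 1).all() and (future_t <= T).all()
```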
@@ -2,7 +2,6 @@ from collections import deque
|
||||
|
||||
import numpy as np
|
||||
import pickle
|
||||
from mujoco_py import MujocoException
|
||||
|
||||
from baselines.her.util import convert_episode_to_batch_major, store_args
|
||||
|
||||
@@ -10,9 +9,9 @@ from baselines.her.util import convert_episode_to_batch_major, store_args
|
||||
class RolloutWorker:
|
||||
|
||||
@store_args
|
||||
def __init__(self, make_env, policy, dims, logger, T, rollout_batch_size=1,
|
||||
def __init__(self, venv, policy, dims, logger, T, rollout_batch_size=1,
|
||||
exploit=False, use_target_net=False, compute_Q=False, noise_eps=0,
|
||||
random_eps=0, history_len=100, render=False, **kwargs):
|
||||
random_eps=0, history_len=100, render=False, monitor=False, **kwargs):
|
||||
"""Rollout worker generates experience by interacting with one or many environments.
|
||||
|
||||
Args:
|
||||
@@ -31,7 +30,7 @@ class RolloutWorker:
|
||||
history_len (int): length of history for statistics smoothing
|
||||
render (boolean): whether or not to render the rollouts
|
||||
"""
|
||||
self.envs = [make_env() for _ in range(rollout_batch_size)]
|
||||
|
||||
assert self.T > 0
|
||||
|
||||
self.info_keys = [key.replace('info_', '') for key in dims.keys() if key.startswith('info_')]
|
||||
@@ -40,26 +39,14 @@ class RolloutWorker:
|
||||
self.Q_history = deque(maxlen=history_len)
|
||||
|
||||
self.n_episodes = 0
|
||||
self.g = np.empty((self.rollout_batch_size, self.dims['g']), np.float32) # goals
|
||||
self.initial_o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32) # observations
|
||||
self.initial_ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32) # achieved goals
|
||||
self.reset_all_rollouts()
|
||||
self.clear_history()
|
||||
|
||||
def reset_rollout(self, i):
|
||||
"""Resets the `i`-th rollout environment, re-samples a new goal, and updates the `initial_o`
|
||||
and `g` arrays accordingly.
|
||||
"""
|
||||
obs = self.envs[i].reset()
|
||||
self.initial_o[i] = obs['observation']
|
||||
self.initial_ag[i] = obs['achieved_goal']
|
||||
self.g[i] = obs['desired_goal']
|
||||
|
||||
def reset_all_rollouts(self):
|
||||
"""Resets all `rollout_batch_size` rollout workers.
|
||||
"""
|
||||
for i in range(self.rollout_batch_size):
|
||||
self.reset_rollout(i)
|
||||
self.obs_dict = self.venv.reset()
|
||||
self.initial_o = self.obs_dict['observation']
|
||||
self.initial_ag = self.obs_dict['achieved_goal']
|
||||
self.g = self.obs_dict['desired_goal']
|
||||
|
||||
def generate_rollouts(self):
|
||||
"""Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current
|
||||
@@ -75,7 +62,8 @@ class RolloutWorker:
|
||||
|
||||
# generate episodes
|
||||
obs, achieved_goals, acts, goals, successes = [], [], [], [], []
|
||||
info_values = [np.empty((self.T, self.rollout_batch_size, self.dims['info_' + key]), np.float32) for key in self.info_keys]
|
||||
dones = []
|
||||
info_values = [np.empty((self.T - 1, self.rollout_batch_size, self.dims['info_' + key]), np.float32) for key in self.info_keys]
|
||||
Qs = []
|
||||
for t in range(self.T):
|
||||
policy_output = self.policy.get_actions(
|
||||
@@ -99,27 +87,27 @@ class RolloutWorker:
|
||||
ag_new = np.empty((self.rollout_batch_size, self.dims['g']))
|
||||
success = np.zeros(self.rollout_batch_size)
|
||||
# compute new states and observations
|
||||
for i in range(self.rollout_batch_size):
|
||||
try:
|
||||
# We fully ignore the reward here because it will have to be re-computed
|
||||
# for HER.
|
||||
curr_o_new, _, _, info = self.envs[i].step(u[i])
|
||||
if 'is_success' in info:
|
||||
success[i] = info['is_success']
|
||||
o_new[i] = curr_o_new['observation']
|
||||
ag_new[i] = curr_o_new['achieved_goal']
|
||||
for idx, key in enumerate(self.info_keys):
|
||||
info_values[idx][t, i] = info[key]
|
||||
if self.render:
|
||||
self.envs[i].render()
|
||||
except MujocoException as e:
|
||||
return self.generate_rollouts()
|
||||
obs_dict_new, _, done, info = self.venv.step(u)
|
||||
o_new = obs_dict_new['observation']
|
||||
ag_new = obs_dict_new['achieved_goal']
|
||||
success = np.array([i.get('is_success', 0.0) for i in info])
|
||||
|
||||
if any(done):
|
||||
# here we assume all environments finish in roughly the same number of steps, so we terminate the rollout whenever any of the envs returns done
# the trick with vecenvs is to not add the obs from environments that are "done", because those are already observations after a reset
|
||||
break
|
||||
|
||||
for i, info_dict in enumerate(info):
|
||||
for idx, key in enumerate(self.info_keys):
|
||||
info_values[idx][t, i] = info[i][key]
|
||||
|
||||
if np.isnan(o_new).any():
|
||||
self.logger.warn('NaN caught during rollout generation. Trying again...')
|
||||
self.reset_all_rollouts()
|
||||
return self.generate_rollouts()
|
||||
|
||||
dones.append(done)
|
||||
obs.append(o.copy())
|
||||
achieved_goals.append(ag.copy())
|
||||
successes.append(success.copy())
|
||||
@@ -129,7 +117,6 @@ class RolloutWorker:
|
||||
ag[...] = ag_new
|
||||
obs.append(o.copy())
|
||||
achieved_goals.append(ag.copy())
|
||||
self.initial_o[:] = o
|
||||
|
||||
episode = dict(o=obs,
|
||||
u=acts,
|
||||
@@ -181,8 +168,3 @@ class RolloutWorker:
|
||||
else:
|
||||
return logs
|
||||
|
||||
def seed(self, seed):
|
||||
"""Seeds each environment with a distinct seed derived from the passed in global seed.
|
||||
"""
|
||||
for idx, env in enumerate(self.envs):
|
||||
env.seed(seed + 1000 * idx)
|
||||
|
@@ -54,7 +54,7 @@ class HumanOutputFormat(KVWriter, SeqWriter):
|
||||
# Write out the data
|
||||
dashes = '-' * (keywidth + valwidth + 7)
|
||||
lines = [dashes]
|
||||
for (key, val) in sorted(key2str.items()):
|
||||
for (key, val) in sorted(key2str.items(), key=lambda kv: kv[0].lower()):
|
||||
lines.append('| %s%s | %s%s |' % (
|
||||
key,
|
||||
' ' * (keywidth - len(key)),
|
||||
|
@@ -97,7 +97,7 @@ def learn(env, policy_fn, *,
|
||||
ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return
|
||||
|
||||
lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule
|
||||
clip_param = clip_param * lrmult # Annealed cliping parameter epislon
|
||||
clip_param = clip_param * lrmult # Annealed clipping parameter epsilon
|
||||
|
||||
ob = U.get_placeholder_cached(name="ob")
|
||||
ac = pi.pdtype.sample_placeholder([None])
|
||||
|
baselines/ppo2/microbatched_model.py (new file, 76 lines)
@@ -0,0 +1,76 @@
|
||||
import tensorflow as tf
|
||||
import numpy as np
|
||||
from baselines.ppo2.model import Model
|
||||
|
||||
class MicrobatchedModel(Model):
|
||||
"""
|
||||
Model that does training one microbatch at a time - useful when computing the gradient
on the entire minibatch at once does not fit in memory
|
||||
"""
|
||||
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
|
||||
nsteps, ent_coef, vf_coef, max_grad_norm, microbatch_size):
|
||||
|
||||
self.nmicrobatches = nbatch_train // microbatch_size
|
||||
self.microbatch_size = microbatch_size
|
||||
assert nbatch_train % microbatch_size == 0, 'microbatch_size ({}) should divide nbatch_train ({}) evenly'.format(microbatch_size, nbatch_train)
|
||||
|
||||
super().__init__(
|
||||
policy=policy,
|
||||
ob_space=ob_space,
|
||||
ac_space=ac_space,
|
||||
nbatch_act=nbatch_act,
|
||||
nbatch_train=microbatch_size,
|
||||
nsteps=nsteps,
|
||||
ent_coef=ent_coef,
|
||||
vf_coef=vf_coef,
|
||||
max_grad_norm=max_grad_norm)
|
||||
|
||||
self.grads_ph = [tf.placeholder(dtype=g.dtype, shape=g.shape) for g in self.grads]
|
||||
grads_ph_and_vars = list(zip(self.grads_ph, self.var))
|
||||
self._apply_gradients_op = self.trainer.apply_gradients(grads_ph_and_vars)
|
||||
|
||||
|
||||
def train(self, lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
|
||||
assert states is None, "microbatches with recurrent models are not supported yet"
|
||||
|
||||
# Here we calculate advantage A(s,a) = R + yV(s') - V(s)
|
||||
# Returns = R + yV(s')
|
||||
advs = returns - values
|
||||
|
||||
# Normalize the advantages
|
||||
advs = (advs - advs.mean()) / (advs.std() + 1e-8)
|
||||
|
||||
# Initialize empty list for per-microbatch stats like pg_loss, vf_loss, entropy, approxkl (whatever is in self.stats_list)
|
||||
stats_vs = []
|
||||
|
||||
for microbatch_idx in range(self.nmicrobatches):
|
||||
_sli = range(microbatch_idx * self.microbatch_size, (microbatch_idx+1) * self.microbatch_size)
|
||||
td_map = {
|
||||
self.train_model.X: obs[_sli],
|
||||
self.A:actions[_sli],
|
||||
self.ADV:advs[_sli],
|
||||
self.R:returns[_sli],
|
||||
self.CLIPRANGE:cliprange,
|
||||
self.OLDNEGLOGPAC:neglogpacs[_sli],
|
||||
self.OLDVPRED:values[_sli]
|
||||
}
|
||||
|
||||
# Compute gradient on a microbatch (note that variables do not change here) ...
|
||||
grad_v, stats_v = self.sess.run([self.grads, self.stats_list], td_map)
|
||||
if microbatch_idx == 0:
|
||||
sum_grad_v = grad_v
|
||||
else:
|
||||
# .. and add to the total of the gradients
|
||||
for i, g in enumerate(grad_v):
|
||||
sum_grad_v[i] += g
|
||||
stats_vs.append(stats_v)
|
||||
|
||||
feed_dict = {ph: sum_g / self.nmicrobatches for ph, sum_g in zip(self.grads_ph, sum_grad_v)}
|
||||
feed_dict[self.LR] = lr
|
||||
# Update variables using average of the gradients
|
||||
self.sess.run(self._apply_gradients_op, feed_dict)
|
||||
# Return average of the stats
|
||||
return np.mean(np.array(stats_vs), axis=0).tolist()
|
||||
|
||||
|
||||
|
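The microbatched update above relies on the fact that averaging per-microbatch mean gradients reproduces the full-minibatch mean gradient when the microbatches are equal-sized (which the assert guarantees). A tiny NumPy check of that identity, using a made-up quadratic loss rather than the PPO objective:

```python
import numpy as np

np.random.seed(0)
x = np.random.randn(8, 3)            # a "minibatch" of 8 samples with 3 features
w = np.random.randn(3)

# Mean gradient of the toy loss mean((x @ w)**2) over the full minibatch.
full_grad = 2 * x.T @ (x @ w) / len(x)

# Mean gradient computed per microbatch, then averaged across microbatches.
microbatch_size = 2
micro_grads = []
for i in range(0, len(x), microbatch_size):
    xb = x[i:i + microbatch_size]
    micro_grads.append(2 * xb.T @ (xb @ w) / len(xb))

np.testing.assert_allclose(np.mean(micro_grads, axis=0), full_grad)
```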
baselines/ppo2/model.py (new file, 156 lines)
@@ -0,0 +1,156 @@
|
||||
import tensorflow as tf
|
||||
import functools
|
||||
|
||||
from baselines.common.tf_util import get_session, save_variables, load_variables
|
||||
from baselines.common.tf_util import initialize
|
||||
|
||||
try:
|
||||
from baselines.common.mpi_adam_optimizer import MpiAdamOptimizer
|
||||
from mpi4py import MPI
|
||||
from baselines.common.mpi_util import sync_from_root
|
||||
except ImportError:
|
||||
MPI = None
|
||||
|
||||
class Model(object):
|
||||
"""
|
||||
We use this object to :
|
||||
__init__:
|
||||
- Creates the step_model
|
||||
- Creates the train_model
|
||||
|
||||
train():
|
||||
- Make the training part (feedforward and backpropagation of gradients)

save/load():
- Save / load the model
|
||||
"""
|
||||
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
|
||||
nsteps, ent_coef, vf_coef, max_grad_norm, microbatch_size=None):
|
||||
self.sess = sess = get_session()
|
||||
|
||||
with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
|
||||
# CREATE OUR TWO MODELS
|
||||
# act_model that is used for sampling
|
||||
act_model = policy(nbatch_act, 1, sess)
|
||||
|
||||
# Train model for training
|
||||
if microbatch_size is None:
|
||||
train_model = policy(nbatch_train, nsteps, sess)
|
||||
else:
|
||||
train_model = policy(microbatch_size, nsteps, sess)
|
||||
|
||||
# CREATE THE PLACEHOLDERS
|
||||
self.A = A = train_model.pdtype.sample_placeholder([None])
|
||||
self.ADV = ADV = tf.placeholder(tf.float32, [None])
|
||||
self.R = R = tf.placeholder(tf.float32, [None])
|
||||
# Keep track of old actor
|
||||
self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
|
||||
# Keep track of old critic
|
||||
self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
|
||||
self.LR = LR = tf.placeholder(tf.float32, [])
|
||||
# Cliprange
|
||||
self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])
|
||||
|
||||
neglogpac = train_model.pd.neglogp(A)
|
||||
|
||||
# Calculate the entropy
|
||||
# Entropy is used to improve exploration by limiting premature convergence to a suboptimal policy.
|
||||
entropy = tf.reduce_mean(train_model.pd.entropy())
|
||||
|
||||
# CALCULATE THE LOSS
|
||||
# Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss
|
||||
|
||||
# Clip the value to reduce variability during Critic training
|
||||
# Get the predicted value
|
||||
vpred = train_model.vf
|
||||
vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE)
|
||||
# Unclipped value
|
||||
vf_losses1 = tf.square(vpred - R)
|
||||
# Clipped value
|
||||
vf_losses2 = tf.square(vpredclipped - R)
|
||||
|
||||
vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
|
||||
|
||||
# Calculate ratio (pi current policy / pi old policy)
|
||||
ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
|
||||
|
||||
# Defining Loss = - J is equivalent to max J
|
||||
pg_losses = -ADV * ratio
|
||||
|
||||
pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)
|
||||
|
||||
# Final PG loss
|
||||
pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
|
||||
approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
|
||||
clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
|
||||
|
||||
# Total loss
|
||||
loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
|
||||
|
||||
# UPDATE THE PARAMETERS USING LOSS
|
||||
# 1. Get the model parameters
|
||||
params = tf.trainable_variables('ppo2_model')
|
||||
# 2. Build our trainer
|
||||
if MPI is not None:
|
||||
self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5)
|
||||
else:
|
||||
self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
|
||||
# 3. Calculate the gradients
|
||||
grads_and_var = self.trainer.compute_gradients(loss, params)
|
||||
grads, var = zip(*grads_and_var)
|
||||
|
||||
if max_grad_norm is not None:
|
||||
# Clip the gradients (normalize)
|
||||
grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
|
||||
grads_and_var = list(zip(grads, var))
|
||||
# zip pairs each gradient with its associated parameter
|
||||
# For instance zip(ABCD, xyza) => Ax, By, Cz, Da
|
||||
|
||||
self.grads = grads
|
||||
self.var = var
|
||||
self._train_op = self.trainer.apply_gradients(grads_and_var)
|
||||
self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
|
||||
self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]
|
||||
|
||||
|
||||
self.train_model = train_model
|
||||
self.act_model = act_model
|
||||
self.step = act_model.step
|
||||
self.value = act_model.value
|
||||
self.initial_state = act_model.initial_state
|
||||
|
||||
self.save = functools.partial(save_variables, sess=sess)
|
||||
self.load = functools.partial(load_variables, sess=sess)
|
||||
|
||||
initialize()
|
||||
global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")
|
||||
if MPI is not None:
|
||||
sync_from_root(sess, global_variables) #pylint: disable=E1101
|
||||
|
||||
def train(self, lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
|
||||
# Here we calculate advantage A(s,a) = R + yV(s') - V(s)
|
||||
# Returns = R + yV(s')
|
||||
advs = returns - values
|
||||
|
||||
# Normalize the advantages
|
||||
advs = (advs - advs.mean()) / (advs.std() + 1e-8)
|
||||
|
||||
td_map = {
|
||||
self.train_model.X : obs,
|
||||
self.A : actions,
|
||||
self.ADV : advs,
|
||||
self.R : returns,
|
||||
self.LR : lr,
|
||||
self.CLIPRANGE : cliprange,
|
||||
self.OLDNEGLOGPAC : neglogpacs,
|
||||
self.OLDVPRED : values
|
||||
}
|
||||
if states is not None:
|
||||
td_map[self.train_model.S] = states
|
||||
td_map[self.train_model.M] = masks
|
||||
|
||||
return self.sess.run(
|
||||
self.stats_list + [self._train_op],
|
||||
td_map
|
||||
)[:-1]
|
||||
|
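For reference, the clipped surrogate objective assembled above can be sketched outside TensorFlow. The following NumPy version mirrors the `ratio`/`pg_loss` construction; the inputs in the example are made up:

```python
import numpy as np

def ppo_pg_loss(adv, neglogpac, old_neglogpac, cliprange):
    """NumPy mirror of the pg_loss construction above (illustrative only)."""
    ratio = np.exp(old_neglogpac - neglogpac)   # pi_new(a|s) / pi_old(a|s)
    unclipped = -adv * ratio
    clipped = -adv * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)
    return np.mean(np.maximum(unclipped, clipped))

# Made-up numbers: the action's probability grew a lot (ratio ~ 2.2); with a positive
# advantage the unclipped term would be ~ -2.23, but taking the max with the clipped
# term caps the loss at ~ -1.2, limiting the incentive to move even further.
print(ppo_pg_loss(adv=np.array([1.0]), neglogpac=np.array([0.1]),
                  old_neglogpac=np.array([0.9]), cliprange=0.2))  # ~ -1.2
```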
@@ -1,226 +1,17 @@
|
||||
import os
|
||||
import time
|
||||
import functools
|
||||
import numpy as np
|
||||
import os.path as osp
|
||||
import tensorflow as tf
|
||||
from baselines import logger
|
||||
from collections import deque
|
||||
from baselines.common import explained_variance, set_global_seeds
|
||||
from baselines.common.policies import build_policy
|
||||
from baselines.common.runners import AbstractEnvRunner
|
||||
from baselines.common.tf_util import get_session, save_variables, load_variables
|
||||
|
||||
try:
|
||||
from baselines.common.mpi_adam_optimizer import MpiAdamOptimizer
|
||||
from mpi4py import MPI
|
||||
from baselines.common.mpi_util import sync_from_root
|
||||
except ImportError:
|
||||
MPI = None
|
||||
from baselines.ppo2.runner import Runner
|
||||
|
||||
from baselines.common.tf_util import initialize
|
||||
|
||||
class Model(object):
|
||||
"""
|
||||
We use this object to :
|
||||
__init__:
|
||||
- Creates the step_model
|
||||
- Creates the train_model
|
||||
|
||||
train():
|
||||
- Make the training part (feedforward and retropropagation of gradients)
|
||||
|
||||
save/load():
|
||||
- Save load the model
|
||||
"""
|
||||
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
|
||||
nsteps, ent_coef, vf_coef, max_grad_norm):
|
||||
sess = get_session()
|
||||
|
||||
with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
|
||||
# CREATE OUR TWO MODELS
|
||||
# act_model that is used for sampling
|
||||
act_model = policy(nbatch_act, 1, sess)
|
||||
|
||||
# Train model for training
|
||||
train_model = policy(nbatch_train, nsteps, sess)
|
||||
|
||||
# CREATE THE PLACEHOLDERS
|
||||
A = train_model.pdtype.sample_placeholder([None])
|
||||
ADV = tf.placeholder(tf.float32, [None])
|
||||
R = tf.placeholder(tf.float32, [None])
|
||||
# Keep track of old actor
|
||||
OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
|
||||
# Keep track of old critic
|
||||
OLDVPRED = tf.placeholder(tf.float32, [None])
|
||||
LR = tf.placeholder(tf.float32, [])
|
||||
# Cliprange
|
||||
CLIPRANGE = tf.placeholder(tf.float32, [])
|
||||
|
||||
neglogpac = train_model.pd.neglogp(A)
|
||||
|
||||
# Calculate the entropy
|
||||
# Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
|
||||
entropy = tf.reduce_mean(train_model.pd.entropy())
|
||||
|
||||
# CALCULATE THE LOSS
|
||||
# Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss
|
||||
|
||||
# Clip the value to reduce variability during Critic training
|
||||
# Get the predicted value
|
||||
vpred = train_model.vf
|
||||
vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE)
|
||||
# Unclipped value
|
||||
vf_losses1 = tf.square(vpred - R)
|
||||
# Clipped value
|
||||
vf_losses2 = tf.square(vpredclipped - R)
|
||||
|
||||
vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
|
||||
|
||||
# Calculate ratio (pi current policy / pi old policy)
|
||||
ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
|
||||
|
||||
# Defining Loss = - J is equivalent to max J
|
||||
pg_losses = -ADV * ratio
|
||||
|
||||
pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)
|
||||
|
||||
# Final PG loss
|
||||
pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
|
||||
approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
|
||||
clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
|
||||
|
||||
# Total loss
|
||||
loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
|
||||
|
||||
# UPDATE THE PARAMETERS USING LOSS
|
||||
# 1. Get the model parameters
|
||||
params = tf.trainable_variables('ppo2_model')
|
||||
# 2. Build our trainer
|
||||
if MPI is not None:
|
||||
trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5)
|
||||
else:
|
||||
trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
|
||||
# 3. Calculate the gradients
|
||||
grads_and_var = trainer.compute_gradients(loss, params)
|
||||
grads, var = zip(*grads_and_var)
|
||||
|
||||
if max_grad_norm is not None:
|
||||
# Clip the gradients (normalize)
|
||||
grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
|
||||
grads_and_var = list(zip(grads, var))
|
||||
# zip aggregate each gradient with parameters associated
|
||||
# For instance zip(ABCD, xyza) => Ax, By, Cz, Da
|
||||
|
||||
_train = trainer.apply_gradients(grads_and_var)
|
||||
|
||||
def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
|
||||
# Here we calculate advantage A(s,a) = R + yV(s') - V(s)
|
||||
# Returns = R + yV(s')
|
||||
advs = returns - values
|
||||
|
||||
# Normalize the advantages
|
||||
advs = (advs - advs.mean()) / (advs.std() + 1e-8)
|
||||
td_map = {train_model.X:obs, A:actions, ADV:advs, R:returns, LR:lr,
|
||||
CLIPRANGE:cliprange, OLDNEGLOGPAC:neglogpacs, OLDVPRED:values}
|
||||
if states is not None:
|
||||
td_map[train_model.S] = states
|
||||
td_map[train_model.M] = masks
|
||||
return sess.run(
|
||||
[pg_loss, vf_loss, entropy, approxkl, clipfrac, _train],
|
||||
td_map
|
||||
)[:-1]
|
||||
self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
|
||||
|
||||
|
||||
self.train = train
|
||||
self.train_model = train_model
|
||||
self.act_model = act_model
|
||||
self.step = act_model.step
|
||||
self.value = act_model.value
|
||||
self.initial_state = act_model.initial_state
|
||||
|
||||
self.save = functools.partial(save_variables, sess=sess)
|
||||
self.load = functools.partial(load_variables, sess=sess)
|
||||
|
||||
if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
|
||||
initialize()
|
||||
global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")
|
||||
|
||||
if MPI is not None:
|
||||
sync_from_root(sess, global_variables) #pylint: disable=E1101
|
||||
|
||||
class Runner(AbstractEnvRunner):
|
||||
"""
|
||||
We use this object to make a mini batch of experiences
|
||||
__init__:
|
||||
- Initialize the runner
|
||||
|
||||
run():
|
||||
- Make a mini batch
|
||||
"""
|
||||
def __init__(self, *, env, model, nsteps, gamma, lam):
|
||||
super().__init__(env=env, model=model, nsteps=nsteps)
|
||||
# Lambda used in GAE (General Advantage Estimation)
|
||||
self.lam = lam
|
||||
# Discount rate
|
||||
self.gamma = gamma
|
||||
|
||||
def run(self):
|
||||
# Here, we init the lists that will contain the mb of experiences
|
||||
mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [],[],[],[],[],[]
|
||||
mb_states = self.states
|
||||
epinfos = []
|
||||
# For n in range number of steps
|
||||
for _ in range(self.nsteps):
|
||||
# Given observations, get action value and neglopacs
|
||||
# We already have self.obs because Runner superclass run self.obs[:] = env.reset() on init
|
||||
actions, values, self.states, neglogpacs = self.model.step(self.obs, S=self.states, M=self.dones)
|
||||
mb_obs.append(self.obs.copy())
|
||||
mb_actions.append(actions)
|
||||
mb_values.append(values)
|
||||
mb_neglogpacs.append(neglogpacs)
|
||||
mb_dones.append(self.dones)
|
||||
|
||||
# Take actions in env and look the results
|
||||
# Infos contains a ton of useful informations
|
||||
self.obs[:], rewards, self.dones, infos = self.env.step(actions)
|
||||
for info in infos:
|
||||
maybeepinfo = info.get('episode')
|
||||
if maybeepinfo: epinfos.append(maybeepinfo)
|
||||
mb_rewards.append(rewards)
|
||||
#batch of steps to batch of rollouts
|
||||
mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
|
||||
mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
|
||||
mb_actions = np.asarray(mb_actions)
|
||||
mb_values = np.asarray(mb_values, dtype=np.float32)
|
||||
mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
|
||||
mb_dones = np.asarray(mb_dones, dtype=np.bool)
|
||||
last_values = self.model.value(self.obs, S=self.states, M=self.dones)
|
||||
|
||||
# discount/bootstrap off value fn
|
||||
mb_returns = np.zeros_like(mb_rewards)
|
||||
mb_advs = np.zeros_like(mb_rewards)
|
||||
lastgaelam = 0
|
||||
for t in reversed(range(self.nsteps)):
|
||||
if t == self.nsteps - 1:
|
||||
nextnonterminal = 1.0 - self.dones
|
||||
nextvalues = last_values
|
||||
else:
|
||||
nextnonterminal = 1.0 - mb_dones[t+1]
|
||||
nextvalues = mb_values[t+1]
|
||||
delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t]
|
||||
mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam
|
||||
mb_returns = mb_advs + mb_values
|
||||
return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs)),
|
||||
mb_states, epinfos)
|
||||
# obs, returns, masks, actions, values, neglogpacs, states = runner.run()
|
||||
def sf01(arr):
|
||||
"""
|
||||
swap and then flatten axes 0 and 1
|
||||
"""
|
||||
s = arr.shape
|
||||
return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])
|
||||
|
||||
def constfn(val):
|
||||
def f(_):
|
||||
@@ -230,7 +21,7 @@ def constfn(val):
|
||||
def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4,
|
||||
vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95,
|
||||
log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
|
||||
save_interval=0, load_path=None, **network_kwargs):
|
||||
save_interval=0, load_path=None, model_fn=None, **network_kwargs):
|
||||
'''
|
||||
Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)
|
||||
|
||||
@@ -308,10 +99,14 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2
|
||||
nbatch_train = nbatch // nminibatches
|
||||
|
||||
# Instantiate the model object (that creates act_model and train_model)
|
||||
make_model = lambda : Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
|
||||
if model_fn is None:
|
||||
from baselines.ppo2.model import Model
|
||||
model_fn = Model
|
||||
|
||||
model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
|
||||
nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
|
||||
max_grad_norm=max_grad_norm)
|
||||
model = make_model()
|
||||
|
||||
if load_path is not None:
|
||||
model.load(load_path)
|
||||
# Instantiate the runner object
|
||||
@@ -319,8 +114,6 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2
|
||||
if eval_env is not None:
|
||||
eval_runner = Runner(env = eval_env, model = model, nsteps = nsteps, gamma = gamma, lam= lam)
|
||||
|
||||
|
||||
|
||||
epinfobuf = deque(maxlen=100)
|
||||
if eval_env is not None:
|
||||
eval_epinfobuf = deque(maxlen=100)
|
||||
|
baselines/ppo2/runner.py (new file, 76 lines)
@@ -0,0 +1,76 @@
|
||||
import numpy as np
|
||||
from baselines.common.runners import AbstractEnvRunner
|
||||
|
||||
class Runner(AbstractEnvRunner):
|
||||
"""
|
||||
We use this object to make a mini batch of experiences
|
||||
__init__:
|
||||
- Initialize the runner
|
||||
|
||||
run():
|
||||
- Make a mini batch
|
||||
"""
|
||||
def __init__(self, *, env, model, nsteps, gamma, lam):
|
||||
super().__init__(env=env, model=model, nsteps=nsteps)
|
||||
# Lambda used in GAE (General Advantage Estimation)
|
||||
self.lam = lam
|
||||
# Discount rate
|
||||
self.gamma = gamma
|
||||
|
||||
def run(self):
|
||||
# Here, we init the lists that will contain the mb of experiences
|
||||
mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [],[],[],[],[],[]
|
||||
mb_states = self.states
|
||||
epinfos = []
|
||||
# For n in range number of steps
|
||||
for _ in range(self.nsteps):
|
||||
# Given observations, get action value and neglogpacs
# We already have self.obs because the Runner superclass runs self.obs[:] = env.reset() on init
|
||||
actions, values, self.states, neglogpacs = self.model.step(self.obs, S=self.states, M=self.dones)
|
||||
mb_obs.append(self.obs.copy())
|
||||
mb_actions.append(actions)
|
||||
mb_values.append(values)
|
||||
mb_neglogpacs.append(neglogpacs)
|
||||
mb_dones.append(self.dones)
|
||||
|
||||
# Take actions in env and look at the results
# infos contains a ton of useful information
|
||||
self.obs[:], rewards, self.dones, infos = self.env.step(actions)
|
||||
for info in infos:
|
||||
maybeepinfo = info.get('episode')
|
||||
if maybeepinfo: epinfos.append(maybeepinfo)
|
||||
mb_rewards.append(rewards)
|
||||
#batch of steps to batch of rollouts
|
||||
mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
|
||||
mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
|
||||
mb_actions = np.asarray(mb_actions)
|
||||
mb_values = np.asarray(mb_values, dtype=np.float32)
|
||||
mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
|
||||
mb_dones = np.asarray(mb_dones, dtype=np.bool)
|
||||
last_values = self.model.value(self.obs, S=self.states, M=self.dones)
|
||||
|
||||
# discount/bootstrap off value fn
|
||||
mb_returns = np.zeros_like(mb_rewards)
|
||||
mb_advs = np.zeros_like(mb_rewards)
|
||||
lastgaelam = 0
|
||||
for t in reversed(range(self.nsteps)):
|
||||
if t == self.nsteps - 1:
|
||||
nextnonterminal = 1.0 - self.dones
|
||||
nextvalues = last_values
|
||||
else:
|
||||
nextnonterminal = 1.0 - mb_dones[t+1]
|
||||
nextvalues = mb_values[t+1]
|
||||
delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t]
|
||||
mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam
|
||||
mb_returns = mb_advs + mb_values
|
||||
return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs)),
|
||||
mb_states, epinfos)
|
||||
# obs, returns, masks, actions, values, neglogpacs, states = runner.run()
|
||||
def sf01(arr):
|
||||
"""
|
||||
swap and then flatten axes 0 and 1
|
||||
"""
|
||||
s = arr.shape
|
||||
return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])
|
||||
|
||||
|
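The backward loop in `run()` is the GAE(λ) recursion. Below is a standalone NumPy sketch of the same recursion for a single environment; note it indexes done flags as "done returned by step t", which is equivalent to the `mb_dones[t+1]` / final `self.dones` bookkeeping above, and the toy inputs are arbitrary:

```python
import numpy as np

def gae(rewards, values, last_value, dones, gamma=0.99, lam=0.95):
    """GAE(lambda) for one environment; dones[t] is the flag returned by step t."""
    nsteps = len(rewards)
    advs = np.zeros(nsteps)
    lastgaelam = 0.0
    for t in reversed(range(nsteps)):
        nextnonterminal = 1.0 - dones[t]
        nextvalues = values[t + 1] if t < nsteps - 1 else last_value
        delta = rewards[t] + gamma * nextvalues * nextnonterminal - values[t]
        advs[t] = lastgaelam = delta + gamma * lam * nextnonterminal * lastgaelam
    return advs, advs + values   # advantages and the returns fed to the value loss

# With V == 0 everywhere this reduces to a gamma*lam-discounted sum of future rewards.
advs, returns = gae(rewards=np.ones(4), values=np.zeros(4), last_value=0.0,
                    dones=np.zeros(4))
print(advs)   # [~3.66, ~2.83, ~1.94, 1.0]
```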
baselines/ppo2/test_microbatches.py (new file, 34 lines)
@@ -0,0 +1,34 @@
|
||||
import gym
|
||||
import tensorflow as tf
|
||||
import numpy as np
|
||||
from functools import partial
|
||||
|
||||
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
|
||||
from baselines.common.tf_util import make_session
|
||||
from baselines.ppo2.ppo2 import learn
|
||||
|
||||
from baselines.ppo2.microbatched_model import MicrobatchedModel
|
||||
|
||||
def test_microbatches():
|
||||
def env_fn():
|
||||
env = gym.make('CartPole-v0')
|
||||
env.seed(0)
|
||||
return env
|
||||
|
||||
learn_fn = partial(learn, network='mlp', nsteps=32, total_timesteps=32, seed=0)
|
||||
|
||||
env_ref = DummyVecEnv([env_fn])
|
||||
sess_ref = make_session(make_default=True, graph=tf.Graph())
|
||||
learn_fn(env=env_ref)
|
||||
vars_ref = {v.name: sess_ref.run(v) for v in tf.trainable_variables()}
|
||||
|
||||
env_test = DummyVecEnv([env_fn])
|
||||
sess_test = make_session(make_default=True, graph=tf.Graph())
|
||||
learn_fn(env=env_test, model_fn=partial(MicrobatchedModel, microbatch_size=2))
|
||||
vars_test = {v.name: sess_test.run(v) for v in tf.trainable_variables()}
|
||||
|
||||
for v in vars_ref:
|
||||
np.testing.assert_allclose(vars_ref[v], vars_test[v], atol=1e-3)
|
||||
|
||||
if __name__ == '__main__':
|
||||
test_microbatches()
|
@@ -5,7 +5,7 @@ matplotlib.use('TkAgg') # Can change to 'Agg' for non-interactive mode
|
||||
import matplotlib.pyplot as plt
|
||||
plt.rcParams['svg.fonttype'] = 'none'
|
||||
|
||||
from baselines.bench.monitor import load_results
|
||||
from baselines.common import plot_util
|
||||
|
||||
X_TIMESTEPS = 'timesteps'
|
||||
X_EPISODES = 'episodes'
|
||||
@@ -16,7 +16,7 @@ POSSIBLE_X_AXES = [X_TIMESTEPS, X_EPISODES, X_WALLTIME]
|
||||
EPISODES_WINDOW = 100
|
||||
COLORS = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink',
|
||||
'brown', 'orange', 'teal', 'coral', 'lightblue', 'lime', 'lavender', 'turquoise',
|
||||
'darkgreen', 'tan', 'salmon', 'gold', 'lightpurple', 'darkred', 'darkblue']
|
||||
'darkgreen', 'tan', 'salmon', 'gold', 'darkred', 'darkblue']
|
||||
|
||||
def rolling_window(a, window):
|
||||
shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
|
||||
@@ -50,7 +50,7 @@ def plot_curves(xy_list, xaxis, yaxis, title):
|
||||
maxx = max(xy[0][-1] for xy in xy_list)
|
||||
minx = 0
|
||||
for (i, (x, y)) in enumerate(xy_list):
|
||||
color = COLORS[i]
|
||||
color = COLORS[i % len(COLORS)]
|
||||
plt.scatter(x, y, s=2)
|
||||
x, y_mean = window_func(x, y, EPISODES_WINDOW, np.mean) #So returns average of last EPISODE_WINDOW episodes
|
||||
plt.plot(x, y_mean, color=color)
|
||||
@@ -62,19 +62,18 @@ def plot_curves(xy_list, xaxis, yaxis, title):
|
||||
fig.canvas.mpl_connect('resize_event', lambda event: plt.tight_layout())
|
||||
plt.grid(True)
|
||||
|
||||
def plot_results(dirs, num_timesteps, xaxis, yaxis, task_name):
|
||||
tslist = []
|
||||
for dir in dirs:
|
||||
ts = load_results(dir)
|
||||
ts = ts[ts.l.cumsum() <= num_timesteps]
|
||||
tslist.append(ts)
|
||||
xy_list = [ts2xy(ts, xaxis, yaxis) for ts in tslist]
|
||||
plot_curves(xy_list, xaxis, yaxis, task_name)
|
||||
|
||||
def split_by_task(taskpath):
|
||||
return taskpath['dirname'].split('/')[-1].split('-')[0]
|
||||
|
||||
def plot_results(dirs, num_timesteps=10e6, xaxis=X_TIMESTEPS, yaxis=Y_REWARD, title='', split_fn=split_by_task):
|
||||
results = plot_util.load_results(dirs)
|
||||
plot_util.plot_results(results, xy_fn=lambda r: ts2xy(r['monitor'], xaxis, yaxis), split_fn=split_fn, average_group=True, resample=int(1e6))
|
||||
|
||||
# Example usage in jupyter-notebook
|
||||
# from baselines import log_viewer
|
||||
# from baselines.results_plotter import plot_results
|
||||
# %matplotlib inline
|
||||
# log_viewer.plot_results(["./log"], 10e6, log_viewer.X_TIMESTEPS, "Breakout")
|
||||
# plot_results("./log")
|
||||
# Here ./log is a directory containing the monitor.csv files
|
||||
|
||||
def main():
|
||||
|
@@ -110,7 +110,8 @@ def build_env(args):
|
||||
config.gpu_options.allow_growth = True
|
||||
get_session(config=config)
|
||||
|
||||
env = make_vec_env(env_id, env_type, args.num_env or 1, seed, reward_scale=args.reward_scale)
|
||||
flatten_dict_observations = alg not in {'her'}
|
||||
env = make_vec_env(env_id, env_type, args.num_env or 1, seed, reward_scale=args.reward_scale, flatten_dict_observations=flatten_dict_observations)
|
||||
|
||||
if env_type == 'mujoco':
|
||||
env = VecNormalize(env)
|
||||
@@ -119,6 +120,11 @@ def build_env(args):
|
||||
|
||||
|
||||
def get_env_type(env_id):
|
||||
# Re-parse the gym registry, since we could have new envs since last time.
|
||||
for env in gym.envs.registry.all():
|
||||
env_type = env._entry_point.split(':')[0].split('.')[-1]
|
||||
_game_envs[env_type].add(env.id) # This is a set so add is idempotent
|
||||
|
||||
if env_id in _game_envs.keys():
|
||||
env_type = env_id
|
||||
env_id = [g for g in _game_envs[env_type]][0]
|
||||
@@ -181,13 +187,16 @@ def parse_cmdline_kwargs(args):
|
||||
|
||||
|
||||
|
||||
def main():
|
||||
def main(args):
|
||||
# configure logger, disable logging in child MPI processes (with rank > 0)
|
||||
|
||||
arg_parser = common_arg_parser()
|
||||
args, unknown_args = arg_parser.parse_known_args()
|
||||
args, unknown_args = arg_parser.parse_known_args(args)
|
||||
extra_args = parse_cmdline_kwargs(unknown_args)
|
||||
|
||||
if args.extra_import is not None:
|
||||
import_module(args.extra_import)
|
||||
|
||||
if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
|
||||
rank = 0
|
||||
logger.configure()
|
||||
@@ -206,11 +215,16 @@ def main():
|
||||
logger.log("Running trained model")
|
||||
env = build_env(args)
|
||||
obs = env.reset()
|
||||
def initialize_placeholders(nlstm=128,**kwargs):
|
||||
return np.zeros((args.num_env or 1, 2*nlstm)), np.zeros((1))
|
||||
state, dones = initialize_placeholders(**extra_args)
|
||||
|
||||
state = model.initial_state if hasattr(model, 'initial_state') else None
|
||||
dones = np.zeros((1,))
|
||||
|
||||
while True:
|
||||
actions, _, state, _ = model.step(obs,S=state, M=dones)
|
||||
if state is not None:
|
||||
actions, _, state, _ = model.step(obs,S=state, M=dones)
|
||||
else:
|
||||
actions, _, _, _ = model.step(obs)
|
||||
|
||||
obs, _, done, _ = env.step(actions)
|
||||
env.render()
|
||||
done = done.any() if isinstance(done, np.ndarray) else done
|
||||
@@ -220,5 +234,7 @@ def main():
|
||||
|
||||
env.close()
|
||||
|
||||
return model
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
main(sys.argv)
|
||||
|
docs/viz/viz.ipynb (new file, 808 lines)
File diff suppressed because one or more lines are too long
docs/viz/viz.md (deleted, 136 lines)
@@ -1,136 +0,0 @@
|
||||
# Loading and visualizing results ([colab notebook](https://colab.research.google.com/drive/1Wez1SA9PmNkCoYc8Fvl53bhU3F8OffGm))
|
||||
In order to compare performance of algorithms, we often would like to visualize learning curves (reward as a function of time steps), or some other auxiliary information about learning
|
||||
aggregated into a plot. Baselines repo provides tools for doing so in several different ways, depending on the goal.
|
||||
|
||||
## Preliminaries
|
||||
For all algorithms in baselines, summary data is saved into a folder defined by the logger. By default, a folder `$TMPDIR/openai-<date>-<time>` is used;
you can see the location of the logger directory at the beginning of training in a message like this:
|
||||
|
||||
```
|
||||
Logging to /var/folders/mq/tgrn7bs17s1fnhlwt314b2fm0000gn/T/openai-2018-10-29-15-03-13-537078
|
||||
```
|
||||
The location can be changed by changing `OPENAI_LOGDIR` environment variable; for instance:
|
||||
```bash
|
||||
export OPENAI_LOGDIR=$HOME/logs/cartpole-ppo
|
||||
python -m baselines.run --alg=ppo2 --env=CartPole-v0 --num_timesteps=30000 --nsteps=128
|
||||
```
|
||||
will log data to `~/logs/cartpole-ppo`.
|
||||
|
||||
## Using TensorBoard
|
||||
One of the most straightforward ways to visualize data is to use [TensorBoard](https://www.tensorflow.org/guide/summaries_and_tensorboard). Baselines logger can dump data in tensorboard-compatible format; to
|
||||
set that up, set the environment variable `OPENAI_LOG_FORMAT`
|
||||
```bash
|
||||
export OPENAI_LOG_FORMAT='stdout,log,csv,tensorboard' # formats are comma-separated, but for tensorboard you only really need the last one
|
||||
```
|
||||
And you can now start TensorBoard with:
|
||||
```bash
|
||||
tensorboard --logdir=$OPENAI_LOGDIR
|
||||
```
|
||||
|
||||
## Loading summaries of the results
|
||||
If the summary overview provided by tensorboard is not sufficient, and you would like to either access the raw environment episode data, or use complex post-processing not available in tensorboard, you can load results into python as [pandas](https://pandas.pydata.org/) dataframes.
|
||||
|
||||
For instance, the following snippet:
|
||||
```python
|
||||
from baselines.common import plot_util as pu
|
||||
results = pu.load_results('~/logs/cartpole-ppo')
|
||||
```
|
||||
will search for all folders with baselines-compatible results in `~/logs/cartpole-ppo` and subfolders and
|
||||
return a list of Result objects. Each Result object is a named tuple with the following fields:
|
||||
|
||||
- dirname: str - name of the folder from which data was loaded
|
||||
|
||||
- metadata: dict - dictionary with various metadata (read from metadata.json file)
|
||||
|
||||
- progress: pandas.DataFrame - tabular data saved by logger as a pandas dataframe. Available if csv is in logger formats.
|
||||
|
||||
- monitor: pandas.DataFrame - raw episode data (length, episode reward, timestamp). Available if environment wrapped with [Monitor](../../baselines/bench/monitor.py) wrapper
|
||||
|
||||
## Plotting: single- and few curve plots
|
||||
Once results are loaded, they can be plotted in all conventional means. For example:
|
||||
```python
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
r = results[0]
|
||||
plt.plot(np.cumsum(r.monitor.l), r.monitor.r)
|
||||
```
|
||||
will plot a (very noisy) learning curve for CartPole (assuming we ran the training command for CartPole above). Note the cumulative sum trick to convert the length of each episode into the number of time steps taken so far.
|
||||
|
||||
<img src="https://storage.googleapis.com/baselines/assets/viz/Screen%20Shot%202018-10-29%20at%204.44.46%20PM.png" width="500">
|
||||
|
||||
We can get a smoothed version of the same curve by using the `plot_util.smooth()` function:
|
||||
```python
|
||||
plt.plot(np.cumsum(r.monitor.l), pu.smooth(r.monitor.r, radius=10))
|
||||
```
|
||||
|
||||
<img src="https://storage.googleapis.com/baselines/assets/viz/Screen%20Shot%202018-10-29%20at%204.49.13%20PM.png" width="730">
|
||||
|
||||
We can also get a similar curve by using logger summaries (instead of raw episode data in monitor.csv):
|
||||
```python
|
||||
plt.plot(r.progress.total_timesteps, r.progress.eprewmean)
|
||||
```
|
||||
|
||||
<img src="https://storage.googleapis.com/baselines/assets/viz/Screen%20Shot%202018-10-29%20at%205.04.31%20PM.png" width="730">
|
||||
|
||||
Note, however, that raw episode data is stored by the Monitor wrapper, and hence looks similar for all algorithms, whereas progress data
|
||||
is handled by the algorithm itself, and hence can vary (column names, type of data available) between algorithms.
|
||||
|
||||
## Plotting: many curves
|
||||
While the loading and the plotting functions described above in principle give you access to any slice of the training summaries,
|
||||
sometimes it is necessary to plot and compare many training runs (multiple algorithms, multiple seeds for random number generator),
|
||||
and usage of the functions above can get tedious and messy. For that case, `baselines.common.plot_util` provides convenience function
|
||||
`plot_results` that handles multiple Result objects that need to be routed in multiple plots. Consider the following bash snippet that
|
||||
runs ppo2 with cartpole with 6 different seeds for 30k time steps, first with batch size 32, and then with batch size 128:
|
||||
|
||||
```bash
|
||||
for seed in $(seq 0 5); do
|
||||
OPENAI_LOGDIR=$HOME/logs/cartpole-ppo/b32-$seed python -m baselines.run --alg=ppo2 --env=CartPole-v0 --num_timesteps=3e4 --seed=$seed --nsteps=32
|
||||
done
|
||||
for seed in $(seq 0 5); do
|
||||
OPENAI_LOGDIR=$HOME/logs/cartpole-ppo/b128-$seed python -m baselines.run --alg=ppo2 --env=CartPole-v0 --num_timesteps=3e4 --seed=$seed --nsteps=128
|
||||
done
|
||||
```
|
||||
These 12 runs can be loaded just as before:
|
||||
```python
|
||||
results = pu.load_results('~/logs/cartpole-ppo')
|
||||
```
|
||||
But how do we plot all 12 of them in a sensible manner? `baselines.common.plot_util` module provides `plot_results` function to do just that:
|
||||
```
|
||||
results = results[1:]
|
||||
pu.plot_results(results)
|
||||
```
|
||||
(note that the length of the results list is now 13, due to the data from the previous run stored directly in `~/logs/cartpole-ppo`; we discard the first element for that reason)
|
||||
The results are split into two groups based on batch size, and each group is plotted on a separate graph. More specifically, by default `plot_results` treats the digits after the dash at the end of the directory name as a seed id and groups together the runs that differ only by seed.
|
||||
|
||||
<img src="https://storage.googleapis.com/baselines/assets/viz/Screen%20Shot%202018-10-29%20at%205.53.45%20PM.png" width="700">
|
||||
|
||||
Showing all seeds on the same plot may be somewhat hard to comprehend and analyse. We can instead average over all seeds via the following command:
|
||||
|
||||
<img src="https://storage.googleapis.com/baselines/assets/viz/Screen%20Shot%202018-11-02%20at%204.42.52%20PM.png" width="720">
|
||||
|
||||
The lighter shade shows the standard deviation of the data, and the darker shade shows the
error in the estimate of the mean (that is, the standard deviation divided by the square root of the number of seeds).
|
||||
Note that averaging over seeds requires resampling to a common grid, which, in turn, requires smoothing
|
||||
(using language of signal processing, we need to do low-pass filtering before resampling to avoid aliasing effects).
|
||||
You can change the amount of smoothing by adjusting the `resample` and `smooth_step` arguments to achieve the desired effect.
See the docstring of `plot_util` for more info.
|
||||
|
||||
To plot both groups on the same graph, we can use the following:
|
||||
```python
|
||||
pu.plot_results(results, average_group=True, split_fn=lambda _: '')
|
||||
```
|
||||
Option `split_fn=lambda _: ''` effectively disables splitting, so that all curves end up on the same panel.
|
||||
|
||||
<img src="https://storage.googleapis.com/baselines/assets/viz/Screen%20Shot%202018-11-06%20at%203.11.51%20PM.png" width=720>
|
||||
|
||||
Now, with many groups the overlapping shaded regions may start looking messy. We can disable either
|
||||
light shaded region (corresponding to standard deviation of the curves in the group) or darker shaded region
|
||||
(corresponding to the error in mean estimate) by using `shaded_std=False` or `shaded_err=False` options respectively.
|
||||
For instance,
|
||||
|
||||
```python
|
||||
pu.plot_results(results, average_group=True, split_fn=lambda _: '', shaded_std=False)
|
||||
```
|
||||
produces the following plot:
|
||||
|
||||
<img src="https://storage.googleapis.com/baselines/assets/viz/Screen%20Shot%202018-11-06%20at%203.12.02%20PM.png" width=820>
|
@@ -3,6 +3,5 @@ select = F,E999,W291,W293
|
||||
exclude =
|
||||
.git,
|
||||
__pycache__,
|
||||
baselines/her,
|
||||
baselines/ppo1,
|
||||
baselines/bench,
|
||||
|
setup.py (6 changed lines)
@@ -53,11 +53,11 @@ setup(name='baselines',
|
||||
# ensure there is some tensorflow build with version above 1.4
|
||||
import pkg_resources
|
||||
tf_pkg = None
|
||||
for tf_pkg_name in ['tensorflow', 'tensorflow-gpu']:
|
||||
for tf_pkg_name in ['tensorflow', 'tensorflow-gpu', 'tf-nightly', 'tf-nightly-gpu']:
|
||||
try:
|
||||
tf_pkg = pkg_resources.get_distribution(tf_pkg_name)
|
||||
except pkg_resources.DistributionNotFound:
|
||||
pass
|
||||
assert tf_pkg is not None, 'TensorFlow needed, of version above 1.4'
|
||||
from distutils.version import StrictVersion
|
||||
assert StrictVersion(re.sub(r'-?rc\d+$', '', tf_pkg.version)) >= StrictVersion('1.4.0')
|
||||
from distutils.version import LooseVersion
|
||||
assert LooseVersion(re.sub(r'-?rc\d+$', '', tf_pkg.version)) >= LooseVersion('1.4.0')
|
||||
|
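The switch to `LooseVersion` matters for the nightly packages added above, whose version strings (for example a `dev` suffix) are not accepted by `StrictVersion`. A small sketch of the difference; the version string below is just an example of the nightly format, not taken from this changeset:

```python
from distutils.version import LooseVersion, StrictVersion
import re

version = '1.13.0.dev20181125'   # example nightly-style version string (made up)
stripped = re.sub(r'-?rc\d+$', '', version)
print(LooseVersion(stripped) >= LooseVersion('1.4.0'))   # True

try:
    StrictVersion(stripped)
except ValueError as exc:
    print('StrictVersion rejects it:', exc)
```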