Compare commits
1 Commits
her-fixes
...
simple_ben
Author | SHA1 | Date | |
---|---|---|---|
|
a4fba209c4 |
3
.gitignore
vendored
@@ -1,8 +1,6 @@
|
||||
*.swp
|
||||
*.pyc
|
||||
*.pkl
|
||||
*.py~
|
||||
.pytest_cache
|
||||
.DS_Store
|
||||
.idea
|
||||
|
||||
@@ -35,4 +33,3 @@ src
|
||||
|
||||
MUJOCO_LOG.TXT
|
||||
|
||||
|
||||
|
@@ -15,20 +15,16 @@ pip install -e .
|
||||
```
|
||||
|
||||
- [A2C](baselines/a2c)
|
||||
- [ACER](baselines/acer)
|
||||
- [ACKTR](baselines/acktr)
|
||||
- [DDPG](baselines/ddpg)
|
||||
- [DQN](baselines/deepq)
|
||||
- [GAIL](baselines/gail)
|
||||
- [HER](baselines/her)
|
||||
- [PPO1](baselines/ppo1) (Multi-CPU using MPI)
|
||||
- [PPO2](baselines/ppo2) (Optimized for GPU)
|
||||
- [PPO](baselines/ppo1)
|
||||
- [TRPO](baselines/trpo_mpi)
|
||||
|
||||
To cite this repository in publications:
|
||||
|
||||
@misc{baselines,
|
||||
author = {Dhariwal, Prafulla and Hesse, Christopher and Klimov, Oleg and Nichol, Alex and Plappert, Matthias and Radford, Alec and Schulman, John and Sidor, Szymon and Wu, Yuhuai},
|
||||
author = {Hesse, Christopher and Plappert, Matthias and Radford, Alec and Schulman, John and Sidor, Szymon and Wu, Yuhuai},
|
||||
title = {OpenAI Baselines},
|
||||
year = {2017},
|
||||
publisher = {GitHub},
|
||||
|
@@ -1,4 +1,3 @@
|
||||
import os
|
||||
import os.path as osp
|
||||
import gym
|
||||
import time
|
||||
@@ -11,19 +10,22 @@ from baselines import logger
|
||||
from baselines.common import set_global_seeds, explained_variance
|
||||
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
|
||||
from baselines.common.atari_wrappers import wrap_deepmind
|
||||
from baselines.common import tf_util
|
||||
|
||||
from baselines.a2c.utils import discount_with_dones
|
||||
from baselines.a2c.utils import Scheduler, make_path, find_trainable_variables
|
||||
from baselines.a2c.policies import CnnPolicy
|
||||
from baselines.a2c.utils import cat_entropy, mse
|
||||
|
||||
class Model(object):
|
||||
|
||||
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps,
|
||||
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs,
|
||||
ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
|
||||
alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'):
|
||||
|
||||
sess = tf_util.make_session()
|
||||
config = tf.ConfigProto(allow_soft_placement=True,
|
||||
intra_op_parallelism_threads=num_procs,
|
||||
inter_op_parallelism_threads=num_procs)
|
||||
config.gpu_options.allow_growth = True
|
||||
sess = tf.Session(config=config)
|
||||
nact = ac_space.n
|
||||
nbatch = nenvs*nsteps
|
||||
|
||||
@@ -32,8 +34,8 @@ class Model(object):
|
||||
R = tf.placeholder(tf.float32, [nbatch])
|
||||
LR = tf.placeholder(tf.float32, [])
|
||||
|
||||
step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
|
||||
train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True)
|
||||
step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
|
||||
train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)
|
||||
|
||||
neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
|
||||
pg_loss = tf.reduce_mean(ADV * neglogpac)
|
||||
@@ -56,7 +58,7 @@ class Model(object):
|
||||
for step in range(len(obs)):
|
||||
cur_lr = lr.value()
|
||||
td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr}
|
||||
if states is not None:
|
||||
if states != []:
|
||||
td_map[train_model.S] = states
|
||||
td_map[train_model.M] = masks
|
||||
policy_loss, value_loss, policy_entropy, _ = sess.run(
|
||||
@@ -67,7 +69,7 @@ class Model(object):
|
||||
|
||||
def save(save_path):
|
||||
ps = sess.run(params)
|
||||
make_path(osp.dirname(save_path))
|
||||
make_path(save_path)
|
||||
joblib.dump(ps, save_path)
|
||||
|
||||
def load(load_path):
|
||||
@@ -89,25 +91,32 @@ class Model(object):
|
||||
|
||||
class Runner(object):
|
||||
|
||||
def __init__(self, env, model, nsteps=5, gamma=0.99):
|
||||
def __init__(self, env, model, nsteps=5, nstack=4, gamma=0.99):
|
||||
self.env = env
|
||||
self.model = model
|
||||
nh, nw, nc = env.observation_space.shape
|
||||
nenv = env.num_envs
|
||||
self.batch_ob_shape = (nenv*nsteps, nh, nw, nc)
|
||||
self.obs = np.zeros((nenv, nh, nw, nc), dtype=np.uint8)
|
||||
self.batch_ob_shape = (nenv*nsteps, nh, nw, nc*nstack)
|
||||
self.obs = np.zeros((nenv, nh, nw, nc*nstack), dtype=np.uint8)
|
||||
self.nc = nc
|
||||
obs = env.reset()
|
||||
self.update_obs(obs)
|
||||
self.gamma = gamma
|
||||
self.nsteps = nsteps
|
||||
self.states = model.initial_state
|
||||
self.dones = [False for _ in range(nenv)]
|
||||
|
||||
def update_obs(self, obs):
|
||||
# Do frame-stacking here instead of the FrameStack wrapper to reduce
|
||||
# IPC overhead
|
||||
self.obs = np.roll(self.obs, shift=-self.nc, axis=3)
|
||||
self.obs[:, :, :, -self.nc:] = obs
|
||||
|
||||
def run(self):
|
||||
mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
|
||||
mb_states = self.states
|
||||
for n in range(self.nsteps):
|
||||
actions, values, states, _ = self.model.step(self.obs, self.states, self.dones)
|
||||
actions, values, states = self.model.step(self.obs, self.states, self.dones)
|
||||
mb_obs.append(np.copy(self.obs))
|
||||
mb_actions.append(actions)
|
||||
mb_values.append(values)
|
||||
@@ -118,7 +127,7 @@ class Runner(object):
|
||||
for n, done in enumerate(dones):
|
||||
if done:
|
||||
self.obs[n] = self.obs[n]*0
|
||||
self.obs = obs
|
||||
self.update_obs(obs)
|
||||
mb_rewards.append(rewards)
|
||||
mb_dones.append(self.dones)
|
||||
#batch of steps to batch of rollouts
|
||||
@@ -145,16 +154,17 @@ class Runner(object):
|
||||
mb_masks = mb_masks.flatten()
|
||||
return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
|
||||
|
||||
def learn(policy, env, seed, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100):
|
||||
def learn(policy, env, seed, nsteps=5, nstack=4, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100):
|
||||
tf.reset_default_graph()
|
||||
set_global_seeds(seed)
|
||||
|
||||
nenvs = env.num_envs
|
||||
ob_space = env.observation_space
|
||||
ac_space = env.action_space
|
||||
model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
|
||||
num_procs = len(env.remotes) # HACK
|
||||
model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, nstack=nstack, num_procs=num_procs, ent_coef=ent_coef, vf_coef=vf_coef,
|
||||
max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule)
|
||||
runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
|
||||
runner = Runner(env, model, nsteps=nsteps, nstack=nstack, gamma=gamma)
|
||||
|
||||
nbatch = nenvs*nsteps
|
||||
tstart = time.time()
|
||||
@@ -173,3 +183,6 @@ def learn(policy, env, seed, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5, e
|
||||
logger.record_tabular("explained_variance", float(ev))
|
||||
logger.dump_tabular()
|
||||
env.close()
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
@@ -1,48 +1,36 @@
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm
|
||||
from baselines.common.distributions import make_pdtype
|
||||
|
||||
def nature_cnn(unscaled_images):
|
||||
"""
|
||||
CNN from Nature paper.
|
||||
"""
|
||||
scaled_images = tf.cast(unscaled_images, tf.float32) / 255.
|
||||
activ = tf.nn.relu
|
||||
h = activ(conv(scaled_images, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2)))
|
||||
h2 = activ(conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2)))
|
||||
h3 = activ(conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2)))
|
||||
h3 = conv_to_fc(h3)
|
||||
return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2)))
|
||||
from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm, sample
|
||||
|
||||
class LnLstmPolicy(object):
|
||||
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
|
||||
nenv = nbatch // nsteps
|
||||
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=256, reuse=False):
|
||||
nbatch = nenv*nsteps
|
||||
nh, nw, nc = ob_space.shape
|
||||
ob_shape = (nbatch, nh, nw, nc)
|
||||
ob_shape = (nbatch, nh, nw, nc*nstack)
|
||||
nact = ac_space.n
|
||||
X = tf.placeholder(tf.uint8, ob_shape) #obs
|
||||
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
|
||||
S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
|
||||
with tf.variable_scope("model", reuse=reuse):
|
||||
h = nature_cnn(X)
|
||||
xs = batch_to_seq(h, nenv, nsteps)
|
||||
h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
|
||||
h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
|
||||
h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
|
||||
h3 = conv_to_fc(h3)
|
||||
h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
|
||||
xs = batch_to_seq(h4, nenv, nsteps)
|
||||
ms = batch_to_seq(M, nenv, nsteps)
|
||||
h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
|
||||
h5 = seq_to_batch(h5)
|
||||
pi = fc(h5, 'pi', nact)
|
||||
vf = fc(h5, 'v', 1)
|
||||
|
||||
self.pdtype = make_pdtype(ac_space)
|
||||
self.pd = self.pdtype.pdfromflat(pi)
|
||||
pi = fc(h5, 'pi', nact, act=lambda x:x)
|
||||
vf = fc(h5, 'v', 1, act=lambda x:x)
|
||||
|
||||
v0 = vf[:, 0]
|
||||
a0 = self.pd.sample()
|
||||
neglogp0 = self.pd.neglogp(a0)
|
||||
a0 = sample(pi)
|
||||
self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
|
||||
|
||||
def step(ob, state, mask):
|
||||
return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})
|
||||
a, v, s = sess.run([a0, v0, snew], {X:ob, S:state, M:mask})
|
||||
return a, v, s
|
||||
|
||||
def value(ob, state, mask):
|
||||
return sess.run(v0, {X:ob, S:state, M:mask})
|
||||
@@ -57,34 +45,34 @@ class LnLstmPolicy(object):
|
||||
|
||||
class LstmPolicy(object):
|
||||
|
||||
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
|
||||
nenv = nbatch // nsteps
|
||||
|
||||
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=256, reuse=False):
|
||||
nbatch = nenv*nsteps
|
||||
nh, nw, nc = ob_space.shape
|
||||
ob_shape = (nbatch, nh, nw, nc)
|
||||
ob_shape = (nbatch, nh, nw, nc*nstack)
|
||||
nact = ac_space.n
|
||||
X = tf.placeholder(tf.uint8, ob_shape) #obs
|
||||
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
|
||||
S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
|
||||
with tf.variable_scope("model", reuse=reuse):
|
||||
h = nature_cnn(X)
|
||||
xs = batch_to_seq(h, nenv, nsteps)
|
||||
h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
|
||||
h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
|
||||
h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
|
||||
h3 = conv_to_fc(h3)
|
||||
h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
|
||||
xs = batch_to_seq(h4, nenv, nsteps)
|
||||
ms = batch_to_seq(M, nenv, nsteps)
|
||||
h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
|
||||
h5 = seq_to_batch(h5)
|
||||
pi = fc(h5, 'pi', nact)
|
||||
vf = fc(h5, 'v', 1)
|
||||
|
||||
self.pdtype = make_pdtype(ac_space)
|
||||
self.pd = self.pdtype.pdfromflat(pi)
|
||||
pi = fc(h5, 'pi', nact, act=lambda x:x)
|
||||
vf = fc(h5, 'v', 1, act=lambda x:x)
|
||||
|
||||
v0 = vf[:, 0]
|
||||
a0 = self.pd.sample()
|
||||
neglogp0 = self.pd.neglogp(a0)
|
||||
a0 = sample(pi)
|
||||
self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
|
||||
|
||||
def step(ob, state, mask):
|
||||
return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})
|
||||
a, v, s = sess.run([a0, v0, snew], {X:ob, S:state, M:mask})
|
||||
return a, v, s
|
||||
|
||||
def value(ob, state, mask):
|
||||
return sess.run(v0, {X:ob, S:state, M:mask})
|
||||
@@ -99,67 +87,31 @@ class LstmPolicy(object):
|
||||
|
||||
class CnnPolicy(object):
|
||||
|
||||
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
|
||||
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
|
||||
nbatch = nenv*nsteps
|
||||
nh, nw, nc = ob_space.shape
|
||||
ob_shape = (nbatch, nh, nw, nc)
|
||||
ob_shape = (nbatch, nh, nw, nc*nstack)
|
||||
nact = ac_space.n
|
||||
X = tf.placeholder(tf.uint8, ob_shape) #obs
|
||||
with tf.variable_scope("model", reuse=reuse):
|
||||
h = nature_cnn(X)
|
||||
pi = fc(h, 'pi', nact, init_scale=0.01)
|
||||
vf = fc(h, 'v', 1)[:,0]
|
||||
h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
|
||||
h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
|
||||
h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
|
||||
h3 = conv_to_fc(h3)
|
||||
h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
|
||||
pi = fc(h4, 'pi', nact, act=lambda x:x)
|
||||
vf = fc(h4, 'v', 1, act=lambda x:x)
|
||||
|
||||
self.pdtype = make_pdtype(ac_space)
|
||||
self.pd = self.pdtype.pdfromflat(pi)
|
||||
|
||||
a0 = self.pd.sample()
|
||||
neglogp0 = self.pd.neglogp(a0)
|
||||
self.initial_state = None
|
||||
v0 = vf[:, 0]
|
||||
a0 = sample(pi)
|
||||
self.initial_state = [] #not stateful
|
||||
|
||||
def step(ob, *_args, **_kwargs):
|
||||
a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
|
||||
return a, v, self.initial_state, neglogp
|
||||
a, v = sess.run([a0, v0], {X:ob})
|
||||
return a, v, [] #dummy state
|
||||
|
||||
def value(ob, *_args, **_kwargs):
|
||||
return sess.run(vf, {X:ob})
|
||||
|
||||
self.X = X
|
||||
self.pi = pi
|
||||
self.vf = vf
|
||||
self.step = step
|
||||
self.value = value
|
||||
|
||||
class MlpPolicy(object):
|
||||
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
|
||||
ob_shape = (nbatch,) + ob_space.shape
|
||||
actdim = ac_space.shape[0]
|
||||
X = tf.placeholder(tf.float32, ob_shape, name='Ob') #obs
|
||||
with tf.variable_scope("model", reuse=reuse):
|
||||
activ = tf.tanh
|
||||
h1 = activ(fc(X, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
|
||||
h2 = activ(fc(h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
|
||||
pi = fc(h2, 'pi', actdim, init_scale=0.01)
|
||||
h1 = activ(fc(X, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
|
||||
h2 = activ(fc(h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
|
||||
vf = fc(h2, 'vf', 1)[:,0]
|
||||
logstd = tf.get_variable(name="logstd", shape=[1, actdim],
|
||||
initializer=tf.zeros_initializer())
|
||||
|
||||
pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)
|
||||
|
||||
self.pdtype = make_pdtype(ac_space)
|
||||
self.pd = self.pdtype.pdfromflat(pdparam)
|
||||
|
||||
a0 = self.pd.sample()
|
||||
neglogp0 = self.pd.neglogp(a0)
|
||||
self.initial_state = None
|
||||
|
||||
def step(ob, *_args, **_kwargs):
|
||||
a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
|
||||
return a, v, self.initial_state, neglogp
|
||||
|
||||
def value(ob, *_args, **_kwargs):
|
||||
return sess.run(vf, {X:ob})
|
||||
return sess.run(v0, {X:ob})
|
||||
|
||||
self.X = X
|
||||
self.pi = pi
|
||||
|
@@ -1,30 +1,45 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
#!/usr/bin/env python
|
||||
import os, logging, gym
|
||||
from baselines import logger
|
||||
from baselines.common.cmd_util import make_atari_env, atari_arg_parser
|
||||
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
|
||||
from baselines.common import set_global_seeds
|
||||
from baselines import bench
|
||||
from baselines.a2c.a2c import learn
|
||||
from baselines.ppo2.policies import CnnPolicy, LstmPolicy, LnLstmPolicy
|
||||
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
|
||||
from baselines.common.atari_wrappers import make_atari, wrap_deepmind
|
||||
from baselines.a2c.policies import CnnPolicy, LstmPolicy, LnLstmPolicy
|
||||
|
||||
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env):
|
||||
def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu):
|
||||
def make_env(rank):
|
||||
def _thunk():
|
||||
env = make_atari(env_id)
|
||||
env.seed(seed + rank)
|
||||
env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
|
||||
gym.logger.setLevel(logging.WARN)
|
||||
return wrap_deepmind(env)
|
||||
return _thunk
|
||||
set_global_seeds(seed)
|
||||
env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
|
||||
if policy == 'cnn':
|
||||
policy_fn = CnnPolicy
|
||||
elif policy == 'lstm':
|
||||
policy_fn = LstmPolicy
|
||||
elif policy == 'lnlstm':
|
||||
policy_fn = LnLstmPolicy
|
||||
env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
|
||||
learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule)
|
||||
env.close()
|
||||
|
||||
def main():
|
||||
parser = atari_arg_parser()
|
||||
import argparse
|
||||
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
|
||||
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
|
||||
parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm'], default='cnn')
|
||||
parser.add_argument('--lrschedule', help='Learning rate schedule', choices=['constant', 'linear'], default='constant')
|
||||
parser.add_argument('--num-timesteps', type=int, default=int(10e6))
|
||||
args = parser.parse_args()
|
||||
logger.configure()
|
||||
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed,
|
||||
policy=args.policy, lrschedule=args.lrschedule, num_env=16)
|
||||
policy=args.policy, lrschedule=args.lrschedule, num_cpu=16)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
@@ -39,31 +39,23 @@ def ortho_init(scale=1.0):
|
||||
return (scale * q[:shape[0], :shape[1]]).astype(np.float32)
|
||||
return _ortho_init
|
||||
|
||||
def conv(x, scope, *, nf, rf, stride, pad='VALID', init_scale=1.0, data_format='NHWC'):
|
||||
if data_format == 'NHWC':
|
||||
channel_ax = 3
|
||||
strides = [1, stride, stride, 1]
|
||||
bshape = [1, 1, 1, nf]
|
||||
elif data_format == 'NCHW':
|
||||
channel_ax = 1
|
||||
strides = [1, 1, stride, stride]
|
||||
bshape = [1, nf, 1, 1]
|
||||
else:
|
||||
raise NotImplementedError
|
||||
nin = x.get_shape()[channel_ax].value
|
||||
wshape = [rf, rf, nin, nf]
|
||||
def conv(x, scope, nf, rf, stride, pad='VALID', act=tf.nn.relu, init_scale=1.0):
|
||||
with tf.variable_scope(scope):
|
||||
w = tf.get_variable("w", wshape, initializer=ortho_init(init_scale))
|
||||
b = tf.get_variable("b", [1, nf, 1, 1], initializer=tf.constant_initializer(0.0))
|
||||
if data_format == 'NHWC': b = tf.reshape(b, bshape)
|
||||
return b + tf.nn.conv2d(x, w, strides=strides, padding=pad, data_format=data_format)
|
||||
nin = x.get_shape()[3].value
|
||||
w = tf.get_variable("w", [rf, rf, nin, nf], initializer=ortho_init(init_scale))
|
||||
b = tf.get_variable("b", [nf], initializer=tf.constant_initializer(0.0))
|
||||
z = tf.nn.conv2d(x, w, strides=[1, stride, stride, 1], padding=pad)+b
|
||||
h = act(z)
|
||||
return h
|
||||
|
||||
def fc(x, scope, nh, *, init_scale=1.0, init_bias=0.0):
|
||||
def fc(x, scope, nh, act=tf.nn.relu, init_scale=1.0):
|
||||
with tf.variable_scope(scope):
|
||||
nin = x.get_shape()[1].value
|
||||
w = tf.get_variable("w", [nin, nh], initializer=ortho_init(init_scale))
|
||||
b = tf.get_variable("b", [nh], initializer=tf.constant_initializer(init_bias))
|
||||
return tf.matmul(x, w)+b
|
||||
b = tf.get_variable("b", [nh], initializer=tf.constant_initializer(0.0))
|
||||
z = tf.matmul(x, w)+b
|
||||
h = act(z)
|
||||
return h
|
||||
|
||||
def batch_to_seq(h, nbatch, nsteps, flat=False):
|
||||
if flat:
|
||||
@@ -170,34 +162,9 @@ def constant(p):
|
||||
def linear(p):
|
||||
return 1-p
|
||||
|
||||
def middle_drop(p):
|
||||
eps = 0.75
|
||||
if 1-p<eps:
|
||||
return eps*0.1
|
||||
return 1-p
|
||||
|
||||
def double_linear_con(p):
|
||||
p *= 2
|
||||
eps = 0.125
|
||||
if 1-p<eps:
|
||||
return eps
|
||||
return 1-p
|
||||
|
||||
def double_middle_drop(p):
|
||||
eps1 = 0.75
|
||||
eps2 = 0.25
|
||||
if 1-p<eps1:
|
||||
if 1-p<eps2:
|
||||
return eps2*0.5
|
||||
return eps1*0.1
|
||||
return 1-p
|
||||
|
||||
schedules = {
|
||||
'linear':linear,
|
||||
'constant':constant,
|
||||
'double_linear_con': double_linear_con,
|
||||
'middle_drop': middle_drop,
|
||||
'double_middle_drop': double_middle_drop
|
||||
'constant':constant
|
||||
}
|
||||
|
||||
class Scheduler(object):
|
||||
@@ -271,7 +238,7 @@ def check_shape(ts,shapes):
|
||||
def avg_norm(t):
|
||||
return tf.reduce_mean(tf.sqrt(tf.reduce_sum(tf.square(t), axis=-1)))
|
||||
|
||||
def gradient_add(g1, g2, param):
|
||||
def myadd(g1, g2, param):
|
||||
print([g1, g2, param.name])
|
||||
assert (not (g1 is None and g2 is None)), param.name
|
||||
if g1 is None:
|
||||
@@ -281,7 +248,7 @@ def gradient_add(g1, g2, param):
|
||||
else:
|
||||
return g1 + g2
|
||||
|
||||
def q_explained_variance(qpred, q):
|
||||
def my_explained_variance(qpred, q):
|
||||
_, vary = tf.nn.moments(q, axes=[0, 1])
|
||||
_, varpred = tf.nn.moments(q - qpred, axes=[0, 1])
|
||||
check_shape([vary, varpred], [[]] * 2)
|
||||
|
@@ -1,4 +0,0 @@
|
||||
# ACER
|
||||
|
||||
- Original paper: https://arxiv.org/abs/1611.01224
|
||||
- `python -m baselines.acer.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options.
|
@@ -1,349 +0,0 @@
|
||||
import time
|
||||
import joblib
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from baselines import logger
|
||||
|
||||
from baselines.common import set_global_seeds
|
||||
|
||||
from baselines.a2c.utils import batch_to_seq, seq_to_batch
|
||||
from baselines.a2c.utils import Scheduler, make_path, find_trainable_variables
|
||||
from baselines.a2c.utils import cat_entropy_softmax
|
||||
from baselines.a2c.utils import EpisodeStats
|
||||
from baselines.a2c.utils import get_by_index, check_shape, avg_norm, gradient_add, q_explained_variance
|
||||
from baselines.acer.buffer import Buffer
|
||||
|
||||
# remove last step
|
||||
def strip(var, nenvs, nsteps, flat = False):
|
||||
vars = batch_to_seq(var, nenvs, nsteps + 1, flat)
|
||||
return seq_to_batch(vars[:-1], flat)
|
||||
|
||||
def q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma):
|
||||
"""
|
||||
Calculates q_retrace targets
|
||||
|
||||
:param R: Rewards
|
||||
:param D: Dones
|
||||
:param q_i: Q values for actions taken
|
||||
:param v: V values
|
||||
:param rho_i: Importance weight for each action
|
||||
:return: Q_retrace values
|
||||
"""
|
||||
rho_bar = batch_to_seq(tf.minimum(1.0, rho_i), nenvs, nsteps, True) # list of len steps, shape [nenvs]
|
||||
rs = batch_to_seq(R, nenvs, nsteps, True) # list of len steps, shape [nenvs]
|
||||
ds = batch_to_seq(D, nenvs, nsteps, True) # list of len steps, shape [nenvs]
|
||||
q_is = batch_to_seq(q_i, nenvs, nsteps, True)
|
||||
vs = batch_to_seq(v, nenvs, nsteps + 1, True)
|
||||
v_final = vs[-1]
|
||||
qret = v_final
|
||||
qrets = []
|
||||
for i in range(nsteps - 1, -1, -1):
|
||||
check_shape([qret, ds[i], rs[i], rho_bar[i], q_is[i], vs[i]], [[nenvs]] * 6)
|
||||
qret = rs[i] + gamma * qret * (1.0 - ds[i])
|
||||
qrets.append(qret)
|
||||
qret = (rho_bar[i] * (qret - q_is[i])) + vs[i]
|
||||
qrets = qrets[::-1]
|
||||
qret = seq_to_batch(qrets, flat=True)
|
||||
return qret
|
||||
|
||||
# For ACER with PPO clipping instead of trust region
|
||||
# def clip(ratio, eps_clip):
|
||||
# # assume 0 <= eps_clip <= 1
|
||||
# return tf.minimum(1 + eps_clip, tf.maximum(1 - eps_clip, ratio))
|
||||
|
||||
class Model(object):
|
||||
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs,
|
||||
ent_coef, q_coef, gamma, max_grad_norm, lr,
|
||||
rprop_alpha, rprop_epsilon, total_timesteps, lrschedule,
|
||||
c, trust_region, alpha, delta):
|
||||
config = tf.ConfigProto(allow_soft_placement=True,
|
||||
intra_op_parallelism_threads=num_procs,
|
||||
inter_op_parallelism_threads=num_procs)
|
||||
sess = tf.Session(config=config)
|
||||
nact = ac_space.n
|
||||
nbatch = nenvs * nsteps
|
||||
|
||||
A = tf.placeholder(tf.int32, [nbatch]) # actions
|
||||
D = tf.placeholder(tf.float32, [nbatch]) # dones
|
||||
R = tf.placeholder(tf.float32, [nbatch]) # rewards, not returns
|
||||
MU = tf.placeholder(tf.float32, [nbatch, nact]) # mu's
|
||||
LR = tf.placeholder(tf.float32, [])
|
||||
eps = 1e-6
|
||||
|
||||
step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
|
||||
train_model = policy(sess, ob_space, ac_space, nenvs, nsteps + 1, nstack, reuse=True)
|
||||
|
||||
params = find_trainable_variables("model")
|
||||
print("Params {}".format(len(params)))
|
||||
for var in params:
|
||||
print(var)
|
||||
|
||||
# create polyak averaged model
|
||||
ema = tf.train.ExponentialMovingAverage(alpha)
|
||||
ema_apply_op = ema.apply(params)
|
||||
|
||||
def custom_getter(getter, *args, **kwargs):
|
||||
v = ema.average(getter(*args, **kwargs))
|
||||
print(v.name)
|
||||
return v
|
||||
|
||||
with tf.variable_scope("", custom_getter=custom_getter, reuse=True):
|
||||
polyak_model = policy(sess, ob_space, ac_space, nenvs, nsteps + 1, nstack, reuse=True)
|
||||
|
||||
# Notation: (var) = batch variable, (var)s = seqeuence variable, (var)_i = variable index by action at step i
|
||||
v = tf.reduce_sum(train_model.pi * train_model.q, axis = -1) # shape is [nenvs * (nsteps + 1)]
|
||||
|
||||
# strip off last step
|
||||
f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [train_model.pi, polyak_model.pi, train_model.q])
|
||||
# Get pi and q values for actions taken
|
||||
f_i = get_by_index(f, A)
|
||||
q_i = get_by_index(q, A)
|
||||
|
||||
# Compute ratios for importance truncation
|
||||
rho = f / (MU + eps)
|
||||
rho_i = get_by_index(rho, A)
|
||||
|
||||
# Calculate Q_retrace targets
|
||||
qret = q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma)
|
||||
|
||||
# Calculate losses
|
||||
# Entropy
|
||||
entropy = tf.reduce_mean(cat_entropy_softmax(f))
|
||||
|
||||
# Policy Graident loss, with truncated importance sampling & bias correction
|
||||
v = strip(v, nenvs, nsteps, True)
|
||||
check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4)
|
||||
check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 2)
|
||||
|
||||
# Truncated importance sampling
|
||||
adv = qret - v
|
||||
logf = tf.log(f_i + eps)
|
||||
gain_f = logf * tf.stop_gradient(adv * tf.minimum(c, rho_i)) # [nenvs * nsteps]
|
||||
loss_f = -tf.reduce_mean(gain_f)
|
||||
|
||||
# Bias correction for the truncation
|
||||
adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1])) # [nenvs * nsteps, nact]
|
||||
logf_bc = tf.log(f + eps) # / (f_old + eps)
|
||||
check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]]*2)
|
||||
gain_bc = tf.reduce_sum(logf_bc * tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f), axis = 1) #IMP: This is sum, as expectation wrt f
|
||||
loss_bc= -tf.reduce_mean(gain_bc)
|
||||
|
||||
loss_policy = loss_f + loss_bc
|
||||
|
||||
# Value/Q function loss, and explained variance
|
||||
check_shape([qret, q_i], [[nenvs * nsteps]]*2)
|
||||
ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]), tf.reshape(qret, [nenvs, nsteps]))
|
||||
loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i)*0.5)
|
||||
|
||||
# Net loss
|
||||
check_shape([loss_policy, loss_q, entropy], [[]] * 3)
|
||||
loss = loss_policy + q_coef * loss_q - ent_coef * entropy
|
||||
|
||||
if trust_region:
|
||||
g = tf.gradients(- (loss_policy - ent_coef * entropy) * nsteps * nenvs, f) #[nenvs * nsteps, nact]
|
||||
# k = tf.gradients(KL(f_pol || f), f)
|
||||
k = - f_pol / (f + eps) #[nenvs * nsteps, nact] # Directly computed gradient of KL divergence wrt f
|
||||
k_dot_g = tf.reduce_sum(k * g, axis=-1)
|
||||
adj = tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) / (tf.reduce_sum(tf.square(k), axis=-1) + eps)) #[nenvs * nsteps]
|
||||
|
||||
# Calculate stats (before doing adjustment) for logging.
|
||||
avg_norm_k = avg_norm(k)
|
||||
avg_norm_g = avg_norm(g)
|
||||
avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g))
|
||||
avg_norm_adj = tf.reduce_mean(tf.abs(adj))
|
||||
|
||||
g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k
|
||||
grads_f = -g/(nenvs*nsteps) # These are turst region adjusted gradients wrt f ie statistics of policy pi
|
||||
grads_policy = tf.gradients(f, params, grads_f)
|
||||
grads_q = tf.gradients(loss_q * q_coef, params)
|
||||
grads = [gradient_add(g1, g2, param) for (g1, g2, param) in zip(grads_policy, grads_q, params)]
|
||||
|
||||
avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs)
|
||||
norm_grads_q = tf.global_norm(grads_q)
|
||||
norm_grads_policy = tf.global_norm(grads_policy)
|
||||
else:
|
||||
grads = tf.gradients(loss, params)
|
||||
|
||||
if max_grad_norm is not None:
|
||||
grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm)
|
||||
grads = list(zip(grads, params))
|
||||
trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=rprop_alpha, epsilon=rprop_epsilon)
|
||||
_opt_op = trainer.apply_gradients(grads)
|
||||
|
||||
# so when you call _train, you first do the gradient step, then you apply ema
|
||||
with tf.control_dependencies([_opt_op]):
|
||||
_train = tf.group(ema_apply_op)
|
||||
|
||||
lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)
|
||||
|
||||
# Ops/Summaries to run, and their names for logging
|
||||
run_ops = [_train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev, norm_grads]
|
||||
names_ops = ['loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance',
|
||||
'norm_grads']
|
||||
if trust_region:
|
||||
run_ops = run_ops + [norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, avg_norm_k_dot_g,
|
||||
avg_norm_adj]
|
||||
names_ops = names_ops + ['norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g',
|
||||
'avg_norm_k_dot_g', 'avg_norm_adj']
|
||||
|
||||
def train(obs, actions, rewards, dones, mus, states, masks, steps):
|
||||
cur_lr = lr.value_steps(steps)
|
||||
td_map = {train_model.X: obs, polyak_model.X: obs, A: actions, R: rewards, D: dones, MU: mus, LR: cur_lr}
|
||||
if states != []:
|
||||
td_map[train_model.S] = states
|
||||
td_map[train_model.M] = masks
|
||||
td_map[polyak_model.S] = states
|
||||
td_map[polyak_model.M] = masks
|
||||
return names_ops, sess.run(run_ops, td_map)[1:] # strip off _train
|
||||
|
||||
def save(save_path):
|
||||
ps = sess.run(params)
|
||||
make_path(osp.dirname(save_path))
|
||||
joblib.dump(ps, save_path)
|
||||
|
||||
self.train = train
|
||||
self.save = save
|
||||
self.train_model = train_model
|
||||
self.step_model = step_model
|
||||
self.step = step_model.step
|
||||
self.initial_state = step_model.initial_state
|
||||
tf.global_variables_initializer().run(session=sess)
|
||||
|
||||
class Runner(object):
|
||||
def __init__(self, env, model, nsteps, nstack):
|
||||
self.env = env
|
||||
self.nstack = nstack
|
||||
self.model = model
|
||||
nh, nw, nc = env.observation_space.shape
|
||||
self.nc = nc # nc = 1 for atari, but just in case
|
||||
self.nenv = nenv = env.num_envs
|
||||
self.nact = env.action_space.n
|
||||
self.nbatch = nenv * nsteps
|
||||
self.batch_ob_shape = (nenv*(nsteps+1), nh, nw, nc*nstack)
|
||||
self.obs = np.zeros((nenv, nh, nw, nc * nstack), dtype=np.uint8)
|
||||
obs = env.reset()
|
||||
self.update_obs(obs)
|
||||
self.nsteps = nsteps
|
||||
self.states = model.initial_state
|
||||
self.dones = [False for _ in range(nenv)]
|
||||
|
||||
def update_obs(self, obs, dones=None):
|
||||
if dones is not None:
|
||||
self.obs *= (1 - dones.astype(np.uint8))[:, None, None, None]
|
||||
self.obs = np.roll(self.obs, shift=-self.nc, axis=3)
|
||||
self.obs[:, :, :, -self.nc:] = obs[:, :, :, :]
|
||||
|
||||
def run(self):
|
||||
enc_obs = np.split(self.obs, self.nstack, axis=3) # so now list of obs steps
|
||||
mb_obs, mb_actions, mb_mus, mb_dones, mb_rewards = [], [], [], [], []
|
||||
for _ in range(self.nsteps):
|
||||
actions, mus, states = self.model.step(self.obs, state=self.states, mask=self.dones)
|
||||
mb_obs.append(np.copy(self.obs))
|
||||
mb_actions.append(actions)
|
||||
mb_mus.append(mus)
|
||||
mb_dones.append(self.dones)
|
||||
obs, rewards, dones, _ = self.env.step(actions)
|
||||
# states information for statefull models like LSTM
|
||||
self.states = states
|
||||
self.dones = dones
|
||||
self.update_obs(obs, dones)
|
||||
mb_rewards.append(rewards)
|
||||
enc_obs.append(obs)
|
||||
mb_obs.append(np.copy(self.obs))
|
||||
mb_dones.append(self.dones)
|
||||
|
||||
enc_obs = np.asarray(enc_obs, dtype=np.uint8).swapaxes(1, 0)
|
||||
mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0)
|
||||
mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
|
||||
mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
|
||||
mb_mus = np.asarray(mb_mus, dtype=np.float32).swapaxes(1, 0)
|
||||
|
||||
mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
|
||||
|
||||
mb_masks = mb_dones # Used for statefull models like LSTM's to mask state when done
|
||||
mb_dones = mb_dones[:, 1:] # Used for calculating returns. The dones array is now aligned with rewards
|
||||
|
||||
# shapes are now [nenv, nsteps, []]
|
||||
# When pulling from buffer, arrays will now be reshaped in place, preventing a deep copy.
|
||||
|
||||
return enc_obs, mb_obs, mb_actions, mb_rewards, mb_mus, mb_dones, mb_masks
|
||||
|
||||
class Acer():
|
||||
def __init__(self, runner, model, buffer, log_interval):
|
||||
self.runner = runner
|
||||
self.model = model
|
||||
self.buffer = buffer
|
||||
self.log_interval = log_interval
|
||||
self.tstart = None
|
||||
self.episode_stats = EpisodeStats(runner.nsteps, runner.nenv)
|
||||
self.steps = None
|
||||
|
||||
def call(self, on_policy):
|
||||
runner, model, buffer, steps = self.runner, self.model, self.buffer, self.steps
|
||||
if on_policy:
|
||||
enc_obs, obs, actions, rewards, mus, dones, masks = runner.run()
|
||||
self.episode_stats.feed(rewards, dones)
|
||||
if buffer is not None:
|
||||
buffer.put(enc_obs, actions, rewards, mus, dones, masks)
|
||||
else:
|
||||
# get obs, actions, rewards, mus, dones from buffer.
|
||||
obs, actions, rewards, mus, dones, masks = buffer.get()
|
||||
|
||||
# reshape stuff correctly
|
||||
obs = obs.reshape(runner.batch_ob_shape)
|
||||
actions = actions.reshape([runner.nbatch])
|
||||
rewards = rewards.reshape([runner.nbatch])
|
||||
mus = mus.reshape([runner.nbatch, runner.nact])
|
||||
dones = dones.reshape([runner.nbatch])
|
||||
masks = masks.reshape([runner.batch_ob_shape[0]])
|
||||
|
||||
names_ops, values_ops = model.train(obs, actions, rewards, dones, mus, model.initial_state, masks, steps)
|
||||
|
||||
if on_policy and (int(steps/runner.nbatch) % self.log_interval == 0):
|
||||
logger.record_tabular("total_timesteps", steps)
|
||||
logger.record_tabular("fps", int(steps/(time.time() - self.tstart)))
|
||||
# IMP: In EpisodicLife env, during training, we get done=True at each loss of life, not just at the terminal state.
|
||||
# Thus, this is mean until end of life, not end of episode.
|
||||
# For true episode rewards, see the monitor files in the log folder.
|
||||
logger.record_tabular("mean_episode_length", self.episode_stats.mean_length())
|
||||
logger.record_tabular("mean_episode_reward", self.episode_stats.mean_reward())
|
||||
for name, val in zip(names_ops, values_ops):
|
||||
logger.record_tabular(name, float(val))
|
||||
logger.dump_tabular()
|
||||
|
||||
|
||||
def learn(policy, env, seed, nsteps=20, nstack=4, total_timesteps=int(80e6), q_coef=0.5, ent_coef=0.01,
|
||||
max_grad_norm=10, lr=7e-4, lrschedule='linear', rprop_epsilon=1e-5, rprop_alpha=0.99, gamma=0.99,
|
||||
log_interval=100, buffer_size=50000, replay_ratio=4, replay_start=10000, c=10.0,
|
||||
trust_region=True, alpha=0.99, delta=1):
|
||||
print("Running Acer Simple")
|
||||
print(locals())
|
||||
tf.reset_default_graph()
|
||||
set_global_seeds(seed)
|
||||
|
||||
nenvs = env.num_envs
|
||||
ob_space = env.observation_space
|
||||
ac_space = env.action_space
|
||||
num_procs = len(env.remotes) # HACK
|
||||
model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, nstack=nstack,
|
||||
num_procs=num_procs, ent_coef=ent_coef, q_coef=q_coef, gamma=gamma,
|
||||
max_grad_norm=max_grad_norm, lr=lr, rprop_alpha=rprop_alpha, rprop_epsilon=rprop_epsilon,
|
||||
total_timesteps=total_timesteps, lrschedule=lrschedule, c=c,
|
||||
trust_region=trust_region, alpha=alpha, delta=delta)
|
||||
|
||||
runner = Runner(env=env, model=model, nsteps=nsteps, nstack=nstack)
|
||||
if replay_ratio > 0:
|
||||
buffer = Buffer(env=env, nsteps=nsteps, nstack=nstack, size=buffer_size)
|
||||
else:
|
||||
buffer = None
|
||||
nbatch = nenvs*nsteps
|
||||
acer = Acer(runner, model, buffer, log_interval)
|
||||
acer.tstart = time.time()
|
||||
for acer.steps in range(0, total_timesteps, nbatch): #nbatch samples, 1 on_policy call and multiple off-policy calls
|
||||
acer.call(on_policy=True)
|
||||
if replay_ratio > 0 and buffer.has_atleast(replay_start):
|
||||
n = np.random.poisson(replay_ratio)
|
||||
for _ in range(n):
|
||||
acer.call(on_policy=False) # no simulation steps in this
|
||||
|
||||
env.close()
|
@@ -1,103 +0,0 @@
|
||||
import numpy as np
|
||||
|
||||
class Buffer(object):
|
||||
# gets obs, actions, rewards, mu's, (states, masks), dones
|
||||
def __init__(self, env, nsteps, nstack, size=50000):
|
||||
self.nenv = env.num_envs
|
||||
self.nsteps = nsteps
|
||||
self.nh, self.nw, self.nc = env.observation_space.shape
|
||||
self.nstack = nstack
|
||||
self.nbatch = self.nenv * self.nsteps
|
||||
self.size = size // (self.nsteps) # Each loc contains nenv * nsteps frames, thus total buffer is nenv * size frames
|
||||
|
||||
# Memory
|
||||
self.enc_obs = None
|
||||
self.actions = None
|
||||
self.rewards = None
|
||||
self.mus = None
|
||||
self.dones = None
|
||||
self.masks = None
|
||||
|
||||
# Size indexes
|
||||
self.next_idx = 0
|
||||
self.num_in_buffer = 0
|
||||
|
||||
def has_atleast(self, frames):
|
||||
# Frames per env, so total (nenv * frames) Frames needed
|
||||
# Each buffer loc has nenv * nsteps frames
|
||||
return self.num_in_buffer >= (frames // self.nsteps)
|
||||
|
||||
def can_sample(self):
|
||||
return self.num_in_buffer > 0
|
||||
|
||||
# Generate stacked frames
|
||||
def decode(self, enc_obs, dones):
|
||||
# enc_obs has shape [nenvs, nsteps + nstack, nh, nw, nc]
|
||||
# dones has shape [nenvs, nsteps, nh, nw, nc]
|
||||
# returns stacked obs of shape [nenv, (nsteps + 1), nh, nw, nstack*nc]
|
||||
nstack, nenv, nsteps, nh, nw, nc = self.nstack, self.nenv, self.nsteps, self.nh, self.nw, self.nc
|
||||
y = np.empty([nsteps + nstack - 1, nenv, 1, 1, 1], dtype=np.float32)
|
||||
obs = np.zeros([nstack, nsteps + nstack, nenv, nh, nw, nc], dtype=np.uint8)
|
||||
x = np.reshape(enc_obs, [nenv, nsteps + nstack, nh, nw, nc]).swapaxes(1,
|
||||
0) # [nsteps + nstack, nenv, nh, nw, nc]
|
||||
y[3:] = np.reshape(1.0 - dones, [nenv, nsteps, 1, 1, 1]).swapaxes(1, 0) # keep
|
||||
y[:3] = 1.0
|
||||
# y = np.reshape(1 - dones, [nenvs, nsteps, 1, 1, 1])
|
||||
for i in range(nstack):
|
||||
obs[-(i + 1), i:] = x
|
||||
# obs[:,i:,:,:,-(i+1),:] = x
|
||||
x = x[:-1] * y
|
||||
y = y[1:]
|
||||
return np.reshape(obs[:, 3:].transpose((2, 1, 3, 4, 0, 5)), [nenv, (nsteps + 1), nh, nw, nstack * nc])
|
||||
|
||||
def put(self, enc_obs, actions, rewards, mus, dones, masks):
|
||||
# enc_obs [nenv, (nsteps + nstack), nh, nw, nc]
|
||||
# actions, rewards, dones [nenv, nsteps]
|
||||
# mus [nenv, nsteps, nact]
|
||||
|
||||
if self.enc_obs is None:
|
||||
self.enc_obs = np.empty([self.size] + list(enc_obs.shape), dtype=np.uint8)
|
||||
self.actions = np.empty([self.size] + list(actions.shape), dtype=np.int32)
|
||||
self.rewards = np.empty([self.size] + list(rewards.shape), dtype=np.float32)
|
||||
self.mus = np.empty([self.size] + list(mus.shape), dtype=np.float32)
|
||||
self.dones = np.empty([self.size] + list(dones.shape), dtype=np.bool)
|
||||
self.masks = np.empty([self.size] + list(masks.shape), dtype=np.bool)
|
||||
|
||||
self.enc_obs[self.next_idx] = enc_obs
|
||||
self.actions[self.next_idx] = actions
|
||||
self.rewards[self.next_idx] = rewards
|
||||
self.mus[self.next_idx] = mus
|
||||
self.dones[self.next_idx] = dones
|
||||
self.masks[self.next_idx] = masks
|
||||
|
||||
self.next_idx = (self.next_idx + 1) % self.size
|
||||
self.num_in_buffer = min(self.size, self.num_in_buffer + 1)
|
||||
|
||||
def take(self, x, idx, envx):
|
||||
nenv = self.nenv
|
||||
out = np.empty([nenv] + list(x.shape[2:]), dtype=x.dtype)
|
||||
for i in range(nenv):
|
||||
out[i] = x[idx[i], envx[i]]
|
||||
return out
|
||||
|
||||
def get(self):
|
||||
# returns
|
||||
# obs [nenv, (nsteps + 1), nh, nw, nstack*nc]
|
||||
# actions, rewards, dones [nenv, nsteps]
|
||||
# mus [nenv, nsteps, nact]
|
||||
nenv = self.nenv
|
||||
assert self.can_sample()
|
||||
|
||||
# Sample exactly one id per env. If you sample across envs, then higher correlation in samples from same env.
|
||||
idx = np.random.randint(0, self.num_in_buffer, nenv)
|
||||
envx = np.arange(nenv)
|
||||
|
||||
take = lambda x: self.take(x, idx, envx) # for i in range(nenv)], axis = 0)
|
||||
dones = take(self.dones)
|
||||
enc_obs = take(self.enc_obs)
|
||||
obs = self.decode(enc_obs, dones)
|
||||
actions = take(self.actions)
|
||||
rewards = take(self.rewards)
|
||||
mus = take(self.mus)
|
||||
masks = take(self.masks)
|
||||
return obs, actions, rewards, mus, dones, masks
|
@@ -1,79 +0,0 @@
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from baselines.ppo2.policies import nature_cnn
|
||||
from baselines.a2c.utils import fc, batch_to_seq, seq_to_batch, lstm, sample
|
||||
|
||||
|
||||
class AcerCnnPolicy(object):
|
||||
|
||||
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
|
||||
nbatch = nenv * nsteps
|
||||
nh, nw, nc = ob_space.shape
|
||||
ob_shape = (nbatch, nh, nw, nc * nstack)
|
||||
nact = ac_space.n
|
||||
X = tf.placeholder(tf.uint8, ob_shape) # obs
|
||||
with tf.variable_scope("model", reuse=reuse):
|
||||
h = nature_cnn(X)
|
||||
pi_logits = fc(h, 'pi', nact, init_scale=0.01)
|
||||
pi = tf.nn.softmax(pi_logits)
|
||||
q = fc(h, 'q', nact)
|
||||
|
||||
a = sample(pi_logits) # could change this to use self.pi instead
|
||||
self.initial_state = [] # not stateful
|
||||
self.X = X
|
||||
self.pi = pi # actual policy params now
|
||||
self.q = q
|
||||
|
||||
def step(ob, *args, **kwargs):
|
||||
# returns actions, mus, states
|
||||
a0, pi0 = sess.run([a, pi], {X: ob})
|
||||
return a0, pi0, [] # dummy state
|
||||
|
||||
def out(ob, *args, **kwargs):
|
||||
pi0, q0 = sess.run([pi, q], {X: ob})
|
||||
return pi0, q0
|
||||
|
||||
def act(ob, *args, **kwargs):
|
||||
return sess.run(a, {X: ob})
|
||||
|
||||
self.step = step
|
||||
self.out = out
|
||||
self.act = act
|
||||
|
||||
class AcerLstmPolicy(object):
|
||||
|
||||
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256):
|
||||
nbatch = nenv * nsteps
|
||||
nh, nw, nc = ob_space.shape
|
||||
ob_shape = (nbatch, nh, nw, nc * nstack)
|
||||
nact = ac_space.n
|
||||
X = tf.placeholder(tf.uint8, ob_shape) # obs
|
||||
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
|
||||
S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
|
||||
with tf.variable_scope("model", reuse=reuse):
|
||||
h = nature_cnn(X)
|
||||
|
||||
# lstm
|
||||
xs = batch_to_seq(h, nenv, nsteps)
|
||||
ms = batch_to_seq(M, nenv, nsteps)
|
||||
h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
|
||||
h5 = seq_to_batch(h5)
|
||||
|
||||
pi_logits = fc(h5, 'pi', nact, init_scale=0.01)
|
||||
pi = tf.nn.softmax(pi_logits)
|
||||
q = fc(h5, 'q', nact)
|
||||
|
||||
a = sample(pi_logits) # could change this to use self.pi instead
|
||||
self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
|
||||
self.X = X
|
||||
self.M = M
|
||||
self.S = S
|
||||
self.pi = pi # actual policy params now
|
||||
self.q = q
|
||||
|
||||
def step(ob, state, mask, *args, **kwargs):
|
||||
# returns actions, mus, states
|
||||
a0, pi0, s = sess.run([a, pi, snew], {X: ob, S: state, M: mask})
|
||||
return a0, pi0, s
|
||||
|
||||
self.step = step
|
@@ -1,30 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
from baselines import logger
|
||||
from baselines.acer.acer_simple import learn
|
||||
from baselines.acer.policies import AcerCnnPolicy, AcerLstmPolicy
|
||||
from baselines.common.cmd_util import make_atari_env, atari_arg_parser
|
||||
|
||||
def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu):
|
||||
env = make_atari_env(env_id, num_cpu, seed)
|
||||
if policy == 'cnn':
|
||||
policy_fn = AcerCnnPolicy
|
||||
elif policy == 'lstm':
|
||||
policy_fn = AcerLstmPolicy
|
||||
else:
|
||||
print("Policy {} not implemented".format(policy))
|
||||
return
|
||||
learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule)
|
||||
env.close()
|
||||
|
||||
def main():
|
||||
parser = atari_arg_parser()
|
||||
parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm'], default='cnn')
|
||||
parser.add_argument('--lrschedule', help='Learning rate schedule', choices=['constant', 'linear'], default='constant')
|
||||
parser.add_argument('--logdir', help ='Directory for logging')
|
||||
args = parser.parse_args()
|
||||
logger.configure(args.logdir)
|
||||
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed,
|
||||
policy=args.policy, lrschedule=args.lrschedule, num_cpu=16)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
@@ -1,10 +1,10 @@
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from baselines import logger
|
||||
import baselines.common as common
|
||||
from baselines import common
|
||||
from baselines.common import tf_util as U
|
||||
from baselines.acktr import kfac
|
||||
from baselines.common.filters import ZFilter
|
||||
from baselines.acktr.filters import ZFilter
|
||||
|
||||
def pathlength(path):
|
||||
return path["reward"].shape[0]# Loss function that we'll differentiate to get the policy gradient
|
||||
@@ -70,7 +70,7 @@ def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps,
|
||||
coord = tf.train.Coordinator()
|
||||
for qr in [q_runner, vf.q_runner]:
|
||||
assert (qr != None)
|
||||
enqueue_threads.extend(qr.create_threads(tf.get_default_session(), coord=coord, start=True))
|
||||
enqueue_threads.extend(qr.create_threads(U.get_session(), coord=coord, start=True))
|
||||
|
||||
i = 0
|
||||
timesteps_so_far = 0
|
||||
@@ -122,10 +122,10 @@ def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps,
|
||||
kl = policy.compute_kl(ob_no, oldac_dist)
|
||||
if kl > desired_kl * 2:
|
||||
logger.log("kl too high")
|
||||
tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval()
|
||||
U.eval(tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)))
|
||||
elif kl < desired_kl / 2:
|
||||
logger.log("kl too low")
|
||||
tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval()
|
||||
U.eval(tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)))
|
||||
else:
|
||||
logger.log("kl just right!")
|
||||
|
||||
|
@@ -7,17 +7,16 @@ from baselines import logger
|
||||
|
||||
from baselines.common import set_global_seeds, explained_variance
|
||||
|
||||
from baselines.a2c.a2c import Runner
|
||||
from baselines.a2c.utils import discount_with_dones
|
||||
from baselines.a2c.utils import Scheduler, find_trainable_variables
|
||||
from baselines.a2c.utils import cat_entropy, mse
|
||||
from baselines.acktr.utils import discount_with_dones
|
||||
from baselines.acktr.utils import Scheduler, find_trainable_variables
|
||||
from baselines.acktr.utils import cat_entropy, mse
|
||||
from baselines.acktr import kfac
|
||||
|
||||
|
||||
class Model(object):
|
||||
|
||||
def __init__(self, policy, ob_space, ac_space, nenvs,total_timesteps, nprocs=32, nsteps=20,
|
||||
ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
|
||||
nstack=4, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
|
||||
kfac_clip=0.001, lrschedule='linear'):
|
||||
config = tf.ConfigProto(allow_soft_placement=True,
|
||||
intra_op_parallelism_threads=nprocs,
|
||||
@@ -32,8 +31,8 @@ class Model(object):
|
||||
PG_LR = tf.placeholder(tf.float32, [])
|
||||
VF_LR = tf.placeholder(tf.float32, [])
|
||||
|
||||
self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
|
||||
self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True)
|
||||
self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
|
||||
self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)
|
||||
|
||||
logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
|
||||
self.logits = logits = train_model.pi
|
||||
@@ -72,7 +71,7 @@ class Model(object):
|
||||
cur_lr = self.lr.value()
|
||||
|
||||
td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, PG_LR:cur_lr}
|
||||
if states is not None:
|
||||
if states != []:
|
||||
td_map[train_model.S] = states
|
||||
td_map[train_model.M] = masks
|
||||
|
||||
@@ -105,8 +104,70 @@ class Model(object):
|
||||
self.initial_state = step_model.initial_state
|
||||
tf.global_variables_initializer().run(session=sess)
|
||||
|
||||
class Runner(object):
|
||||
|
||||
def __init__(self, env, model, nsteps, nstack, gamma):
|
||||
self.env = env
|
||||
self.model = model
|
||||
nh, nw, nc = env.observation_space.shape
|
||||
nenv = env.num_envs
|
||||
self.batch_ob_shape = (nenv*nsteps, nh, nw, nc*nstack)
|
||||
self.obs = np.zeros((nenv, nh, nw, nc*nstack), dtype=np.uint8)
|
||||
obs = env.reset()
|
||||
self.update_obs(obs)
|
||||
self.gamma = gamma
|
||||
self.nsteps = nsteps
|
||||
self.states = model.initial_state
|
||||
self.dones = [False for _ in range(nenv)]
|
||||
|
||||
def update_obs(self, obs):
|
||||
self.obs = np.roll(self.obs, shift=-1, axis=3)
|
||||
self.obs[:, :, :, -1] = obs[:, :, :, 0]
|
||||
|
||||
def run(self):
|
||||
mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
|
||||
mb_states = self.states
|
||||
for n in range(self.nsteps):
|
||||
actions, values, states = self.model.step(self.obs, self.states, self.dones)
|
||||
mb_obs.append(np.copy(self.obs))
|
||||
mb_actions.append(actions)
|
||||
mb_values.append(values)
|
||||
mb_dones.append(self.dones)
|
||||
obs, rewards, dones, _ = self.env.step(actions)
|
||||
self.states = states
|
||||
self.dones = dones
|
||||
for n, done in enumerate(dones):
|
||||
if done:
|
||||
self.obs[n] = self.obs[n]*0
|
||||
self.update_obs(obs)
|
||||
mb_rewards.append(rewards)
|
||||
mb_dones.append(self.dones)
|
||||
#batch of steps to batch of rollouts
|
||||
mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
|
||||
mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
|
||||
mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
|
||||
mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
|
||||
mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
|
||||
mb_masks = mb_dones[:, :-1]
|
||||
mb_dones = mb_dones[:, 1:]
|
||||
last_values = self.model.value(self.obs, self.states, self.dones).tolist()
|
||||
#discount/bootstrap off value fn
|
||||
for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
|
||||
rewards = rewards.tolist()
|
||||
dones = dones.tolist()
|
||||
if dones[-1] == 0:
|
||||
rewards = discount_with_dones(rewards+[value], dones+[0], self.gamma)[:-1]
|
||||
else:
|
||||
rewards = discount_with_dones(rewards, dones, self.gamma)
|
||||
mb_rewards[n] = rewards
|
||||
mb_rewards = mb_rewards.flatten()
|
||||
mb_actions = mb_actions.flatten()
|
||||
mb_values = mb_values.flatten()
|
||||
mb_masks = mb_masks.flatten()
|
||||
return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
|
||||
|
||||
def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20,
|
||||
ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
|
||||
nstack=4, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
|
||||
kfac_clip=0.001, save_interval=None, lrschedule='linear'):
|
||||
tf.reset_default_graph()
|
||||
set_global_seeds(seed)
|
||||
@@ -115,7 +176,7 @@ def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval
|
||||
ob_space = env.observation_space
|
||||
ac_space = env.action_space
|
||||
make_model = lambda : Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs, nsteps
|
||||
=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, vf_fisher_coef=
|
||||
=nsteps, nstack=nstack, ent_coef=ent_coef, vf_coef=vf_coef, vf_fisher_coef=
|
||||
vf_fisher_coef, lr=lr, max_grad_norm=max_grad_norm, kfac_clip=kfac_clip,
|
||||
lrschedule=lrschedule)
|
||||
if save_interval and logger.get_dir():
|
||||
@@ -124,7 +185,7 @@ def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval
|
||||
fh.write(cloudpickle.dumps(make_model))
|
||||
model = make_model()
|
||||
|
||||
runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
|
||||
runner = Runner(env, model, nsteps=nsteps, nstack=nstack, gamma=gamma)
|
||||
nbatch = nenvs*nsteps
|
||||
tstart = time.time()
|
||||
coord = tf.train.Coordinator()
|
||||
|
@@ -1,4 +1,4 @@
|
||||
from .running_stat import RunningStat
|
||||
from baselines.acktr.running_stat import RunningStat
|
||||
from collections import deque
|
||||
import numpy as np
|
||||
|
||||
|
@@ -228,7 +228,7 @@ class KfacOptimizer():
|
||||
Ow = bpropFactor.get_shape()[2]
|
||||
if Oh == 1 and Ow == 1 and self._channel_fac:
|
||||
# factorization along the channels
|
||||
# assume independence between input channels and spatial
|
||||
# assume independence bewteen input channels and spatial
|
||||
# 2K-1 x 2K-1 covariance matrix and C x C covariance matrix
|
||||
# factorization along the channels do not
|
||||
# support homogeneous coordinate, assnBias
|
||||
|
@@ -1,55 +1,93 @@
|
||||
import tensorflow as tf
|
||||
import numpy as np
|
||||
|
||||
|
||||
def gmatmul(a, b, transpose_a=False, transpose_b=False, reduce_dim=None):
|
||||
assert reduce_dim is not None
|
||||
if reduce_dim == None:
|
||||
# general batch matmul
|
||||
if len(a.get_shape()) == 3 and len(b.get_shape()) == 3:
|
||||
return tf.batch_matmul(a, b, adj_x=transpose_a, adj_y=transpose_b)
|
||||
elif len(a.get_shape()) == 3 and len(b.get_shape()) == 2:
|
||||
if transpose_b:
|
||||
N = b.get_shape()[0].value
|
||||
else:
|
||||
N = b.get_shape()[1].value
|
||||
B = a.get_shape()[0].value
|
||||
if transpose_a:
|
||||
K = a.get_shape()[1].value
|
||||
a = tf.reshape(tf.transpose(a, [0, 2, 1]), [-1, K])
|
||||
else:
|
||||
K = a.get_shape()[-1].value
|
||||
a = tf.reshape(a, [-1, K])
|
||||
result = tf.matmul(a, b, transpose_b=transpose_b)
|
||||
result = tf.reshape(result, [B, -1, N])
|
||||
return result
|
||||
elif len(a.get_shape()) == 2 and len(b.get_shape()) == 3:
|
||||
if transpose_a:
|
||||
M = a.get_shape()[1].value
|
||||
else:
|
||||
M = a.get_shape()[0].value
|
||||
B = b.get_shape()[0].value
|
||||
if transpose_b:
|
||||
K = b.get_shape()[-1].value
|
||||
b = tf.transpose(tf.reshape(b, [-1, K]), [1, 0])
|
||||
else:
|
||||
K = b.get_shape()[1].value
|
||||
b = tf.transpose(tf.reshape(
|
||||
tf.transpose(b, [0, 2, 1]), [-1, K]), [1, 0])
|
||||
result = tf.matmul(a, b, transpose_a=transpose_a)
|
||||
result = tf.transpose(tf.reshape(result, [M, B, -1]), [1, 0, 2])
|
||||
return result
|
||||
else:
|
||||
return tf.matmul(a, b, transpose_a=transpose_a, transpose_b=transpose_b)
|
||||
else:
|
||||
# weird batch matmul
|
||||
if len(a.get_shape()) == 2 and len(b.get_shape()) > 2:
|
||||
# reshape reduce_dim to the left most dim in b
|
||||
b_shape = b.get_shape()
|
||||
if reduce_dim != 0:
|
||||
b_dims = list(range(len(b_shape)))
|
||||
b_dims.remove(reduce_dim)
|
||||
b_dims.insert(0, reduce_dim)
|
||||
b = tf.transpose(b, b_dims)
|
||||
b_t_shape = b.get_shape()
|
||||
b = tf.reshape(b, [int(b_shape[reduce_dim]), -1])
|
||||
result = tf.matmul(a, b, transpose_a=transpose_a,
|
||||
transpose_b=transpose_b)
|
||||
result = tf.reshape(result, b_t_shape)
|
||||
if reduce_dim != 0:
|
||||
b_dims = list(range(len(b_shape)))
|
||||
b_dims.remove(0)
|
||||
b_dims.insert(reduce_dim, 0)
|
||||
result = tf.transpose(result, b_dims)
|
||||
return result
|
||||
|
||||
# weird batch matmul
|
||||
if len(a.get_shape()) == 2 and len(b.get_shape()) > 2:
|
||||
# reshape reduce_dim to the left most dim in b
|
||||
b_shape = b.get_shape()
|
||||
if reduce_dim != 0:
|
||||
b_dims = list(range(len(b_shape)))
|
||||
b_dims.remove(reduce_dim)
|
||||
b_dims.insert(0, reduce_dim)
|
||||
b = tf.transpose(b, b_dims)
|
||||
b_t_shape = b.get_shape()
|
||||
b = tf.reshape(b, [int(b_shape[reduce_dim]), -1])
|
||||
result = tf.matmul(a, b, transpose_a=transpose_a,
|
||||
transpose_b=transpose_b)
|
||||
result = tf.reshape(result, b_t_shape)
|
||||
if reduce_dim != 0:
|
||||
b_dims = list(range(len(b_shape)))
|
||||
b_dims.remove(0)
|
||||
b_dims.insert(reduce_dim, 0)
|
||||
result = tf.transpose(result, b_dims)
|
||||
return result
|
||||
elif len(a.get_shape()) > 2 and len(b.get_shape()) == 2:
|
||||
# reshape reduce_dim to the right most dim in a
|
||||
a_shape = a.get_shape()
|
||||
outter_dim = len(a_shape) - 1
|
||||
reduce_dim = len(a_shape) - reduce_dim - 1
|
||||
if reduce_dim != outter_dim:
|
||||
a_dims = list(range(len(a_shape)))
|
||||
a_dims.remove(reduce_dim)
|
||||
a_dims.insert(outter_dim, reduce_dim)
|
||||
a = tf.transpose(a, a_dims)
|
||||
a_t_shape = a.get_shape()
|
||||
a = tf.reshape(a, [-1, int(a_shape[reduce_dim])])
|
||||
result = tf.matmul(a, b, transpose_a=transpose_a,
|
||||
transpose_b=transpose_b)
|
||||
result = tf.reshape(result, a_t_shape)
|
||||
if reduce_dim != outter_dim:
|
||||
a_dims = list(range(len(a_shape)))
|
||||
a_dims.remove(outter_dim)
|
||||
a_dims.insert(reduce_dim, outter_dim)
|
||||
result = tf.transpose(result, a_dims)
|
||||
return result
|
||||
|
||||
elif len(a.get_shape()) > 2 and len(b.get_shape()) == 2:
|
||||
# reshape reduce_dim to the right most dim in a
|
||||
a_shape = a.get_shape()
|
||||
outter_dim = len(a_shape) - 1
|
||||
reduce_dim = len(a_shape) - reduce_dim - 1
|
||||
if reduce_dim != outter_dim:
|
||||
a_dims = list(range(len(a_shape)))
|
||||
a_dims.remove(reduce_dim)
|
||||
a_dims.insert(outter_dim, reduce_dim)
|
||||
a = tf.transpose(a, a_dims)
|
||||
a_t_shape = a.get_shape()
|
||||
a = tf.reshape(a, [-1, int(a_shape[reduce_dim])])
|
||||
result = tf.matmul(a, b, transpose_a=transpose_a,
|
||||
transpose_b=transpose_b)
|
||||
result = tf.reshape(result, a_t_shape)
|
||||
if reduce_dim != outter_dim:
|
||||
a_dims = list(range(len(a_shape)))
|
||||
a_dims.remove(outter_dim)
|
||||
a_dims.insert(reduce_dim, outter_dim)
|
||||
result = tf.transpose(result, a_dims)
|
||||
return result
|
||||
elif len(a.get_shape()) == 2 and len(b.get_shape()) == 2:
|
||||
return tf.matmul(a, b, transpose_a=transpose_a, transpose_b=transpose_b)
|
||||
|
||||
elif len(a.get_shape()) == 2 and len(b.get_shape()) == 2:
|
||||
return tf.matmul(a, b, transpose_a=transpose_a, transpose_b=transpose_b)
|
||||
|
||||
assert False, 'something went wrong'
|
||||
assert False, 'something went wrong'
|
||||
|
||||
|
||||
def clipoutNeg(vec, threshold=1e-6):
|
||||
|
@@ -1,8 +1,43 @@
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from baselines.acktr.utils import dense, kl_div
|
||||
from baselines.acktr.utils import conv, fc, dense, conv_to_fc, sample, kl_div
|
||||
import baselines.common.tf_util as U
|
||||
|
||||
class CnnPolicy(object):
|
||||
|
||||
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
|
||||
nbatch = nenv*nsteps
|
||||
nh, nw, nc = ob_space.shape
|
||||
ob_shape = (nbatch, nh, nw, nc*nstack)
|
||||
nact = ac_space.n
|
||||
X = tf.placeholder(tf.uint8, ob_shape) #obs
|
||||
with tf.variable_scope("model", reuse=reuse):
|
||||
h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
|
||||
h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
|
||||
h3 = conv(h2, 'c3', nf=32, rf=3, stride=1, init_scale=np.sqrt(2))
|
||||
h3 = conv_to_fc(h3)
|
||||
h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
|
||||
pi = fc(h4, 'pi', nact, act=lambda x:x)
|
||||
vf = fc(h4, 'v', 1, act=lambda x:x)
|
||||
|
||||
v0 = vf[:, 0]
|
||||
a0 = sample(pi)
|
||||
self.initial_state = [] #not stateful
|
||||
|
||||
def step(ob, *_args, **_kwargs):
|
||||
a, v = sess.run([a0, v0], {X:ob})
|
||||
return a, v, [] #dummy state
|
||||
|
||||
def value(ob, *_args, **_kwargs):
|
||||
return sess.run(v0, {X:ob})
|
||||
|
||||
self.X = X
|
||||
self.pi = pi
|
||||
self.vf = vf
|
||||
self.step = step
|
||||
self.value = value
|
||||
|
||||
|
||||
class GaussianMlpPolicy(object):
|
||||
def __init__(self, ob_dim, ac_dim):
|
||||
# Here we'll construct a bunch of expressions, which will be used in two places:
|
||||
@@ -25,12 +60,12 @@ class GaussianMlpPolicy(object):
|
||||
std_na = tf.tile(std_1a, [tf.shape(mean_na)[0], 1])
|
||||
ac_dist = tf.concat([tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(std_na, [-1, ac_dim])], 1)
|
||||
sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:,ac_dim:])) * ac_dist[:,ac_dim:] + ac_dist[:,:ac_dim] # This is the sampled action we'll perform.
|
||||
logprobsampled_n = - tf.reduce_sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * tf.reduce_sum(tf.square(ac_dist[:,:ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of sampled action
|
||||
logprob_n = - tf.reduce_sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * tf.reduce_sum(tf.square(ac_dist[:,:ac_dim] - oldac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy)
|
||||
kl = tf.reduce_mean(kl_div(oldac_dist, ac_dist, ac_dim))
|
||||
#kl = .5 * tf.reduce_mean(tf.square(logprob_n - oldlogprob_n)) # Approximation of KL divergence between old policy used to generate actions, and new policy used to compute logprob_n
|
||||
surr = - tf.reduce_mean(adv_n * logprob_n) # Loss function that we'll differentiate to get the policy gradient
|
||||
surr_sampled = - tf.reduce_mean(logprob_n) # Sampled loss of the policy
|
||||
logprobsampled_n = - U.sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * U.sum(tf.square(ac_dist[:,:ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of sampled action
|
||||
logprob_n = - U.sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * U.sum(tf.square(ac_dist[:,:ac_dim] - oldac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy)
|
||||
kl = U.mean(kl_div(oldac_dist, ac_dist, ac_dim))
|
||||
#kl = .5 * U.mean(tf.square(logprob_n - oldlogprob_n)) # Approximation of KL divergence between old policy used to generate actions, and new policy used to compute logprob_n
|
||||
surr = - U.mean(adv_n * logprob_n) # Loss function that we'll differentiate to get the policy gradient
|
||||
surr_sampled = - U.mean(logprob_n) # Sampled loss of the policy
|
||||
self._act = U.function([ob_no], [sampled_ac_na, ac_dist, logprobsampled_n]) # Generate a new action and its logprob
|
||||
#self.compute_kl = U.function([ob_no, oldac_na, oldlogprob_n], kl) # Compute (approximate) KL divergence between old policy and new policy
|
||||
self.compute_kl = U.function([ob_no, oldac_dist], kl)
|
||||
|
@@ -1,21 +1,38 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
#!/usr/bin/env python
|
||||
import os, logging, gym
|
||||
from baselines import logger
|
||||
from baselines.common import set_global_seeds
|
||||
from baselines import bench
|
||||
from baselines.acktr.acktr_disc import learn
|
||||
from baselines.common.cmd_util import make_atari_env, atari_arg_parser
|
||||
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
|
||||
from baselines.ppo2.policies import CnnPolicy
|
||||
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
|
||||
from baselines.common.atari_wrappers import make_atari, wrap_deepmind
|
||||
from baselines.acktr.policies import CnnPolicy
|
||||
|
||||
def train(env_id, num_timesteps, seed, num_cpu):
|
||||
env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
|
||||
def make_env(rank):
|
||||
def _thunk():
|
||||
env = make_atari(env_id)
|
||||
env.seed(seed + rank)
|
||||
env = bench.Monitor(env, logger.get_dir() and logger.get_dir())
|
||||
gym.logger.setLevel(logging.WARN)
|
||||
return wrap_deepmind(env)
|
||||
return _thunk
|
||||
set_global_seeds(seed)
|
||||
env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
|
||||
policy_fn = CnnPolicy
|
||||
learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu)
|
||||
env.close()
|
||||
|
||||
def main():
|
||||
args = atari_arg_parser().parse_args()
|
||||
import argparse
|
||||
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
|
||||
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
|
||||
parser.add_argument('--num-timesteps', type=int, default=int(10e6))
|
||||
args = parser.parse_args()
|
||||
logger.configure()
|
||||
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, num_cpu=32)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
@@ -1,14 +1,22 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
#!/usr/bin/env python
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import tensorflow as tf
|
||||
import gym
|
||||
from baselines import logger
|
||||
from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser
|
||||
from baselines.common import set_global_seeds
|
||||
from baselines import bench
|
||||
from baselines.acktr.acktr_cont import learn
|
||||
from baselines.acktr.policies import GaussianMlpPolicy
|
||||
from baselines.acktr.value_functions import NeuralNetValueFunction
|
||||
|
||||
def train(env_id, num_timesteps, seed):
|
||||
env = make_mujoco_env(env_id, seed)
|
||||
env=gym.make(env_id)
|
||||
env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
|
||||
set_global_seeds(seed)
|
||||
env.seed(seed)
|
||||
gym.logger.setLevel(logging.WARN)
|
||||
|
||||
with tf.Session(config=tf.ConfigProto()):
|
||||
ob_dim = env.observation_space.shape[0]
|
||||
@@ -25,10 +33,11 @@ def train(env_id, num_timesteps, seed):
|
||||
|
||||
env.close()
|
||||
|
||||
def main():
|
||||
args = mujoco_arg_parser().parse_args()
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description='Run Mujoco benchmark.')
|
||||
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
|
||||
parser.add_argument('--env', help='environment ID', type=str, default="Reacher-v1")
|
||||
parser.add_argument('--num-timesteps', type=int, default=int(1e6))
|
||||
args = parser.parse_args()
|
||||
logger.configure()
|
||||
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
@@ -1,8 +1,69 @@
|
||||
import os
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
import baselines.common.tf_util as U
|
||||
from collections import deque
|
||||
|
||||
def sample(logits):
    noise = tf.random_uniform(tf.shape(logits))
    return tf.argmax(logits - tf.log(-tf.log(noise)), 1)
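`sample` above draws from a categorical distribution with the Gumbel-max trick: perturbing the logits with `-log(-log(u))` noise and taking the argmax is equivalent to sampling from the softmax. A small NumPy sketch (illustrative only, not part of the diff) showing the equivalence empirically:

```
import numpy as np

rng = np.random.default_rng(0)
logits = np.array([1.0, 0.0, -1.0])
n = 200_000

# Gumbel-max: perturb logits with -log(-log(u)) noise, take the argmax.
u = rng.uniform(size=(n, logits.size))
samples = np.argmax(logits - np.log(-np.log(u)), axis=1)

freqs = np.bincount(samples, minlength=logits.size) / n
softmax = np.exp(logits) / np.exp(logits).sum()
print(np.round(freqs, 3), np.round(softmax, 3))  # the two should roughly agree
```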
def std(x):
|
||||
mean = tf.reduce_mean(x)
|
||||
var = tf.reduce_mean(tf.square(x-mean))
|
||||
return tf.sqrt(var)
|
||||
|
||||
def cat_entropy(logits):
|
||||
a0 = logits - tf.reduce_max(logits, 1, keep_dims=True)
|
||||
ea0 = tf.exp(a0)
|
||||
z0 = tf.reduce_sum(ea0, 1, keep_dims=True)
|
||||
p0 = ea0 / z0
|
||||
return tf.reduce_sum(p0 * (tf.log(z0) - a0), 1)
|
||||
|
||||
def cat_entropy_softmax(p0):
|
||||
return - tf.reduce_sum(p0 * tf.log(p0 + 1e-6), axis = 1)
|
||||
|
||||
def mse(pred, target):
|
||||
return tf.square(pred-target)/2.
|
||||
|
||||
def ortho_init(scale=1.0):
|
||||
def _ortho_init(shape, dtype, partition_info=None):
|
||||
#lasagne ortho init for tf
|
||||
shape = tuple(shape)
|
||||
if len(shape) == 2:
|
||||
flat_shape = shape
|
||||
elif len(shape) == 4: # assumes NHWC
|
||||
flat_shape = (np.prod(shape[:-1]), shape[-1])
|
||||
else:
|
||||
raise NotImplementedError
|
||||
a = np.random.normal(0.0, 1.0, flat_shape)
|
||||
u, _, v = np.linalg.svd(a, full_matrices=False)
|
||||
q = u if u.shape == flat_shape else v # pick the one with the correct shape
|
||||
q = q.reshape(shape)
|
||||
return (scale * q[:shape[0], :shape[1]]).astype(np.float32)
|
||||
return _ortho_init
|
||||
|
||||
def conv(x, scope, nf, rf, stride, pad='VALID', act=tf.nn.relu, init_scale=1.0):
|
||||
with tf.variable_scope(scope):
|
||||
nin = x.get_shape()[3].value
|
||||
w = tf.get_variable("w", [rf, rf, nin, nf], initializer=ortho_init(init_scale))
|
||||
b = tf.get_variable("b", [nf], initializer=tf.constant_initializer(0.0))
|
||||
z = tf.nn.conv2d(x, w, strides=[1, stride, stride, 1], padding=pad)+b
|
||||
h = act(z)
|
||||
return h
|
||||
|
||||
def fc(x, scope, nh, act=tf.nn.relu, init_scale=1.0):
|
||||
with tf.variable_scope(scope):
|
||||
nin = x.get_shape()[1].value
|
||||
w = tf.get_variable("w", [nin, nh], initializer=ortho_init(init_scale))
|
||||
b = tf.get_variable("b", [nh], initializer=tf.constant_initializer(0.0))
|
||||
z = tf.matmul(x, w)+b
|
||||
h = act(z)
|
||||
return h
|
||||
|
||||
def dense(x, size, name, weight_init=None, bias_init=0, weight_loss_dict=None, reuse=None):
|
||||
with tf.variable_scope(name, reuse=reuse):
|
||||
assert (len(tf.get_variable_scope().name.split('/')) == 2)
|
||||
assert (len(U.scope_name().split('/')) == 2)
|
||||
|
||||
w = tf.get_variable("w", [x.get_shape()[1], size], initializer=weight_init)
|
||||
b = tf.get_variable("b", [size], initializer=tf.constant_initializer(bias_init))
|
||||
@@ -14,10 +75,15 @@ def dense(x, size, name, weight_init=None, bias_init=0, weight_loss_dict=None, r
|
||||
weight_loss_dict[w] = weight_decay_fc
|
||||
weight_loss_dict[b] = 0.0
|
||||
|
||||
tf.add_to_collection(tf.get_variable_scope().name.split('/')[0] + '_' + 'losses', weight_decay)
|
||||
tf.add_to_collection(U.scope_name().split('/')[0] + '_' + 'losses', weight_decay)
|
||||
|
||||
return tf.nn.bias_add(tf.matmul(x, w), b)
|
||||
|
||||
def conv_to_fc(x):
|
||||
nh = np.prod([v.value for v in x.get_shape()[1:]])
|
||||
x = tf.reshape(x, [-1, nh])
|
||||
return x
|
||||
|
||||
def kl_div(action_dist1, action_dist2, action_size):
|
||||
mean1, std1 = action_dist1[:, :action_size], action_dist1[:, action_size:]
|
||||
mean2, std2 = action_dist2[:, :action_size], action_dist2[:, action_size:]
|
||||
@@ -26,3 +92,109 @@ def kl_div(action_dist1, action_dist2, action_size):
|
||||
denominator = 2 * tf.square(std2) + 1e-8
|
||||
return tf.reduce_sum(
|
||||
numerator/denominator + tf.log(std2) - tf.log(std1),reduction_indices=-1)
|
||||
|
||||
def discount_with_dones(rewards, dones, gamma):
    discounted = []
    r = 0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        r = reward + gamma*r*(1.-done) # fixed off by one bug
        discounted.append(r)
    return discounted[::-1]
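As a quick worked example: with `rewards = [1, 1, 1]`, `dones = [0, 0, 1]` and `gamma = 0.99`, the backward pass yields returns `[2.9701, 1.99, 1.0]`; the terminal flag on the last step stops the bootstrap from leaking across episode boundaries. A usage check, assuming the function above is in scope:

```
returns = discount_with_dones([1.0, 1.0, 1.0], [0, 0, 1], gamma=0.99)
print(returns)  # [2.9701, 1.99, 1.0]
```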
def find_trainable_variables(key):
|
||||
with tf.variable_scope(key):
|
||||
return tf.trainable_variables()
|
||||
|
||||
def make_path(f):
|
||||
return os.makedirs(f, exist_ok=True)
|
||||
|
||||
def constant(p):
|
||||
return 1
|
||||
|
||||
def linear(p):
|
||||
return 1-p
|
||||
|
||||
|
||||
def middle_drop(p):
|
||||
eps = 0.75
|
||||
if 1-p<eps:
|
||||
return eps*0.1
|
||||
return 1-p
|
||||
|
||||
def double_linear_con(p):
|
||||
p *= 2
|
||||
eps = 0.125
|
||||
if 1-p<eps:
|
||||
return eps
|
||||
return 1-p
|
||||
|
||||
|
||||
def double_middle_drop(p):
|
||||
eps1 = 0.75
|
||||
eps2 = 0.25
|
||||
if 1-p<eps1:
|
||||
if 1-p<eps2:
|
||||
return eps2*0.5
|
||||
return eps1*0.1
|
||||
return 1-p
|
||||
|
||||
|
||||
schedules = {
    'linear':linear,
    'constant':constant,
    'double_linear_con':double_linear_con,
    'middle_drop':middle_drop,
    'double_middle_drop':double_middle_drop
}

class Scheduler(object):

    def __init__(self, v, nvalues, schedule):
        self.n = 0.
        self.v = v
        self.nvalues = nvalues
        self.schedule = schedules[schedule]

    def value(self):
        current_value = self.v*self.schedule(self.n/self.nvalues)
        self.n += 1.
        return current_value

    def value_steps(self, steps):
        return self.v*self.schedule(steps/self.nvalues)
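A short usage sketch of the scheduler (hypothetical values, assuming the class above is in scope): with the `'linear'` schedule the returned value decays from `v` toward zero over `nvalues` calls, which is how the learning rate is annealed over the training run.

```
lr_schedule = Scheduler(v=7e-4, nvalues=1000, schedule='linear')
lrs = [lr_schedule.value() for _ in range(1000)]
print(lrs[0], lrs[500], lrs[-1])  # ~7e-4, ~3.5e-4, ~7e-7
```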
class EpisodeStats:
|
||||
def __init__(self, nsteps, nenvs):
|
||||
self.episode_rewards = []
|
||||
for i in range(nenvs):
|
||||
self.episode_rewards.append([])
|
||||
self.lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths
|
||||
self.rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards
|
||||
self.nsteps = nsteps
|
||||
self.nenvs = nenvs
|
||||
|
||||
def feed(self, rewards, masks):
|
||||
rewards = np.reshape(rewards, [self.nenvs, self.nsteps])
|
||||
masks = np.reshape(masks, [self.nenvs, self.nsteps])
|
||||
for i in range(0, self.nenvs):
|
||||
for j in range(0, self.nsteps):
|
||||
self.episode_rewards[i].append(rewards[i][j])
|
||||
if masks[i][j]:
|
||||
l = len(self.episode_rewards[i])
|
||||
s = sum(self.episode_rewards[i])
|
||||
self.lenbuffer.append(l)
|
||||
self.rewbuffer.append(s)
|
||||
self.episode_rewards[i] = []
|
||||
|
||||
def mean_length(self):
|
||||
if self.lenbuffer:
|
||||
return np.mean(self.lenbuffer)
|
||||
else:
|
||||
return 0 # on the first params dump, no episodes are finished
|
||||
|
||||
def mean_reward(self):
|
||||
if self.rewbuffer:
|
||||
return np.mean(self.rewbuffer)
|
||||
else:
|
||||
return 0
|
||||
|
@@ -1,6 +1,6 @@
from baselines import logger
import numpy as np
import baselines.common as common
from baselines import common
from baselines.common import tf_util as U
import tensorflow as tf
from baselines.acktr import kfac

@@ -16,8 +16,8 @@ class NeuralNetValueFunction(object):
vpred_n = dense(h2, 1, "hfinal", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)[:,0]
sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
wd_loss = tf.get_collection("vf_losses", None)
loss = tf.reduce_mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
loss_sampled = tf.reduce_mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))
loss = U.mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
loss_sampled = U.mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))
self._predict = U.function([X], vpred_n)
optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9, \
clip_kl=0.3, epsilon=0.1, stats_decay=0.95, \

@@ -1,2 +1,2 @@
from baselines.bench.benchmarks import *
from baselines.bench.monitor import *
from baselines.bench.monitor import *
@@ -1,24 +1,15 @@
|
||||
import re
|
||||
import os.path as osp
|
||||
import os
|
||||
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
_atari7 = ['BeamRider', 'Breakout', 'Enduro', 'Pong', 'Qbert', 'Seaquest', 'SpaceInvaders']
|
||||
_atariexpl7 = ['Freeway', 'Gravitar', 'MontezumaRevenge', 'Pitfall', 'PrivateEye', 'Solaris', 'Venture']
|
||||
|
||||
_BENCHMARKS = []
|
||||
|
||||
remove_version_re = re.compile(r'-v\d+$')
|
||||
|
||||
def register_benchmark(benchmark):
|
||||
for b in _BENCHMARKS:
|
||||
if b['name'] == benchmark['name']:
|
||||
raise ValueError('Benchmark with name %s already registered!' % b['name'])
|
||||
|
||||
# automatically add a description if it is not present
|
||||
if 'tasks' in benchmark:
|
||||
for t in benchmark['tasks']:
|
||||
if 'desc' not in t:
|
||||
t['desc'] = remove_version_re.sub('', t['env_id'])
|
||||
_BENCHMARKS.append(benchmark)
|
||||
|
||||
|
||||
@@ -51,34 +42,36 @@ _ATARI_SUFFIX = 'NoFrameskip-v4'
|
||||
register_benchmark({
|
||||
'name': 'Atari50M',
|
||||
'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 50M timesteps',
|
||||
'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(50e6)} for _game in _atari7]
|
||||
'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(50e6)} for _game in _atari7]
|
||||
})
|
||||
|
||||
register_benchmark({
|
||||
'name': 'Atari10M',
|
||||
'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 10M timesteps',
|
||||
'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atari7]
|
||||
'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atari7]
|
||||
})
|
||||
|
||||
register_benchmark({
|
||||
'name': 'Atari1Hr',
|
||||
'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 1 hour of walltime',
|
||||
'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_seconds': 60 * 60} for _game in _atari7]
|
||||
'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_seconds': 60 * 60} for _game in _atari7]
|
||||
})
|
||||
|
||||
register_benchmark({
|
||||
'name': 'AtariExploration10M',
|
||||
'description': '7 Atari games emphasizing exploration, with pixel observations, 10M timesteps',
|
||||
'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atariexpl7]
|
||||
'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atariexpl7]
|
||||
})
|
||||
|
||||
|
||||
|
||||
|
||||
# MuJoCo
|
||||
|
||||
_mujocosmall = [
|
||||
'InvertedDoublePendulum-v2', 'InvertedPendulum-v2',
|
||||
'HalfCheetah-v2', 'Hopper-v2', 'Walker2d-v2',
|
||||
'Reacher-v2', 'Swimmer-v2']
|
||||
'InvertedDoublePendulum-v1', 'InvertedPendulum-v1',
|
||||
'HalfCheetah-v1', 'Hopper-v1', 'Walker2d-v1',
|
||||
'Reacher-v1', 'Swimmer-v1']
|
||||
register_benchmark({
|
||||
'name': 'Mujoco1M',
|
||||
'description': 'Some small 2D MuJoCo tasks, run for 1M timesteps',
|
||||
@@ -135,6 +128,5 @@ _atari50 = [ # actually 47
|
||||
register_benchmark({
|
||||
'name': 'Atari50_10M',
|
||||
'description': '47 Atari games from Mnih et al. (2013), with pixel observations, 10M timesteps',
|
||||
'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atari50]
|
||||
'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 3, 'num_timesteps': int(10e6)} for _game in _atari50]
|
||||
})
|
||||
|
||||
|
@@ -7,13 +7,12 @@ from glob import glob
|
||||
import csv
|
||||
import os.path as osp
|
||||
import json
|
||||
import numpy as np
|
||||
|
||||
class Monitor(Wrapper):
|
||||
EXT = "monitor.csv"
|
||||
f = None
|
||||
|
||||
def __init__(self, env, filename, allow_early_resets=False, reset_keywords=(), info_keywords=()):
|
||||
def __init__(self, env, filename, allow_early_resets=False, reset_keywords=()):
|
||||
Wrapper.__init__(self, env=env)
|
||||
self.tstart = time.time()
|
||||
if filename is None:
|
||||
@@ -26,23 +25,21 @@ class Monitor(Wrapper):
|
||||
else:
|
||||
filename = filename + "." + Monitor.EXT
|
||||
self.f = open(filename, "wt")
|
||||
self.f.write('#%s\n'%json.dumps({"t_start": self.tstart, 'env_id' : env.spec and env.spec.id}))
|
||||
self.logger = csv.DictWriter(self.f, fieldnames=('r', 'l', 't')+reset_keywords+info_keywords)
|
||||
self.f.write('#%s\n'%json.dumps({"t_start": self.tstart, "gym_version": gym.__version__,
|
||||
"env_id": env.spec.id if env.spec else 'Unknown'}))
|
||||
self.logger = csv.DictWriter(self.f, fieldnames=('r', 'l', 't')+reset_keywords)
|
||||
self.logger.writeheader()
|
||||
self.f.flush()
|
||||
|
||||
self.reset_keywords = reset_keywords
|
||||
self.info_keywords = info_keywords
|
||||
self.allow_early_resets = allow_early_resets
|
||||
self.rewards = None
|
||||
self.needs_reset = True
|
||||
self.episode_rewards = []
|
||||
self.episode_lengths = []
|
||||
self.episode_times = []
|
||||
self.total_steps = 0
|
||||
self.current_reset_info = {} # extra info about the current episode, that was passed in during reset()
|
||||
|
||||
def reset(self, **kwargs):
|
||||
def _reset(self, **kwargs):
|
||||
if not self.allow_early_resets and not self.needs_reset:
|
||||
raise RuntimeError("Tried to reset an environment before done. If you want to allow early resets, wrap your env with Monitor(env, path, allow_early_resets=True)")
|
||||
self.rewards = []
|
||||
@@ -54,7 +51,7 @@ class Monitor(Wrapper):
|
||||
self.current_reset_info[k] = v
|
||||
return self.env.reset(**kwargs)
|
||||
|
||||
def step(self, action):
|
||||
def _step(self, action):
|
||||
if self.needs_reset:
|
||||
raise RuntimeError("Tried to step environment that needs reset")
|
||||
ob, rew, done, info = self.env.step(action)
|
||||
@@ -64,15 +61,12 @@ class Monitor(Wrapper):
|
||||
eprew = sum(self.rewards)
|
||||
eplen = len(self.rewards)
|
||||
epinfo = {"r": round(eprew, 6), "l": eplen, "t": round(time.time() - self.tstart, 6)}
|
||||
for k in self.info_keywords:
|
||||
epinfo[k] = info[k]
|
||||
self.episode_rewards.append(eprew)
|
||||
self.episode_lengths.append(eplen)
|
||||
self.episode_times.append(time.time() - self.tstart)
|
||||
epinfo.update(self.current_reset_info)
|
||||
if self.logger:
|
||||
self.logger.writerow(epinfo)
|
||||
self.f.flush()
|
||||
self.episode_rewards.append(eprew)
|
||||
self.episode_lengths.append(eplen)
|
||||
info['episode'] = epinfo
|
||||
self.total_steps += 1
|
||||
return (ob, rew, done, info)
|
||||
@@ -90,9 +84,6 @@ class Monitor(Wrapper):
|
||||
def get_episode_lengths(self):
|
||||
return self.episode_lengths
|
||||
|
||||
def get_episode_times(self):
|
||||
return self.episode_times
|
||||
|
||||
class LoadMonitorResultsError(Exception):
|
||||
pass
|
||||
|
||||
@@ -101,9 +92,7 @@ def get_monitor_files(dir):
|
||||
|
||||
def load_results(dir):
|
||||
import pandas
|
||||
monitor_files = (
|
||||
glob(osp.join(dir, "*monitor.json")) +
|
||||
glob(osp.join(dir, "*monitor.csv"))) # get both csv and (old) json files
|
||||
monitor_files = glob(osp.join(dir, "*monitor.*")) # get both csv and (old) json files
|
||||
if not monitor_files:
|
||||
raise LoadMonitorResultsError("no monitor files of the form *%s found in %s" % (Monitor.EXT, dir))
|
||||
dfs = []
|
||||
@@ -125,37 +114,10 @@ def load_results(dir):
|
||||
episode = json.loads(line)
|
||||
episodes.append(episode)
|
||||
df = pandas.DataFrame(episodes)
|
||||
else:
|
||||
assert 0, 'unreachable'
|
||||
df['t'] += header['t_start']
|
||||
df['t'] += header['t_start']
|
||||
dfs.append(df)
|
||||
df = pandas.concat(dfs)
|
||||
df.sort_values('t', inplace=True)
|
||||
df.reset_index(inplace=True)
|
||||
df['t'] -= min(header['t_start'] for header in headers)
|
||||
df.headers = headers # HACK to preserve backwards compatibility
|
||||
return df
|
||||
|
||||
def test_monitor():
|
||||
env = gym.make("CartPole-v1")
|
||||
env.seed(0)
|
||||
mon_file = "/tmp/baselines-test-%s.monitor.csv" % uuid.uuid4()
|
||||
menv = Monitor(env, mon_file)
|
||||
menv.reset()
|
||||
for _ in range(1000):
|
||||
_, _, done, _ = menv.step(0)
|
||||
if done:
|
||||
menv.reset()
|
||||
|
||||
f = open(mon_file, 'rt')
|
||||
|
||||
firstline = f.readline()
|
||||
assert firstline.startswith('#')
|
||||
metadata = json.loads(firstline[1:])
|
||||
assert metadata['env_id'] == "CartPole-v1"
|
||||
assert set(metadata.keys()) == {'env_id', 'gym_version', 't_start'}, "Incorrect keys in monitor metadata"
|
||||
|
||||
last_logline = pandas.read_csv(f, index_col=None)
|
||||
assert set(last_logline.keys()) == {'l', 't', 'r'}, "Incorrect keys in monitor logline"
|
||||
f.close()
|
||||
os.remove(mon_file)
|
||||
return df
|
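For reference, a minimal usage sketch of the `Monitor` wrapper and `load_results` as they appear on this branch (paths and environment are illustrative, and the older gym API used here is assumed):

```
import os
import gym
from baselines.bench import Monitor
from baselines.bench.monitor import load_results

os.makedirs("/tmp/example-run", exist_ok=True)
env = Monitor(gym.make("CartPole-v1"), "/tmp/example-run/0", allow_early_resets=True)
ob = env.reset()
for _ in range(100):
    ob, rew, done, info = env.step(env.action_space.sample())
    if done:
        ob = env.reset()
env.close()

# load_results scans the directory for *monitor.* files and returns a pandas
# DataFrame with one row per finished episode (columns r, l, t).
df = load_results("/tmp/example-run")
print(df.head())
```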
@@ -3,7 +3,6 @@ from collections import deque
|
||||
import gym
|
||||
from gym import spaces
|
||||
import cv2
|
||||
cv2.ocl.setUseOpenCL(False)
|
||||
|
||||
class NoopResetEnv(gym.Wrapper):
|
||||
def __init__(self, env, noop_max=30):
|
||||
@@ -13,10 +12,14 @@ class NoopResetEnv(gym.Wrapper):
|
||||
gym.Wrapper.__init__(self, env)
|
||||
self.noop_max = noop_max
|
||||
self.override_num_noops = None
|
||||
self.noop_action = 0
|
||||
assert env.unwrapped.get_action_meanings()[0] == 'NOOP'
|
||||
if isinstance(env.action_space, gym.spaces.MultiBinary):
|
||||
self.noop_action = np.zeros(self.env.action_space.n, dtype=np.int64)
|
||||
else:
|
||||
# used for atari environments
|
||||
self.noop_action = 0
|
||||
assert env.unwrapped.get_action_meanings()[0] == 'NOOP'
|
||||
|
||||
def reset(self, **kwargs):
|
||||
def _reset(self, **kwargs):
|
||||
""" Do no-op action for a number of steps in [1, noop_max]."""
|
||||
self.env.reset(**kwargs)
|
||||
if self.override_num_noops is not None:
|
||||
@@ -31,9 +34,6 @@ class NoopResetEnv(gym.Wrapper):
|
||||
obs = self.env.reset(**kwargs)
|
||||
return obs
|
||||
|
||||
def step(self, ac):
|
||||
return self.env.step(ac)
|
||||
|
||||
class FireResetEnv(gym.Wrapper):
|
||||
def __init__(self, env):
|
||||
"""Take action on reset for environments that are fixed until firing."""
|
||||
@@ -41,7 +41,7 @@ class FireResetEnv(gym.Wrapper):
|
||||
assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
|
||||
assert len(env.unwrapped.get_action_meanings()) >= 3
|
||||
|
||||
def reset(self, **kwargs):
|
||||
def _reset(self, **kwargs):
|
||||
self.env.reset(**kwargs)
|
||||
obs, _, done, _ = self.env.step(1)
|
||||
if done:
|
||||
@@ -51,9 +51,6 @@ class FireResetEnv(gym.Wrapper):
|
||||
self.env.reset(**kwargs)
|
||||
return obs
|
||||
|
||||
def step(self, ac):
|
||||
return self.env.step(ac)
|
||||
|
||||
class EpisodicLifeEnv(gym.Wrapper):
|
||||
def __init__(self, env):
|
||||
"""Make end-of-life == end-of-episode, but only reset on true game over.
|
||||
@@ -63,21 +60,21 @@ class EpisodicLifeEnv(gym.Wrapper):
|
||||
self.lives = 0
|
||||
self.was_real_done = True
|
||||
|
||||
def step(self, action):
|
||||
def _step(self, action):
|
||||
obs, reward, done, info = self.env.step(action)
|
||||
self.was_real_done = done
|
||||
# check current lives, make loss of life terminal,
|
||||
# then update lives to handle bonus lives
|
||||
lives = self.env.unwrapped.ale.lives()
|
||||
if lives < self.lives and lives > 0:
|
||||
# for Qbert sometimes we stay in lives == 0 condtion for a few frames
|
||||
# for Qbert somtimes we stay in lives == 0 condtion for a few frames
|
||||
# so its important to keep lives > 0, so that we only reset once
|
||||
# the environment advertises done.
|
||||
done = True
|
||||
self.lives = lives
|
||||
return obs, reward, done, info
|
||||
|
||||
def reset(self, **kwargs):
|
||||
def _reset(self, **kwargs):
|
||||
"""Reset only when lives are exhausted.
|
||||
This way all states are still reachable even though lives are episodic,
|
||||
and the learner need not know about any of this behind-the-scenes.
|
||||
@@ -95,13 +92,10 @@ class MaxAndSkipEnv(gym.Wrapper):
|
||||
"""Return only every `skip`-th frame"""
|
||||
gym.Wrapper.__init__(self, env)
|
||||
# most recent raw observations (for max pooling across time steps)
|
||||
self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8)
|
||||
self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype='uint8')
|
||||
self._skip = skip
|
||||
|
||||
def reset(self):
|
||||
return self.env.reset()
|
||||
|
||||
def step(self, action):
|
||||
def _step(self, action):
|
||||
"""Repeat action, sum reward, and max over last observations."""
|
||||
total_reward = 0.0
|
||||
done = None
|
||||
@@ -118,14 +112,8 @@ class MaxAndSkipEnv(gym.Wrapper):
|
||||
|
||||
return max_frame, total_reward, done, info
|
||||
|
||||
def reset(self, **kwargs):
|
||||
return self.env.reset(**kwargs)
|
||||
|
||||
class ClipRewardEnv(gym.RewardWrapper):
|
||||
def __init__(self, env):
|
||||
gym.RewardWrapper.__init__(self, env)
|
||||
|
||||
def reward(self, reward):
|
||||
def _reward(self, reward):
|
||||
"""Bin reward to {+1, 0, -1} by its sign."""
|
||||
return np.sign(reward)
|
||||
|
||||
@@ -135,10 +123,9 @@ class WarpFrame(gym.ObservationWrapper):
|
||||
gym.ObservationWrapper.__init__(self, env)
|
||||
self.width = 84
|
||||
self.height = 84
|
||||
self.observation_space = spaces.Box(low=0, high=255,
|
||||
shape=(self.height, self.width, 1), dtype=np.uint8)
|
||||
self.observation_space = spaces.Box(low=0, high=255, shape=(self.height, self.width, 1))
|
||||
|
||||
def observation(self, frame):
|
||||
def _observation(self, frame):
|
||||
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
|
||||
frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA)
|
||||
return frame[:, :, None]
|
||||
@@ -157,15 +144,15 @@ class FrameStack(gym.Wrapper):
|
||||
self.k = k
|
||||
self.frames = deque([], maxlen=k)
|
||||
shp = env.observation_space.shape
|
||||
self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=np.uint8)
|
||||
self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k))
|
||||
|
||||
def reset(self):
|
||||
def _reset(self):
|
||||
ob = self.env.reset()
|
||||
for _ in range(self.k):
|
||||
self.frames.append(ob)
|
||||
return self._get_ob()
|
||||
|
||||
def step(self, action):
|
||||
def _step(self, action):
|
||||
ob, reward, done, info = self.env.step(action)
|
||||
self.frames.append(ob)
|
||||
return self._get_ob(), reward, done, info
|
||||
@@ -175,10 +162,7 @@ class FrameStack(gym.Wrapper):
|
||||
return LazyFrames(list(self.frames))
|
||||
|
||||
class ScaledFloatFrame(gym.ObservationWrapper):
|
||||
def __init__(self, env):
|
||||
gym.ObservationWrapper.__init__(self, env)
|
||||
|
||||
def observation(self, observation):
|
||||
def _observation(self, observation):
|
||||
# careful! This undoes the memory optimization, use
|
||||
# with smaller replay buffers only.
|
||||
return np.array(observation).astype(np.float32) / 255.0
|
||||
@@ -191,28 +175,15 @@ class LazyFrames(object):
|
||||
|
||||
This object should only be converted to numpy array before being passed to the model.
|
||||
|
||||
You'd not believe how complex the previous solution was."""
|
||||
You'd not belive how complex the previous solution was."""
|
||||
self._frames = frames
|
||||
self._out = None
|
||||
|
||||
def _force(self):
|
||||
if self._out is None:
|
||||
self._out = np.concatenate(self._frames, axis=2)
|
||||
self._frames = None
|
||||
return self._out
|
||||
|
||||
def __array__(self, dtype=None):
|
||||
out = self._force()
|
||||
out = np.concatenate(self._frames, axis=2)
|
||||
if dtype is not None:
|
||||
out = out.astype(dtype)
|
||||
return out
|
||||
|
||||
def __len__(self):
|
||||
return len(self._force())
|
||||
|
||||
def __getitem__(self, i):
|
||||
return self._force()[i]
|
||||
|
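`LazyFrames` defers the concatenation so stacked observations kept in a replay buffer share the underlying frame arrays; converting with `np.array` (or indexing) materializes them. A tiny sketch of the intended usage, assuming the class above is in scope:

```
import numpy as np

frames = [np.zeros((84, 84, 1), dtype=np.uint8) for _ in range(4)]
stacked = LazyFrames(frames)   # cheap: keeps references only
obs = np.array(stacked)        # materializes an (84, 84, 4) array when the model needs it
print(obs.shape)
```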
||||
def make_atari(env_id):
|
||||
env = gym.make(env_id)
|
||||
assert 'NoFrameskip' in env.spec.id
|
||||
|
baselines/common/azure_utils.py (new file, 154 lines)
@@ -0,0 +1,154 @@
|
||||
import os
|
||||
import tempfile
|
||||
import zipfile
|
||||
|
||||
from azure.common import AzureMissingResourceHttpError
|
||||
try:
|
||||
from azure.storage.blob import BlobService
|
||||
except ImportError:
|
||||
from azure.storage.blob import BlockBlobService as BlobService
|
||||
from shutil import unpack_archive
|
||||
from threading import Event
|
||||
|
||||
# TODOS: use Azure snapshots instead of hacky backups
|
||||
|
||||
def fixed_list_blobs(service, *args, **kwargs):
|
||||
"""By defualt list_containers only returns a subset of results.
|
||||
|
||||
This function attempts to fix this.
|
||||
"""
|
||||
res = []
|
||||
next_marker = None
|
||||
while next_marker is None or len(next_marker) > 0:
|
||||
kwargs['marker'] = next_marker
|
||||
gen = service.list_blobs(*args, **kwargs)
|
||||
for b in gen:
|
||||
res.append(b.name)
|
||||
next_marker = gen.next_marker
|
||||
return res
|
||||
|
||||
|
||||
def make_archive(source_path, dest_path):
|
||||
if source_path.endswith(os.path.sep):
|
||||
source_path = source_path.rstrip(os.path.sep)
|
||||
prefix_path = os.path.dirname(source_path)
|
||||
with zipfile.ZipFile(dest_path, "w", compression=zipfile.ZIP_STORED) as zf:
|
||||
if os.path.isdir(source_path):
|
||||
for dirname, _subdirs, files in os.walk(source_path):
|
||||
zf.write(dirname, os.path.relpath(dirname, prefix_path))
|
||||
for filename in files:
|
||||
filepath = os.path.join(dirname, filename)
|
||||
zf.write(filepath, os.path.relpath(filepath, prefix_path))
|
||||
else:
|
||||
zf.write(source_path, os.path.relpath(source_path, prefix_path))
|
||||
|
||||
|
||||
class Container(object):
|
||||
services = {}
|
||||
|
||||
def __init__(self, account_name, account_key, container_name, maybe_create=False):
|
||||
self._account_name = account_name
|
||||
self._container_name = container_name
|
||||
if account_name not in Container.services:
|
||||
Container.services[account_name] = BlobService(account_name, account_key)
|
||||
self._service = Container.services[account_name]
|
||||
if maybe_create:
|
||||
self._service.create_container(self._container_name, fail_on_exist=False)
|
||||
|
||||
def put(self, source_path, blob_name, callback=None):
|
||||
"""Upload a file or directory from `source_path` to azure blob `blob_name`.
|
||||
|
||||
Upload progress can be traced by an optional callback.
|
||||
"""
|
||||
upload_done = Event()
|
||||
|
||||
def progress_callback(current, total):
|
||||
if callback:
|
||||
callback(current, total)
|
||||
if current >= total:
|
||||
upload_done.set()
|
||||
|
||||
# Attempt to make backup if an existing version is already available
|
||||
try:
|
||||
x_ms_copy_source = "https://{}.blob.core.windows.net/{}/{}".format(
|
||||
self._account_name,
|
||||
self._container_name,
|
||||
blob_name
|
||||
)
|
||||
self._service.copy_blob(
|
||||
container_name=self._container_name,
|
||||
blob_name=blob_name + ".backup",
|
||||
x_ms_copy_source=x_ms_copy_source
|
||||
)
|
||||
except AzureMissingResourceHttpError:
|
||||
pass
|
||||
|
||||
with tempfile.TemporaryDirectory() as td:
|
||||
arcpath = os.path.join(td, "archive.zip")
|
||||
make_archive(source_path, arcpath)
|
||||
self._service.put_block_blob_from_path(
|
||||
container_name=self._container_name,
|
||||
blob_name=blob_name,
|
||||
file_path=arcpath,
|
||||
max_connections=4,
|
||||
progress_callback=progress_callback,
|
||||
max_retries=10)
|
||||
upload_done.wait()
|
||||
|
||||
def get(self, dest_path, blob_name, callback=None):
|
||||
"""Download a file or directory to `dest_path` to azure blob `blob_name`.
|
||||
|
||||
Warning! If directory is downloaded the `dest_path` is the parent directory.
|
||||
|
||||
Upload progress can be traced by an optional callback.
|
||||
"""
|
||||
download_done = Event()
|
||||
|
||||
def progress_callback(current, total):
|
||||
if callback:
|
||||
callback(current, total)
|
||||
if current >= total:
|
||||
download_done.set()
|
||||
|
||||
with tempfile.TemporaryDirectory() as td:
|
||||
arcpath = os.path.join(td, "archive.zip")
|
||||
for backup_blob_name in [blob_name, blob_name + '.backup']:
|
||||
try:
|
||||
properties = self._service.get_blob_properties(
|
||||
blob_name=backup_blob_name,
|
||||
container_name=self._container_name
|
||||
)
|
||||
if hasattr(properties, 'properties'):
|
||||
# Annoyingly, Azure has changed the API and this now returns a blob
|
||||
# instead of its properties with up-to-date azure package.
|
||||
blob_size = properties.properties.content_length
|
||||
else:
|
||||
blob_size = properties['content-length']
|
||||
if int(blob_size) > 0:
|
||||
self._service.get_blob_to_path(
|
||||
container_name=self._container_name,
|
||||
blob_name=backup_blob_name,
|
||||
file_path=arcpath,
|
||||
max_connections=4,
|
||||
progress_callback=progress_callback)
|
||||
unpack_archive(arcpath, dest_path)
|
||||
download_done.wait()
|
||||
return True
|
||||
except AzureMissingResourceHttpError:
|
||||
pass
|
||||
return False
|
||||
|
||||
def list(self, prefix=None):
|
||||
"""List all blobs in the container."""
|
||||
return fixed_list_blobs(self._service, self._container_name, prefix=prefix)
|
||||
|
||||
def exists(self, blob_name):
|
||||
"""Returns true if `blob_name` exists in container."""
|
||||
try:
|
||||
self._service.get_blob_properties(
|
||||
blob_name=blob_name,
|
||||
container_name=self._container_name
|
||||
)
|
||||
return True
|
||||
except AzureMissingResourceHttpError:
|
||||
return False
|
@@ -1,88 +0,0 @@
|
||||
"""
|
||||
Helpers for scripts like run_atari.py.
|
||||
"""
|
||||
|
||||
import os
|
||||
import gym
|
||||
from gym.wrappers import FlattenDictWrapper
|
||||
from baselines import logger
|
||||
from baselines.bench import Monitor
|
||||
from baselines.common import set_global_seeds
|
||||
from baselines.common.atari_wrappers import make_atari, wrap_deepmind
|
||||
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
|
||||
from mpi4py import MPI
|
||||
|
||||
def make_atari_env(env_id, num_env, seed, wrapper_kwargs=None, start_index=0):
|
||||
"""
|
||||
Create a wrapped, monitored SubprocVecEnv for Atari.
|
||||
"""
|
||||
if wrapper_kwargs is None: wrapper_kwargs = {}
|
||||
def make_env(rank): # pylint: disable=C0111
|
||||
def _thunk():
|
||||
env = make_atari(env_id)
|
||||
env.seed(seed + rank)
|
||||
env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
|
||||
return wrap_deepmind(env, **wrapper_kwargs)
|
||||
return _thunk
|
||||
set_global_seeds(seed)
|
||||
return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)])
|
||||
|
||||
def make_mujoco_env(env_id, seed):
|
||||
"""
|
||||
Create a wrapped, monitored gym.Env for MuJoCo.
|
||||
"""
|
||||
set_global_seeds(seed)
|
||||
env = gym.make(env_id)
|
||||
env = Monitor(env, logger.get_dir())
|
||||
env.seed(seed)
|
||||
return env
|
||||
|
||||
def make_robotics_env(env_id, seed, rank=0):
|
||||
"""
|
||||
Create a wrapped, monitored gym.Env for MuJoCo.
|
||||
"""
|
||||
set_global_seeds(seed)
|
||||
env = gym.make(env_id)
|
||||
env = FlattenDictWrapper(env, ['observation', 'desired_goal'])
|
||||
env = Monitor(
|
||||
env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
|
||||
info_keywords=('is_success',))
|
||||
env.seed(seed)
|
||||
return env
|
||||
|
||||
def arg_parser():
|
||||
"""
|
||||
Create an empty argparse.ArgumentParser.
|
||||
"""
|
||||
import argparse
|
||||
return argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
|
||||
def atari_arg_parser():
|
||||
"""
|
||||
Create an argparse.ArgumentParser for run_atari.py.
|
||||
"""
|
||||
parser = arg_parser()
|
||||
parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
|
||||
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
|
||||
parser.add_argument('--num-timesteps', type=int, default=int(10e6))
|
||||
return parser
|
||||
|
||||
def mujoco_arg_parser():
|
||||
"""
|
||||
Create an argparse.ArgumentParser for run_mujoco.py.
|
||||
"""
|
||||
parser = arg_parser()
|
||||
parser.add_argument('--env', help='environment ID', type=str, default='Reacher-v2')
|
||||
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
|
||||
parser.add_argument('--num-timesteps', type=int, default=int(1e6))
|
||||
return parser
|
||||
|
||||
def robotics_arg_parser():
|
||||
"""
|
||||
Create an argparse.ArgumentParser for run_mujoco.py.
|
||||
"""
|
||||
parser = arg_parser()
|
||||
parser.add_argument('--env', help='environment ID', type=str, default='FetchReach-v0')
|
||||
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
|
||||
parser.add_argument('--num-timesteps', type=int, default=int(1e6))
|
||||
return parser
|
@@ -16,12 +16,7 @@ def fmt_item(x, l):
if isinstance(x, np.ndarray):
assert x.ndim==0
x = x.item()
if isinstance(x, (float, np.float32, np.float64)):
v = abs(x)
if (v < 1e-4 or v > 1e+4) and v > 0:
rep = "%7.2e" % x
else:
rep = "%7.5f" % x
if isinstance(x, float): rep = "%g"%x
else: rep = str(x)
return " "*(l - len(rep)) + rep
@@ -57,12 +57,14 @@ class CategoricalPdType(PdType):
|
||||
|
||||
|
||||
class MultiCategoricalPdType(PdType):
|
||||
def __init__(self, nvec):
|
||||
self.ncats = nvec
|
||||
def __init__(self, low, high):
|
||||
self.low = low
|
||||
self.high = high
|
||||
self.ncats = high - low + 1
|
||||
def pdclass(self):
|
||||
return MultiCategoricalPd
|
||||
def pdfromflat(self, flat):
|
||||
return MultiCategoricalPd(self.ncats, flat)
|
||||
return MultiCategoricalPd(self.low, self.high, flat)
|
||||
def param_shape(self):
|
||||
return [sum(self.ncats)]
|
||||
def sample_shape(self):
|
||||
@@ -123,7 +125,7 @@ class CategoricalPd(Pd):
|
||||
def flatparam(self):
|
||||
return self.logits
|
||||
def mode(self):
|
||||
return tf.argmax(self.logits, axis=-1)
|
||||
return U.argmax(self.logits, axis=-1)
|
||||
def neglogp(self, x):
|
||||
# return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x)
|
||||
# Note: we can't use sparse_softmax_cross_entropy_with_logits because
|
||||
@@ -133,20 +135,20 @@ class CategoricalPd(Pd):
|
||||
logits=self.logits,
|
||||
labels=one_hot_actions)
|
||||
def kl(self, other):
|
||||
a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keep_dims=True)
|
||||
a1 = other.logits - tf.reduce_max(other.logits, axis=-1, keep_dims=True)
|
||||
a0 = self.logits - U.max(self.logits, axis=-1, keepdims=True)
|
||||
a1 = other.logits - U.max(other.logits, axis=-1, keepdims=True)
|
||||
ea0 = tf.exp(a0)
|
||||
ea1 = tf.exp(a1)
|
||||
z0 = tf.reduce_sum(ea0, axis=-1, keep_dims=True)
|
||||
z1 = tf.reduce_sum(ea1, axis=-1, keep_dims=True)
|
||||
z0 = U.sum(ea0, axis=-1, keepdims=True)
|
||||
z1 = U.sum(ea1, axis=-1, keepdims=True)
|
||||
p0 = ea0 / z0
|
||||
return tf.reduce_sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=-1)
|
||||
return U.sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=-1)
|
||||
def entropy(self):
|
||||
a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keep_dims=True)
|
||||
a0 = self.logits - U.max(self.logits, axis=-1, keepdims=True)
|
||||
ea0 = tf.exp(a0)
|
||||
z0 = tf.reduce_sum(ea0, axis=-1, keep_dims=True)
|
||||
z0 = U.sum(ea0, axis=-1, keepdims=True)
|
||||
p0 = ea0 / z0
|
||||
return tf.reduce_sum(p0 * (tf.log(z0) - a0), axis=-1)
|
||||
return U.sum(p0 * (tf.log(z0) - a0), axis=-1)
|
||||
def sample(self):
|
||||
u = tf.random_uniform(tf.shape(self.logits))
|
||||
return tf.argmax(self.logits - tf.log(-tf.log(u)), axis=-1)
|
||||
@@ -155,21 +157,24 @@ class CategoricalPd(Pd):
|
||||
return cls(flat)
|
||||
|
||||
class MultiCategoricalPd(Pd):
|
||||
def __init__(self, nvec, flat):
|
||||
def __init__(self, low, high, flat):
|
||||
self.flat = flat
|
||||
self.categoricals = list(map(CategoricalPd, tf.split(flat, nvec, axis=-1)))
|
||||
self.low = tf.constant(low, dtype=tf.int32)
|
||||
self.categoricals = list(map(CategoricalPd, tf.split(flat, high - low + 1, axis=len(flat.get_shape()) - 1)))
|
||||
def flatparam(self):
|
||||
return self.flat
|
||||
def mode(self):
|
||||
return tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1), tf.int32)
|
||||
return self.low + tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1), tf.int32)
|
||||
def neglogp(self, x):
|
||||
return tf.add_n([p.neglogp(px) for p, px in zip(self.categoricals, tf.unstack(x, axis=-1))])
|
||||
return tf.add_n([p.neglogp(px) for p, px in zip(self.categoricals, tf.unstack(x - self.low, axis=len(x.get_shape()) - 1))])
|
||||
def kl(self, other):
|
||||
return tf.add_n([p.kl(q) for p, q in zip(self.categoricals, other.categoricals)])
|
||||
return tf.add_n([
|
||||
p.kl(q) for p, q in zip(self.categoricals, other.categoricals)
|
||||
])
|
||||
def entropy(self):
|
||||
return tf.add_n([p.entropy() for p in self.categoricals])
|
||||
def sample(self):
|
||||
return tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1), tf.int32)
|
||||
return self.low + tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1), tf.int32)
|
||||
@classmethod
|
||||
def fromflat(cls, flat):
|
||||
raise NotImplementedError
|
||||
@@ -186,14 +191,14 @@ class DiagGaussianPd(Pd):
|
||||
def mode(self):
|
||||
return self.mean
|
||||
def neglogp(self, x):
|
||||
return 0.5 * tf.reduce_sum(tf.square((x - self.mean) / self.std), axis=-1) \
|
||||
return 0.5 * U.sum(tf.square((x - self.mean) / self.std), axis=-1) \
|
||||
+ 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[-1]) \
|
||||
+ tf.reduce_sum(self.logstd, axis=-1)
|
||||
+ U.sum(self.logstd, axis=-1)
|
||||
def kl(self, other):
|
||||
assert isinstance(other, DiagGaussianPd)
|
||||
return tf.reduce_sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / (2.0 * tf.square(other.std)) - 0.5, axis=-1)
|
||||
return U.sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / (2.0 * tf.square(other.std)) - 0.5, axis=-1)
|
||||
def entropy(self):
|
||||
return tf.reduce_sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), axis=-1)
|
||||
return U.sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), axis=-1)
|
||||
def sample(self):
|
||||
return self.mean + self.std * tf.random_normal(tf.shape(self.mean))
|
||||
@classmethod
|
||||
@@ -209,11 +214,11 @@ class BernoulliPd(Pd):
|
||||
def mode(self):
|
||||
return tf.round(self.ps)
|
||||
def neglogp(self, x):
|
||||
return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=-1)
|
||||
return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=-1)
|
||||
def kl(self, other):
|
||||
return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=-1) - tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1)
|
||||
return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=-1) - U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1)
|
||||
def entropy(self):
|
||||
return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1)
|
||||
return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1)
|
||||
def sample(self):
|
||||
u = tf.random_uniform(tf.shape(self.ps))
|
||||
return tf.to_float(math_ops.less(u, self.ps))
|
||||
@@ -229,7 +234,7 @@ def make_pdtype(ac_space):
|
||||
elif isinstance(ac_space, spaces.Discrete):
|
||||
return CategoricalPdType(ac_space.n)
|
||||
elif isinstance(ac_space, spaces.MultiDiscrete):
|
||||
return MultiCategoricalPdType(ac_space.nvec)
|
||||
return MultiCategoricalPdType(ac_space.low, ac_space.high)
|
||||
elif isinstance(ac_space, spaces.MultiBinary):
|
||||
return BernoulliPdType(ac_space.n)
|
||||
else:
|
||||
@@ -254,11 +259,6 @@ def test_probtypes():
|
||||
categorical = CategoricalPdType(pdparam_categorical.size) #pylint: disable=E1101
|
||||
validate_probtype(categorical, pdparam_categorical)
|
||||
|
||||
nvec = [1,2,3]
|
||||
pdparam_multicategorical = np.array([-.2, .3, .5, .1, 1, -.1])
|
||||
multicategorical = MultiCategoricalPdType(nvec) #pylint: disable=E1101
|
||||
validate_probtype(multicategorical, pdparam_multicategorical)
|
||||
|
||||
pdparam_bernoulli = np.array([-.2, .3, .5])
|
||||
bernoulli = BernoulliPdType(pdparam_bernoulli.size) #pylint: disable=E1101
|
||||
validate_probtype(bernoulli, pdparam_bernoulli)
|
||||
@@ -270,10 +270,10 @@ def validate_probtype(probtype, pdparam):
|
||||
Mval = np.repeat(pdparam[None, :], N, axis=0)
|
||||
M = probtype.param_placeholder([N])
|
||||
X = probtype.sample_placeholder([N])
|
||||
pd = probtype.pdfromflat(M)
|
||||
pd = probtype.pdclass()(M)
|
||||
calcloglik = U.function([X, M], pd.logp(X))
|
||||
calcent = U.function([M], pd.entropy())
|
||||
Xval = tf.get_default_session().run(pd.sample(), feed_dict={M:Mval})
|
||||
Xval = U.eval(pd.sample(), feed_dict={M:Mval})
|
||||
logliks = calcloglik(Xval, Mval)
|
||||
entval_ll = - logliks.mean() #pylint: disable=E1101
|
||||
entval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101
|
||||
@@ -282,7 +282,7 @@ def validate_probtype(probtype, pdparam):
|
||||
|
||||
# Check to see if kldiv[p,q] = - ent[p] - E_p[log q]
|
||||
M2 = probtype.param_placeholder([N])
|
||||
pd2 = probtype.pdfromflat(M2)
|
||||
pd2 = probtype.pdclass()(M2)
|
||||
q = pdparam + np.random.randn(pdparam.size) * 0.1
|
||||
Mval2 = np.repeat(q[None, :], N, axis=0)
|
||||
calckl = U.function([M, M2], pd.kl(pd2))
|
||||
@@ -291,5 +291,3 @@ def validate_probtype(probtype, pdparam):
|
||||
klval_ll = - entval - logliks.mean() #pylint: disable=E1101
|
||||
klval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101
|
||||
assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr # within 3 sigmas
|
||||
print('ok on', probtype, pdparam)
|
||||
|
||||
|
@@ -224,7 +224,6 @@ def relatively_safe_pickle_dump(obj, path, compression=False):
# Using gzip here would be simpler, but the size is limited to 2GB
with tempfile.NamedTemporaryFile() as uncompressed_file:
pickle.dump(obj, uncompressed_file)
uncompressed_file.file.flush()
with zipfile.ZipFile(temp_storage, "w", compression=zipfile.ZIP_DEFLATED) as myzip:
myzip.write(uncompressed_file.name, "data")
else:
@@ -53,7 +53,7 @@ class MpiAdam(object):
def test_MpiAdam():
np.random.seed(0)
tf.set_random_seed(0)

a = tf.Variable(np.random.randn(3).astype('float32'))
b = tf.Variable(np.random.randn(2,5).astype('float32'))
loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b))
@@ -2,41 +2,29 @@ from mpi4py import MPI
|
||||
import numpy as np
|
||||
from baselines.common import zipsame
|
||||
|
||||
def mpi_mean(x, axis=0, comm=None, keepdims=False):
|
||||
x = np.asarray(x)
|
||||
assert x.ndim > 0
|
||||
if comm is None: comm = MPI.COMM_WORLD
|
||||
xsum = x.sum(axis=axis, keepdims=keepdims)
|
||||
n = xsum.size
|
||||
localsum = np.zeros(n+1, x.dtype)
|
||||
localsum[:n] = xsum.ravel()
|
||||
localsum[n] = x.shape[axis]
|
||||
globalsum = np.zeros_like(localsum)
|
||||
comm.Allreduce(localsum, globalsum, op=MPI.SUM)
|
||||
return globalsum[:n].reshape(xsum.shape) / globalsum[n], globalsum[n]
|
||||
|
||||
def mpi_moments(x, axis=0, comm=None, keepdims=False):
|
||||
x = np.asarray(x)
|
||||
assert x.ndim > 0
|
||||
mean, count = mpi_mean(x, axis=axis, comm=comm, keepdims=True)
|
||||
sqdiffs = np.square(x - mean)
|
||||
meansqdiff, count1 = mpi_mean(sqdiffs, axis=axis, comm=comm, keepdims=True)
|
||||
assert count1 == count
|
||||
std = np.sqrt(meansqdiff)
|
||||
if not keepdims:
|
||||
newshape = mean.shape[:axis] + mean.shape[axis+1:]
|
||||
mean = mean.reshape(newshape)
|
||||
std = std.reshape(newshape)
|
||||
def mpi_moments(x, axis=0):
|
||||
x = np.asarray(x, dtype='float64')
|
||||
newshape = list(x.shape)
|
||||
newshape.pop(axis)
|
||||
n = np.prod(newshape,dtype=int)
|
||||
totalvec = np.zeros(n*2+1, 'float64')
|
||||
addvec = np.concatenate([x.sum(axis=axis).ravel(),
|
||||
np.square(x).sum(axis=axis).ravel(),
|
||||
np.array([x.shape[axis]],dtype='float64')])
|
||||
MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM)
|
||||
sum = totalvec[:n]
|
||||
sumsq = totalvec[n:2*n]
|
||||
count = totalvec[2*n]
|
||||
if count == 0:
|
||||
mean = np.empty(newshape); mean[:] = np.nan
|
||||
std = np.empty(newshape); std[:] = np.nan
|
||||
else:
|
||||
mean = sum/count
|
||||
std = np.sqrt(np.maximum(sumsq/count - np.square(mean),0))
|
||||
return mean, std, count
|
||||
|
||||
|
||||
def test_runningmeanstd():
|
||||
import subprocess
|
||||
subprocess.check_call(['mpirun', '-np', '3',
|
||||
'python','-c',
|
||||
'from baselines.common.mpi_moments import _helper_runningmeanstd; _helper_runningmeanstd()'])
|
||||
|
||||
def _helper_runningmeanstd():
|
||||
comm = MPI.COMM_WORLD
|
||||
np.random.seed(0)
|
||||
for (triple,axis) in [
|
||||
@@ -57,3 +45,6 @@ def _helper_runningmeanstd():
|
||||
assert np.allclose(a1, a2)
|
||||
print("ok!")
|
||||
|
||||
if __name__ == "__main__":
|
||||
#mpirun -np 3 python <script>
|
||||
test_runningmeanstd()
|
@@ -57,7 +57,7 @@ def test_runningmeanstd():
|
||||
rms.update(x1)
|
||||
rms.update(x2)
|
||||
rms.update(x3)
|
||||
ms2 = [rms.mean.eval(), rms.std.eval()]
|
||||
ms2 = U.eval([rms.mean, rms.std])
|
||||
|
||||
assert np.allclose(ms1, ms2)
|
||||
|
||||
@@ -94,11 +94,11 @@ def test_dist():
|
||||
|
||||
assert checkallclose(
|
||||
bigvec.mean(axis=0),
|
||||
rms.mean.eval(),
|
||||
U.eval(rms.mean)
|
||||
)
|
||||
assert checkallclose(
|
||||
bigvec.std(axis=0),
|
||||
rms.std.eval(),
|
||||
U.eval(rms.std)
|
||||
)
|
||||
|
||||
|
||||
|
@@ -1,46 +0,0 @@
|
||||
import numpy as np
|
||||
class RunningMeanStd(object):
|
||||
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
|
||||
def __init__(self, epsilon=1e-4, shape=()):
|
||||
self.mean = np.zeros(shape, 'float64')
|
||||
self.var = np.ones(shape, 'float64')
|
||||
self.count = epsilon
|
||||
|
||||
def update(self, x):
|
||||
batch_mean = np.mean(x, axis=0)
|
||||
batch_var = np.var(x, axis=0)
|
||||
batch_count = x.shape[0]
|
||||
self.update_from_moments(batch_mean, batch_var, batch_count)
|
||||
|
||||
def update_from_moments(self, batch_mean, batch_var, batch_count):
|
||||
delta = batch_mean - self.mean
|
||||
tot_count = self.count + batch_count
|
||||
|
||||
new_mean = self.mean + delta * batch_count / tot_count
|
||||
m_a = self.var * (self.count)
|
||||
m_b = batch_var * (batch_count)
|
||||
M2 = m_a + m_b + np.square(delta) * self.count * batch_count / (self.count + batch_count)
|
||||
new_var = M2 / (self.count + batch_count)
|
||||
|
||||
new_count = batch_count + self.count
|
||||
|
||||
self.mean = new_mean
|
||||
self.var = new_var
|
||||
self.count = new_count
|
||||
|
||||
def test_runningmeanstd():
|
||||
for (x1, x2, x3) in [
|
||||
(np.random.randn(3), np.random.randn(4), np.random.randn(5)),
|
||||
(np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)),
|
||||
]:
|
||||
|
||||
rms = RunningMeanStd(epsilon=0.0, shape=x1.shape[1:])
|
||||
|
||||
x = np.concatenate([x1, x2, x3], axis=0)
|
||||
ms1 = [x.mean(axis=0), x.var(axis=0)]
|
||||
rms.update(x1)
|
||||
rms.update(x2)
|
||||
rms.update(x3)
|
||||
ms2 = [rms.mean, rms.var]
|
||||
|
||||
assert np.allclose(ms1, ms2)
|
@@ -3,39 +3,67 @@ import tensorflow as tf
|
||||
from baselines.common.tf_util import (
|
||||
function,
|
||||
initialize,
|
||||
set_value,
|
||||
single_threaded_session
|
||||
)
|
||||
|
||||
|
||||
def test_set_value():
|
||||
a = tf.Variable(42.)
|
||||
with single_threaded_session():
|
||||
set_value(a, 5)
|
||||
assert a.eval() == 5
|
||||
g = tf.get_default_graph()
|
||||
g.finalize()
|
||||
set_value(a, 6)
|
||||
assert a.eval() == 6
|
||||
|
||||
# test the test
|
||||
try:
|
||||
assert a.eval() == 7
|
||||
except AssertionError:
|
||||
pass
|
||||
else:
|
||||
assert False, "assertion should have failed"
|
||||
|
||||
|
||||
def test_function():
|
||||
with tf.Graph().as_default():
|
||||
x = tf.placeholder(tf.int32, (), name="x")
|
||||
y = tf.placeholder(tf.int32, (), name="y")
|
||||
z = 3 * x + 2 * y
|
||||
lin = function([x, y], z, givens={y: 0})
|
||||
tf.reset_default_graph()
|
||||
x = tf.placeholder(tf.int32, (), name="x")
|
||||
y = tf.placeholder(tf.int32, (), name="y")
|
||||
z = 3 * x + 2 * y
|
||||
lin = function([x, y], z, givens={y: 0})
|
||||
|
||||
with single_threaded_session():
|
||||
initialize()
|
||||
with single_threaded_session():
|
||||
initialize()
|
||||
|
||||
assert lin(2) == 6
|
||||
assert lin(2, 2) == 10
|
||||
assert lin(2) == 6
|
||||
assert lin(x=3) == 9
|
||||
assert lin(2, 2) == 10
|
||||
assert lin(x=2, y=3) == 12
|
||||
|
||||
|
||||
def test_multikwargs():
|
||||
with tf.Graph().as_default():
|
||||
x = tf.placeholder(tf.int32, (), name="x")
|
||||
with tf.variable_scope("other"):
|
||||
x2 = tf.placeholder(tf.int32, (), name="x")
|
||||
z = 3 * x + 2 * x2
|
||||
tf.reset_default_graph()
|
||||
x = tf.placeholder(tf.int32, (), name="x")
|
||||
with tf.variable_scope("other"):
|
||||
x2 = tf.placeholder(tf.int32, (), name="x")
|
||||
z = 3 * x + 2 * x2
|
||||
|
||||
lin = function([x, x2], z, givens={x2: 0})
|
||||
with single_threaded_session():
|
||||
initialize()
|
||||
assert lin(2) == 6
|
||||
assert lin(2, 2) == 10
|
||||
expt_caught = False
|
||||
lin = function([x, x2], z, givens={x2: 0})
|
||||
with single_threaded_session():
|
||||
initialize()
|
||||
assert lin(2) == 6
|
||||
assert lin(2, 2) == 10
|
||||
expt_caught = False
|
||||
try:
|
||||
lin(x=2)
|
||||
except AssertionError:
|
||||
expt_caught = True
|
||||
assert expt_caught
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
test_set_value()
|
||||
test_function()
|
||||
test_multikwargs()
|
||||
|
@@ -1,10 +1,45 @@
|
||||
import numpy as np
|
||||
import tensorflow as tf # pylint: ignore-module
|
||||
import builtins
|
||||
import functools
|
||||
import copy
|
||||
import os
|
||||
import functools
|
||||
import collections
|
||||
import multiprocessing
|
||||
|
||||
# ================================================================
|
||||
# Make consistent with numpy
|
||||
# ================================================================
|
||||
|
||||
clip = tf.clip_by_value
|
||||
|
||||
def sum(x, axis=None, keepdims=False):
|
||||
axis = None if axis is None else [axis]
|
||||
return tf.reduce_sum(x, axis=axis, keep_dims=keepdims)
|
||||
|
||||
def mean(x, axis=None, keepdims=False):
|
||||
axis = None if axis is None else [axis]
|
||||
return tf.reduce_mean(x, axis=axis, keep_dims=keepdims)
|
||||
|
||||
def var(x, axis=None, keepdims=False):
|
||||
meanx = mean(x, axis=axis, keepdims=keepdims)
|
||||
return mean(tf.square(x - meanx), axis=axis, keepdims=keepdims)
|
||||
|
||||
def std(x, axis=None, keepdims=False):
|
||||
return tf.sqrt(var(x, axis=axis, keepdims=keepdims))
|
||||
|
||||
def max(x, axis=None, keepdims=False):
|
||||
axis = None if axis is None else [axis]
|
||||
return tf.reduce_max(x, axis=axis, keep_dims=keepdims)
|
||||
|
||||
def min(x, axis=None, keepdims=False):
|
||||
axis = None if axis is None else [axis]
|
||||
return tf.reduce_min(x, axis=axis, keep_dims=keepdims)
|
||||
|
||||
def concatenate(arrs, axis=0):
|
||||
return tf.concat(axis=axis, values=arrs)
|
||||
|
||||
def argmax(x, axis=None):
|
||||
return tf.argmax(x, axis=axis)
|
||||
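A brief usage sketch of the numpy-style wrappers above; it assumes this file is importable as baselines.common.tf_util and uses the eval/single_threaded_session helpers defined further down:

import numpy as np
import tensorflow as tf
import baselines.common.tf_util as U

x_np = np.random.randn(4, 3).astype('float32')
x = tf.constant(x_np)
with U.single_threaded_session():
    # axis/keepdims behave like their numpy counterparts
    assert np.allclose(U.eval(U.mean(x, axis=0)), x_np.mean(axis=0), atol=1e-5)
    assert np.allclose(U.eval(U.std(x, axis=1, keepdims=True)),
                       x_np.std(axis=1, keepdims=True), atol=1e-5)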
|
||||
def switch(condition, then_expression, else_expression):
|
||||
"""Switches between two operations depending on a scalar value (int or bool).
|
||||
@@ -27,11 +62,105 @@ def switch(condition, then_expression, else_expression):
|
||||
# Extras
|
||||
# ================================================================
|
||||
|
||||
def l2loss(params):
|
||||
if len(params) == 0:
|
||||
return tf.constant(0.0)
|
||||
else:
|
||||
return tf.add_n([sum(tf.square(p)) for p in params])
|
||||
|
||||
def lrelu(x, leak=0.2):
|
||||
f1 = 0.5 * (1 + leak)
|
||||
f2 = 0.5 * (1 - leak)
|
||||
return f1 * x + f2 * abs(x)
|
||||
|
||||
def categorical_sample_logits(X):
|
||||
# https://github.com/tensorflow/tensorflow/issues/456
|
||||
U = tf.random_uniform(tf.shape(X))
|
||||
return argmax(X - tf.log(-tf.log(U)), axis=1)
|
||||
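categorical_sample_logits relies on the Gumbel-max trick: adding -log(-log(U)) noise to the logits and taking the argmax draws a sample from softmax(logits). A numpy sketch of the same idea (illustrative only):

import numpy as np

np.random.seed(0)
logits = np.array([1.0, 2.0, 0.5])
probs = np.exp(logits) / np.exp(logits).sum()
u = np.random.uniform(size=(100000, logits.size))
samples = np.argmax(logits - np.log(-np.log(u)), axis=1)            # Gumbel-max sampling
freqs = np.bincount(samples, minlength=logits.size) / samples.size
assert np.allclose(freqs, probs, atol=0.01)                         # frequencies match softmax probs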
|
||||
# ================================================================
|
||||
# Inputs
|
||||
# ================================================================
|
||||
|
||||
def is_placeholder(x):
|
||||
return type(x) is tf.Tensor and len(x.op.inputs) == 0
|
||||
|
||||
class TfInput(object):
|
||||
def __init__(self, name="(unnamed)"):
|
||||
"""Generalized Tensorflow placeholder. The main differences are:
|
||||
- possibly uses multiple placeholders internally and returns multiple values
|
||||
- can apply light postprocessing to the value fed to the placeholder.
|
||||
"""
|
||||
self.name = name
|
||||
|
||||
def get(self):
|
||||
"""Return the tf variable(s) representing the possibly postprocessed value
|
||||
of placeholder(s).
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def make_feed_dict(self, data):
|
||||
"""Given data input it to the placeholder(s)."""
|
||||
raise NotImplementedError()
|
||||
|
||||
class PlacholderTfInput(TfInput):
|
||||
def __init__(self, placeholder):
|
||||
"""Wrapper for regular tensorflow placeholder."""
|
||||
super().__init__(placeholder.name)
|
||||
self._placeholder = placeholder
|
||||
|
||||
def get(self):
|
||||
return self._placeholder
|
||||
|
||||
def make_feed_dict(self, data):
|
||||
return {self._placeholder: data}
|
||||
|
||||
class BatchInput(PlacholderTfInput):
|
||||
def __init__(self, shape, dtype=tf.float32, name=None):
|
||||
"""Creates a placeholder for a batch of tensors of a given shape and dtype
|
||||
|
||||
Parameters
|
||||
----------
|
||||
shape: [int]
|
||||
shape of a single element of the batch
|
||||
dtype: tf.dtype
|
||||
number representation used for tensor contents
|
||||
name: str
|
||||
name of the underlying placeholder
|
||||
"""
|
||||
super().__init__(tf.placeholder(dtype, [None] + list(shape), name=name))
|
||||
|
||||
class Uint8Input(PlacholderTfInput):
|
||||
def __init__(self, shape, name=None):
|
||||
"""Takes input in uint8 format which is cast to float32 and divided by 255
|
||||
before passing it to the model.
|
||||
|
||||
On GPU this ensures lower data transfer times.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
shape: [int]
|
||||
shape of the tensor.
|
||||
name: str
|
||||
name of the underlying placeholder
|
||||
"""
|
||||
|
||||
super().__init__(tf.placeholder(tf.uint8, [None] + list(shape), name=name))
|
||||
self._shape = shape
|
||||
self._output = tf.cast(super().get(), tf.float32) / 255.0
|
||||
|
||||
def get(self):
|
||||
return self._output
|
||||
|
||||
def ensure_tf_input(thing):
|
||||
"""Takes either tf.placeholder of TfInput and outputs equivalent TfInput"""
|
||||
if isinstance(thing, TfInput):
|
||||
return thing
|
||||
elif is_placeholder(thing):
|
||||
return PlacholderTfInput(thing)
|
||||
else:
|
||||
raise ValueError("Must be a placeholder or TfInput")
|
||||
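A hedged usage sketch of the input wrappers above (again assuming the baselines.common.tf_util module path): Uint8Input keeps frames in uint8 across the feed_dict and hands the model float32 values scaled to [0, 1].

import numpy as np
import tensorflow as tf
import baselines.common.tf_util as U

obs_input = U.Uint8Input([84, 84, 4], name="obs")
frames = np.zeros((2, 84, 84, 4), dtype=np.uint8)         # a fake batch of stacked frames
with U.single_threaded_session():
    scaled = tf.get_default_session().run(obs_input.get(),
                                          feed_dict=obs_input.make_feed_dict(frames))
assert scaled.dtype == np.float32 and scaled.max() <= 1.0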
|
||||
# ================================================================
|
||||
# Mathematical utils
|
||||
# ================================================================
|
||||
@@ -44,42 +173,78 @@ def huber_loss(x, delta=1.0):
|
||||
delta * (tf.abs(x) - 0.5 * delta)
|
||||
)
|
||||
|
||||
# ================================================================
|
||||
# Optimizer utils
|
||||
# ================================================================
|
||||
|
||||
def minimize_and_clip(optimizer, objective, var_list, clip_val=10):
|
||||
"""Minimized `objective` using `optimizer` w.r.t. variables in
|
||||
`var_list` while ensuring the norm of the gradients for each
|
||||
variable is clipped to `clip_val`
|
||||
"""
|
||||
gradients = optimizer.compute_gradients(objective, var_list=var_list)
|
||||
for i, (grad, var) in enumerate(gradients):
|
||||
if grad is not None:
|
||||
gradients[i] = (tf.clip_by_norm(grad, clip_val), var)
|
||||
return optimizer.apply_gradients(gradients)
|
||||
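Usage sketch for minimize_and_clip, with an illustrative loss and optimizer:

import tensorflow as tf
import baselines.common.tf_util as U

w = tf.Variable(tf.ones([10]))
loss = tf.reduce_sum(tf.square(w))
# per-variable gradient norms are clipped to clip_val before the update is applied
train_op = U.minimize_and_clip(tf.train.AdamOptimizer(1e-3), loss,
                               var_list=[w], clip_val=10)
with U.single_threaded_session():
    U.initialize()
    tf.get_default_session().run(train_op)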
|
||||
# ================================================================
|
||||
# Global session
|
||||
# ================================================================
|
||||
|
||||
def make_session(num_cpu=None, make_default=False):
|
||||
def get_session():
|
||||
"""Returns recently made Tensorflow session"""
|
||||
return tf.get_default_session()
|
||||
|
||||
def make_session(num_cpu):
|
||||
"""Returns a session that will use <num_cpu> CPU's only"""
|
||||
if num_cpu is None:
|
||||
num_cpu = int(os.getenv('RCALL_NUM_CPU', multiprocessing.cpu_count()))
|
||||
tf_config = tf.ConfigProto(
|
||||
inter_op_parallelism_threads=num_cpu,
|
||||
intra_op_parallelism_threads=num_cpu)
|
||||
tf_config.gpu_options.allocator_type = 'BFC'
|
||||
if make_default:
|
||||
return tf.InteractiveSession(config=tf_config)
|
||||
else:
|
||||
return tf.Session(config=tf_config)
|
||||
return tf.Session(config=tf_config)
|
||||
|
||||
def single_threaded_session():
|
||||
"""Returns a session which will only use a single CPU"""
|
||||
return make_session(num_cpu=1)
|
||||
|
||||
def in_session(f):
|
||||
@functools.wraps(f)
|
||||
def newfunc(*args, **kwargs):
|
||||
with tf.Session():
|
||||
f(*args, **kwargs)
|
||||
return newfunc
|
||||
return make_session(1)
|
||||
|
||||
ALREADY_INITIALIZED = set()
|
||||
|
||||
def initialize():
|
||||
"""Initialize all the uninitialized variables in the global scope."""
|
||||
new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED
|
||||
tf.get_default_session().run(tf.variables_initializer(new_variables))
|
||||
get_session().run(tf.variables_initializer(new_variables))
|
||||
ALREADY_INITIALIZED.update(new_variables)
|
||||
|
||||
def eval(expr, feed_dict=None):
|
||||
if feed_dict is None:
|
||||
feed_dict = {}
|
||||
return get_session().run(expr, feed_dict=feed_dict)
|
||||
|
||||
VALUE_SETTERS = collections.OrderedDict()
|
||||
|
||||
def set_value(v, val):
|
||||
global VALUE_SETTERS
|
||||
if v in VALUE_SETTERS:
|
||||
set_op, set_endpoint = VALUE_SETTERS[v]
|
||||
else:
|
||||
set_endpoint = tf.placeholder(v.dtype)
|
||||
set_op = v.assign(set_endpoint)
|
||||
VALUE_SETTERS[v] = (set_op, set_endpoint)
|
||||
get_session().run(set_op, feed_dict={set_endpoint: val})
|
||||
|
||||
# ================================================================
|
||||
# Saving variables
|
||||
# ================================================================
|
||||
|
||||
def load_state(fname):
|
||||
saver = tf.train.Saver()
|
||||
saver.restore(get_session(), fname)
|
||||
|
||||
def save_state(fname):
|
||||
os.makedirs(os.path.dirname(fname), exist_ok=True)
|
||||
saver = tf.train.Saver()
|
||||
saver.save(get_session(), fname)
|
||||
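Usage sketch for the checkpoint helpers above; the path is illustrative only:

import tensorflow as tf
import baselines.common.tf_util as U

v = tf.Variable(3.0, name="v")
with U.single_threaded_session():
    U.initialize()
    U.save_state("/tmp/tf_util_demo/model")   # writes checkpoint files next to this prefix
    U.load_state("/tmp/tf_util_demo/model")   # restores every saved variable into the session
    assert v.eval() == 3.0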
|
||||
# ================================================================
|
||||
# Model components
|
||||
# ================================================================
|
||||
@@ -120,6 +285,36 @@ def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME",
|
||||
|
||||
return tf.nn.conv2d(x, w, stride_shape, pad) + b
|
||||
|
||||
def dense(x, size, name, weight_init=None, bias=True):
|
||||
w = tf.get_variable(name + "/w", [x.get_shape()[1], size], initializer=weight_init)
|
||||
ret = tf.matmul(x, w)
|
||||
if bias:
|
||||
b = tf.get_variable(name + "/b", [size], initializer=tf.zeros_initializer())
|
||||
return ret + b
|
||||
else:
|
||||
return ret
|
||||
|
||||
def wndense(x, size, name, init_scale=1.0):
|
||||
v = tf.get_variable(name + "/V", [int(x.get_shape()[1]), size],
|
||||
initializer=tf.random_normal_initializer(0, 0.05))
|
||||
g = tf.get_variable(name + "/g", [size], initializer=tf.constant_initializer(init_scale))
|
||||
b = tf.get_variable(name + "/b", [size], initializer=tf.constant_initializer(0.0))
|
||||
|
||||
# use weight normalization (Salimans & Kingma, 2016)
|
||||
x = tf.matmul(x, v)
|
||||
scaler = g / tf.sqrt(sum(tf.square(v), axis=0, keepdims=True))
|
||||
return tf.reshape(scaler, [1, size]) * x + tf.reshape(b, [1, size])
|
||||
|
||||
def densenobias(x, size, name, weight_init=None):
|
||||
return dense(x, size, name, weight_init=weight_init, bias=False)
|
||||
|
||||
def dropout(x, pkeep, phase=None, mask=None):
|
||||
mask = tf.floor(pkeep + tf.random_uniform(tf.shape(x))) if mask is None else mask
|
||||
if phase is None:
|
||||
return mask * x
|
||||
else:
|
||||
return switch(phase, mask * x, pkeep * x)
|
||||
|
||||
# ================================================================
|
||||
# Theano-like Function
|
||||
# ================================================================
|
||||
@@ -149,7 +344,7 @@ def function(inputs, outputs, updates=None, givens=None):
|
||||
|
||||
Parameters
|
||||
----------
|
||||
inputs: [tf.placeholder, tf.constant, or object with make_feed_dict method]
|
||||
inputs: [tf.placeholder or TfInput]
|
||||
list of input arguments
|
||||
outputs: [tf.Variable] or tf.Variable
|
||||
list of outputs or a single output to be returned from function. Returned
|
||||
@@ -164,36 +359,183 @@ def function(inputs, outputs, updates=None, givens=None):
|
||||
f = _Function(inputs, [outputs], updates, givens=givens)
|
||||
return lambda *args, **kwargs: f(*args, **kwargs)[0]
|
||||
|
||||
|
||||
class _Function(object):
|
||||
def __init__(self, inputs, outputs, updates, givens):
|
||||
def __init__(self, inputs, outputs, updates, givens, check_nan=False):
|
||||
for inpt in inputs:
|
||||
if not hasattr(inpt, 'make_feed_dict') and not (type(inpt) is tf.Tensor and len(inpt.op.inputs) == 0):
|
||||
assert False, "inputs should all be placeholders, constants, or have a make_feed_dict method"
|
||||
if not issubclass(type(inpt), TfInput):
|
||||
assert len(inpt.op.inputs) == 0, "inputs should all be placeholders or instances of baselines.common.TfInput"
|
||||
self.inputs = inputs
|
||||
updates = updates or []
|
||||
self.update_group = tf.group(*updates)
|
||||
self.outputs_update = list(outputs) + [self.update_group]
|
||||
self.givens = {} if givens is None else givens
|
||||
self.check_nan = check_nan
|
||||
|
||||
def _feed_input(self, feed_dict, inpt, value):
|
||||
if hasattr(inpt, 'make_feed_dict'):
|
||||
if issubclass(type(inpt), TfInput):
|
||||
feed_dict.update(inpt.make_feed_dict(value))
|
||||
else:
|
||||
elif is_placeholder(inpt):
|
||||
feed_dict[inpt] = value
|
||||
|
||||
def __call__(self, *args):
|
||||
def __call__(self, *args, **kwargs):
|
||||
assert len(args) <= len(self.inputs), "Too many arguments provided"
|
||||
feed_dict = {}
|
||||
# Update the args
|
||||
for inpt, value in zip(self.inputs, args):
|
||||
self._feed_input(feed_dict, inpt, value)
|
||||
# Update the kwargs
|
||||
kwargs_passed_inpt_names = set()
|
||||
for inpt in self.inputs[len(args):]:
|
||||
inpt_name = inpt.name.split(':')[0]
|
||||
inpt_name = inpt_name.split('/')[-1]
|
||||
assert inpt_name not in kwargs_passed_inpt_names, \
|
||||
"this function has two arguments with the same name \"{}\", so kwargs cannot be used.".format(inpt_name)
|
||||
if inpt_name in kwargs:
|
||||
kwargs_passed_inpt_names.add(inpt_name)
|
||||
self._feed_input(feed_dict, inpt, kwargs.pop(inpt_name))
|
||||
else:
|
||||
assert inpt in self.givens, "Missing argument " + inpt_name
|
||||
assert len(kwargs) == 0, "Function got extra arguments " + str(list(kwargs.keys()))
|
||||
# Update feed dict with givens.
|
||||
for inpt in self.givens:
|
||||
feed_dict[inpt] = feed_dict.get(inpt, self.givens[inpt])
|
||||
results = tf.get_default_session().run(self.outputs_update, feed_dict=feed_dict)[:-1]
|
||||
results = get_session().run(self.outputs_update, feed_dict=feed_dict)[:-1]
|
||||
if self.check_nan:
|
||||
if any(np.isnan(r).any() for r in results):
|
||||
raise RuntimeError("Nan detected")
|
||||
return results
|
||||
|
||||
def mem_friendly_function(nondata_inputs, data_inputs, outputs, batch_size):
|
||||
if isinstance(outputs, list):
|
||||
return _MemFriendlyFunction(nondata_inputs, data_inputs, outputs, batch_size)
|
||||
else:
|
||||
f = _MemFriendlyFunction(nondata_inputs, data_inputs, [outputs], batch_size)
|
||||
return lambda *inputs: f(*inputs)[0]
|
||||
|
||||
class _MemFriendlyFunction(object):
|
||||
def __init__(self, nondata_inputs, data_inputs, outputs, batch_size):
|
||||
self.nondata_inputs = nondata_inputs
|
||||
self.data_inputs = data_inputs
|
||||
self.outputs = list(outputs)
|
||||
self.batch_size = batch_size
|
||||
|
||||
def __call__(self, *inputvals):
|
||||
assert len(inputvals) == len(self.nondata_inputs) + len(self.data_inputs)
|
||||
nondata_vals = inputvals[0:len(self.nondata_inputs)]
|
||||
data_vals = inputvals[len(self.nondata_inputs):]
|
||||
feed_dict = dict(zip(self.nondata_inputs, nondata_vals))
|
||||
n = data_vals[0].shape[0]
|
||||
for v in data_vals[1:]:
|
||||
assert v.shape[0] == n
|
||||
for i_start in range(0, n, self.batch_size):
|
||||
slice_vals = [v[i_start:builtins.min(i_start + self.batch_size, n)] for v in data_vals]
|
||||
for (var, val) in zip(self.data_inputs, slice_vals):
|
||||
feed_dict[var] = val
|
||||
results = tf.get_default_session().run(self.outputs, feed_dict=feed_dict)
|
||||
if i_start == 0:
|
||||
sum_results = results
|
||||
else:
|
||||
for i in range(len(results)):
|
||||
sum_results[i] = sum_results[i] + results[i]
|
||||
for i in range(len(results)):
|
||||
sum_results[i] = sum_results[i] / n
|
||||
return sum_results
|
||||
|
||||
# ================================================================
|
||||
# Modules
|
||||
# ================================================================
|
||||
|
||||
class Module(object):
|
||||
def __init__(self, name):
|
||||
self.name = name
|
||||
self.first_time = True
|
||||
self.scope = None
|
||||
self.cache = {}
|
||||
|
||||
def __call__(self, *args):
|
||||
if args in self.cache:
|
||||
print("(%s) retrieving value from cache" % (self.name,))
|
||||
return self.cache[args]
|
||||
with tf.variable_scope(self.name, reuse=not self.first_time):
|
||||
scope = tf.get_variable_scope().name
|
||||
if self.first_time:
|
||||
self.scope = scope
|
||||
print("(%s) running function for the first time" % (self.name,))
|
||||
else:
|
||||
assert self.scope == scope, "Tried calling function with a different scope"
|
||||
print("(%s) running function on new inputs" % (self.name,))
|
||||
self.first_time = False
|
||||
out = self._call(*args)
|
||||
self.cache[args] = out
|
||||
return out
|
||||
|
||||
def _call(self, *args):
|
||||
raise NotImplementedError
|
||||
|
||||
@property
|
||||
def trainable_variables(self):
|
||||
assert self.scope is not None, "need to call module once before getting variables"
|
||||
return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
|
||||
|
||||
@property
|
||||
def variables(self):
|
||||
assert self.scope is not None, "need to call module once before getting variables"
|
||||
return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)
|
||||
|
||||
def module(name):
|
||||
@functools.wraps
|
||||
def wrapper(f):
|
||||
class WrapperModule(Module):
|
||||
def _call(self, *args):
|
||||
return f(*args)
|
||||
return WrapperModule(name)
|
||||
return wrapper
|
||||
|
||||
# ================================================================
|
||||
# Graph traversal
|
||||
# ================================================================
|
||||
|
||||
VARIABLES = {}
|
||||
|
||||
def get_parents(node):
|
||||
return node.op.inputs
|
||||
|
||||
def topsorted(outputs):
|
||||
"""
|
||||
Topological sort via non-recursive depth-first search
|
||||
"""
|
||||
assert isinstance(outputs, (list, tuple))
|
||||
marks = {}
|
||||
out = []
|
||||
stack = [] # pylint: disable=W0621
|
||||
# i: node
|
||||
# jidx = number of parents visited so far from that node
|
||||
# marks: state of each node, which is one of
|
||||
# 0: haven't visited
|
||||
# 1: have visited, but not done visiting parents
|
||||
# 2: done visiting parents
|
||||
for x in outputs:
|
||||
stack.append((x, 0))
|
||||
while stack:
|
||||
(i, jidx) = stack.pop()
|
||||
if jidx == 0:
|
||||
m = marks.get(i, 0)
|
||||
if m == 0:
|
||||
marks[i] = 1
|
||||
elif m == 1:
|
||||
raise ValueError("not a dag")
|
||||
else:
|
||||
continue
|
||||
ps = get_parents(i)
|
||||
if jidx == len(ps):
|
||||
marks[i] = 2
|
||||
out.append(i)
|
||||
else:
|
||||
stack.append((i, jidx + 1))
|
||||
j = ps[jidx]
|
||||
stack.append((j, 0))
|
||||
return out
|
||||
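A small sketch of topsorted on a toy graph: every tensor must come after all of its inputs. Positions are compared by object id to avoid relying on Tensor equality semantics.

import tensorflow as tf
import baselines.common.tf_util as U

a = tf.constant(1.0)
b = tf.constant(2.0)
c = a + b        # depends on a and b
d = c * a        # depends on c and a
order = U.topsorted([d])
pos = {id(t): i for i, t in enumerate(order)}
assert pos[id(a)] < pos[id(c)] < pos[id(d)]
assert pos[id(b)] < pos[id(c)]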
|
||||
# ================================================================
|
||||
# Flat vectors
|
||||
# ================================================================
|
||||
@@ -235,14 +577,88 @@ class SetFromFlat(object):
|
||||
self.op = tf.group(*assigns)
|
||||
|
||||
def __call__(self, theta):
|
||||
tf.get_default_session().run(self.op, feed_dict={self.theta: theta})
|
||||
get_session().run(self.op, feed_dict={self.theta: theta})
|
||||
|
||||
class GetFlat(object):
|
||||
def __init__(self, var_list):
|
||||
self.op = tf.concat(axis=0, values=[tf.reshape(v, [numel(v)]) for v in var_list])
|
||||
|
||||
def __call__(self):
|
||||
return tf.get_default_session().run(self.op)
|
||||
return get_session().run(self.op)
|
||||
|
||||
# ================================================================
|
||||
# Misc
|
||||
# ================================================================
|
||||
|
||||
def fancy_slice_2d(X, inds0, inds1):
|
||||
"""
|
||||
like numpy X[inds0, inds1]
|
||||
XXX this implementation is bad
|
||||
"""
|
||||
inds0 = tf.cast(inds0, tf.int64)
|
||||
inds1 = tf.cast(inds1, tf.int64)
|
||||
shape = tf.cast(tf.shape(X), tf.int64)
|
||||
ncols = shape[1]
|
||||
Xflat = tf.reshape(X, [-1])
|
||||
return tf.gather(Xflat, inds0 * ncols + inds1)
|
||||
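fancy_slice_2d mirrors numpy's X[inds0, inds1] (one element per index pair) by flattening and gathering; a quick equivalence check:

import numpy as np
import tensorflow as tf
import baselines.common.tf_util as U

X_np = np.arange(12).reshape(3, 4).astype('float32')
inds0, inds1 = [0, 2, 1], [3, 0, 2]
with U.single_threaded_session():
    out = U.eval(U.fancy_slice_2d(tf.constant(X_np), inds0, inds1))
assert np.allclose(out, X_np[inds0, inds1])   # -> [3., 8., 6.]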
|
||||
# ================================================================
|
||||
# Scopes
|
||||
# ================================================================
|
||||
|
||||
def scope_vars(scope, trainable_only=False):
|
||||
"""
|
||||
Get variables inside a scope
|
||||
The scope can be specified as a string
|
||||
|
||||
Parameters
|
||||
----------
|
||||
scope: str or VariableScope
|
||||
scope in which the variables reside.
|
||||
trainable_only: bool
|
||||
whether or not to return only the variables that were marked as trainable.
|
||||
|
||||
Returns
|
||||
-------
|
||||
vars: [tf.Variable]
|
||||
list of variables in `scope`.
|
||||
"""
|
||||
return tf.get_collection(
|
||||
tf.GraphKeys.TRAINABLE_VARIABLES if trainable_only else tf.GraphKeys.GLOBAL_VARIABLES,
|
||||
scope=scope if isinstance(scope, str) else scope.name
|
||||
)
|
||||
|
||||
def scope_name():
|
||||
"""Returns the name of current scope as a string, e.g. deepq/q_func"""
|
||||
return tf.get_variable_scope().name
|
||||
|
||||
def absolute_scope_name(relative_scope_name):
|
||||
"""Appends parent scope name to `relative_scope_name`"""
|
||||
return scope_name() + "/" + relative_scope_name
|
||||
|
||||
def lengths_to_mask(lengths_b, max_length):
|
||||
"""
|
||||
Turns a vector of lengths into a boolean mask
|
||||
|
||||
Args:
|
||||
lengths_b: an integer vector of lengths
|
||||
max_length: maximum length to fill the mask
|
||||
|
||||
Returns:
|
||||
a boolean array of shape (batch_size, max_length)
|
||||
row[i] consists of True repeated lengths_b[i] times, followed by False
|
||||
"""
|
||||
lengths_b = tf.convert_to_tensor(lengths_b)
|
||||
assert lengths_b.get_shape().ndims == 1
|
||||
mask_bt = tf.expand_dims(tf.range(max_length), 0) < tf.expand_dims(lengths_b, 1)
|
||||
return mask_bt
|
||||
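For concreteness, the mask produced for lengths [1, 3] with max_length 4:

import numpy as np
import tensorflow as tf
import baselines.common.tf_util as U

with U.single_threaded_session():
    mask = U.eval(U.lengths_to_mask([1, 3], 4))
assert np.array_equal(mask, [[True, False, False, False],
                             [True, True,  True,  False]])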
|
||||
def in_session(f):
|
||||
@functools.wraps(f)
|
||||
def newfunc(*args, **kwargs):
|
||||
with tf.Session():
|
||||
f(*args, **kwargs)
|
||||
return newfunc
|
||||
|
||||
_PLACEHOLDER_CACHE = {} # name -> (placeholder, dtype, shape)
|
||||
|
||||
@@ -262,19 +678,9 @@ def get_placeholder_cached(name):
|
||||
def flattenallbut0(x):
|
||||
return tf.reshape(x, [-1, intprod(x.get_shape().as_list()[1:])])
|
||||
|
||||
|
||||
# ================================================================
|
||||
# Diagnostics
|
||||
# ================================================================
|
||||
|
||||
def display_var_info(vars):
|
||||
from baselines import logger
|
||||
count_params = 0
|
||||
for v in vars:
|
||||
name = v.name
|
||||
if "/Adam" in name or "beta1_power" in name or "beta2_power" in name: continue
|
||||
count_params += np.prod(v.shape.as_list())
|
||||
if "/b:" in name: continue # Wx+b, bias is not interesting to look at => count params, but not print
|
||||
logger.info(" %s%s%s" % (name, " "*(55-len(name)), str(v.shape)))
|
||||
logger.info("Total model parameters: %0.1f million" % (count_params*1e-6))
|
||||
|
||||
def reset():
|
||||
global _PLACEHOLDER_CACHE
|
||||
global VARIABLES
|
||||
_PLACEHOLDER_CACHE = {}
|
||||
VARIABLES = {}
|
||||
tf.reset_default_graph()
|
||||
|
@@ -1,119 +1,19 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from baselines import logger
|
||||
class VecEnv(object):
|
||||
"""
|
||||
Vectorized environment base class
|
||||
"""
|
||||
def step(self, vac):
|
||||
"""
|
||||
Apply sequence of actions to sequence of environments
|
||||
actions -> (observations, rewards, news)
|
||||
|
||||
class AlreadySteppingError(Exception):
|
||||
"""
|
||||
Raised when an asynchronous step is running while
|
||||
step_async() is called again.
|
||||
"""
|
||||
def __init__(self):
|
||||
msg = 'already running an async step'
|
||||
Exception.__init__(self, msg)
|
||||
|
||||
class NotSteppingError(Exception):
|
||||
"""
|
||||
Raised when an asynchronous step is not running but
|
||||
step_wait() is called.
|
||||
"""
|
||||
def __init__(self):
|
||||
msg = 'not running an async step'
|
||||
Exception.__init__(self, msg)
|
||||
|
||||
class VecEnv(ABC):
|
||||
"""
|
||||
An abstract asynchronous, vectorized environment.
|
||||
"""
|
||||
def __init__(self, num_envs, observation_space, action_space):
|
||||
self.num_envs = num_envs
|
||||
self.observation_space = observation_space
|
||||
self.action_space = action_space
|
||||
|
||||
@abstractmethod
|
||||
where 'news' is a boolean vector indicating whether each element is new.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
def reset(self):
|
||||
"""
|
||||
Reset all the environments and return an array of
|
||||
observations, or a tuple of observation arrays.
|
||||
|
||||
If step_async is still doing work, that work will
|
||||
be cancelled and step_wait() should not be called
|
||||
until step_async() is invoked again.
|
||||
Reset all environments
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def step_async(self, actions):
|
||||
"""
|
||||
Tell all the environments to start taking a step
|
||||
with the given actions.
|
||||
Call step_wait() to get the results of the step.
|
||||
|
||||
You should not call this if a step_async run is
|
||||
already pending.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def step_wait(self):
|
||||
"""
|
||||
Wait for the step taken with step_async().
|
||||
|
||||
Returns (obs, rews, dones, infos):
|
||||
- obs: an array of observations, or a tuple of
|
||||
arrays of observations.
|
||||
- rews: an array of rewards
|
||||
- dones: an array of "episode done" booleans
|
||||
- infos: a sequence of info objects
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
raise NotImplementedError
|
||||
def close(self):
|
||||
"""
|
||||
Clean up the environments' resources.
|
||||
"""
|
||||
pass
|
||||
|
||||
def step(self, actions):
|
||||
self.step_async(actions)
|
||||
return self.step_wait()
|
||||
|
||||
def render(self):
|
||||
logger.warn('Render not defined for %s'%self)
|
||||
|
||||
class VecEnvWrapper(VecEnv):
|
||||
def __init__(self, venv, observation_space=None, action_space=None):
|
||||
self.venv = venv
|
||||
VecEnv.__init__(self,
|
||||
num_envs=venv.num_envs,
|
||||
observation_space=observation_space or venv.observation_space,
|
||||
action_space=action_space or venv.action_space)
|
||||
|
||||
def step_async(self, actions):
|
||||
self.venv.step_async(actions)
|
||||
|
||||
@abstractmethod
|
||||
def reset(self):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def step_wait(self):
|
||||
pass
|
||||
|
||||
def close(self):
|
||||
return self.venv.close()
|
||||
|
||||
def render(self):
|
||||
self.venv.render()
|
||||
|
||||
class CloudpickleWrapper(object):
|
||||
"""
|
||||
Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle)
|
||||
"""
|
||||
def __init__(self, x):
|
||||
self.x = x
|
||||
def __getstate__(self):
|
||||
import cloudpickle
|
||||
return cloudpickle.dumps(self.x)
|
||||
def __setstate__(self, ob):
|
||||
import pickle
|
||||
self.x = pickle.loads(ob)
|
||||
pass
|
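Why CloudpickleWrapper exists: multiprocessing serializes its arguments with the standard pickle module, which refuses closures and lambdas like the env_fns handed to SubprocVecEnv, while cloudpickle serializes them by value. A small sketch, assuming the cloudpickle package (imported lazily by the wrapper above) is installed:

import pickle
import cloudpickle

make_env = lambda: "pretend this builds a gym env"
try:
    pickle.dumps(make_env)                        # plain pickle refuses lambdas/closures
except (pickle.PicklingError, AttributeError, TypeError):
    pass
payload = cloudpickle.dumps(make_env)             # cloudpickle handles them
assert pickle.loads(payload)() == "pretend this builds a gym env"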
@@ -1,51 +0,0 @@
|
||||
import numpy as np
|
||||
import gym
|
||||
from . import VecEnv
|
||||
|
||||
class DummyVecEnv(VecEnv):
|
||||
def __init__(self, env_fns):
|
||||
self.envs = [fn() for fn in env_fns]
|
||||
env = self.envs[0]
|
||||
VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space)
|
||||
|
||||
obs_spaces = self.observation_space.spaces if isinstance(self.observation_space, gym.spaces.Tuple) else (self.observation_space,)
|
||||
self.buf_obs = [np.zeros((self.num_envs,) + tuple(s.shape), s.dtype) for s in obs_spaces]
|
||||
self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool)
|
||||
self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32)
|
||||
self.buf_infos = [{} for _ in range(self.num_envs)]
|
||||
self.actions = None
|
||||
|
||||
def step_async(self, actions):
|
||||
self.actions = actions
|
||||
|
||||
def step_wait(self):
|
||||
for i in range(self.num_envs):
|
||||
obs_tuple, self.buf_rews[i], self.buf_dones[i], self.buf_infos[i] = self.envs[i].step(self.actions[i])
|
||||
if self.buf_dones[i]:
|
||||
obs_tuple = self.envs[i].reset()
|
||||
if isinstance(obs_tuple, (tuple, list)):
|
||||
for t,x in enumerate(obs_tuple):
|
||||
self.buf_obs[t][i] = x
|
||||
else:
|
||||
self.buf_obs[0][i] = obs_tuple
|
||||
return (self._obs_from_buf(), np.copy(self.buf_rews), np.copy(self.buf_dones),
|
||||
self.buf_infos.copy())
|
||||
|
||||
def reset(self):
|
||||
for i in range(self.num_envs):
|
||||
obs_tuple = self.envs[i].reset()
|
||||
if isinstance(obs_tuple, (tuple, list)):
|
||||
for t,x in enumerate(obs_tuple):
|
||||
self.buf_obs[t][i] = x
|
||||
else:
|
||||
self.buf_obs[0][i] = obs_tuple
|
||||
return self._obs_from_buf()
|
||||
|
||||
def close(self):
|
||||
return
|
||||
|
||||
def _obs_from_buf(self):
|
||||
if len(self.buf_obs) == 1:
|
||||
return np.copy(self.buf_obs[0])
|
||||
else:
|
||||
return tuple(np.copy(x) for x in self.buf_obs)
|
@@ -1,6 +1,6 @@
|
||||
import numpy as np
|
||||
from multiprocessing import Process, Pipe
|
||||
from baselines.common.vec_env import VecEnv, CloudpickleWrapper
|
||||
from baselines.common.vec_env import VecEnv
|
||||
|
||||
|
||||
def worker(remote, parent_remote, env_fn_wrapper):
|
||||
@@ -23,17 +23,30 @@ def worker(remote, parent_remote, env_fn_wrapper):
|
||||
remote.close()
|
||||
break
|
||||
elif cmd == 'get_spaces':
|
||||
remote.send((env.observation_space, env.action_space))
|
||||
remote.send((env.action_space, env.observation_space))
|
||||
else:
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class CloudpickleWrapper(object):
|
||||
"""
|
||||
Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle)
|
||||
"""
|
||||
def __init__(self, x):
|
||||
self.x = x
|
||||
def __getstate__(self):
|
||||
import cloudpickle
|
||||
return cloudpickle.dumps(self.x)
|
||||
def __setstate__(self, ob):
|
||||
import pickle
|
||||
self.x = pickle.loads(ob)
|
||||
|
||||
|
||||
class SubprocVecEnv(VecEnv):
|
||||
def __init__(self, env_fns, spaces=None):
|
||||
def __init__(self, env_fns):
|
||||
"""
|
||||
env_fns: list of callables that create the gym environments to run in subprocesses
|
||||
"""
|
||||
self.waiting = False
|
||||
self.closed = False
|
||||
nenvs = len(env_fns)
|
||||
self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)])
|
||||
@@ -46,17 +59,13 @@ class SubprocVecEnv(VecEnv):
|
||||
remote.close()
|
||||
|
||||
self.remotes[0].send(('get_spaces', None))
|
||||
observation_space, action_space = self.remotes[0].recv()
|
||||
VecEnv.__init__(self, len(env_fns), observation_space, action_space)
|
||||
self.action_space, self.observation_space = self.remotes[0].recv()
|
||||
|
||||
def step_async(self, actions):
|
||||
|
||||
def step(self, actions):
|
||||
for remote, action in zip(self.remotes, actions):
|
||||
remote.send(('step', action))
|
||||
self.waiting = True
|
||||
|
||||
def step_wait(self):
|
||||
results = [remote.recv() for remote in self.remotes]
|
||||
self.waiting = False
|
||||
obs, rews, dones, infos = zip(*results)
|
||||
return np.stack(obs), np.stack(rews), np.stack(dones), infos
|
||||
|
||||
@@ -73,11 +82,13 @@ class SubprocVecEnv(VecEnv):
|
||||
def close(self):
|
||||
if self.closed:
|
||||
return
|
||||
if self.waiting:
|
||||
for remote in self.remotes:
|
||||
remote.recv()
|
||||
|
||||
for remote in self.remotes:
|
||||
remote.send(('close', None))
|
||||
for p in self.ps:
|
||||
p.join()
|
||||
self.closed = True
|
||||
|
||||
@property
|
||||
def num_envs(self):
|
||||
return len(self.remotes)
|
||||
|
@@ -1,38 +0,0 @@
|
||||
from baselines.common.vec_env import VecEnvWrapper
|
||||
import numpy as np
|
||||
from gym import spaces
|
||||
|
||||
class VecFrameStack(VecEnvWrapper):
|
||||
"""
|
||||
Frame-stacking wrapper for vectorized environments
|
||||
"""
|
||||
def __init__(self, venv, nstack):
|
||||
self.venv = venv
|
||||
self.nstack = nstack
|
||||
wos = venv.observation_space # wrapped ob space
|
||||
low = np.repeat(wos.low, self.nstack, axis=-1)
|
||||
high = np.repeat(wos.high, self.nstack, axis=-1)
|
||||
self.stackedobs = np.zeros((venv.num_envs,)+low.shape, low.dtype)
|
||||
observation_space = spaces.Box(low=low, high=high, dtype=venv.observation_space.dtype)
|
||||
VecEnvWrapper.__init__(self, venv, observation_space=observation_space)
|
||||
|
||||
def step_wait(self):
|
||||
obs, rews, news, infos = self.venv.step_wait()
|
||||
self.stackedobs = np.roll(self.stackedobs, shift=-1, axis=-1)
|
||||
for (i, new) in enumerate(news):
|
||||
if new:
|
||||
self.stackedobs[i] = 0
|
||||
self.stackedobs[..., -obs.shape[-1]:] = obs
|
||||
return self.stackedobs, rews, news, infos
|
||||
|
||||
def reset(self):
|
||||
"""
|
||||
Reset all environments
|
||||
"""
|
||||
obs = self.venv.reset()
|
||||
self.stackedobs[...] = 0
|
||||
self.stackedobs[..., -obs.shape[-1]:] = obs
|
||||
return self.stackedobs
|
||||
|
||||
def close(self):
|
||||
self.venv.close()
|
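A numpy-only sketch of the frame-stacking bookkeeping above: with nstack=4, single-channel (84, 84, 1) observations become (84, 84, 4) observations holding the four most recent frames.

import numpy as np

nenvs, nstack = 2, 4
stackedobs = np.zeros((nenvs, 84, 84, 1 * nstack), dtype=np.float32)
for t in range(5):
    obs = np.full((nenvs, 84, 84, 1), float(t), dtype=np.float32)  # fake frame at step t
    stackedobs = np.roll(stackedobs, shift=-1, axis=-1)            # drop the oldest frame
    stackedobs[..., -obs.shape[-1]:] = obs                         # append the newest one
assert np.allclose(stackedobs[0, 0, 0], [1.0, 2.0, 3.0, 4.0])      # last four frames kept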
@@ -1,47 +0,0 @@
|
||||
from baselines.common.vec_env import VecEnvWrapper
|
||||
from baselines.common.running_mean_std import RunningMeanStd
|
||||
import numpy as np
|
||||
|
||||
class VecNormalize(VecEnvWrapper):
|
||||
"""
|
||||
Vectorized environment wrapper that normalizes observations and rewards using running statistics
|
||||
"""
|
||||
def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
|
||||
VecEnvWrapper.__init__(self, venv)
|
||||
self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
|
||||
self.ret_rms = RunningMeanStd(shape=()) if ret else None
|
||||
self.clipob = clipob
|
||||
self.cliprew = cliprew
|
||||
self.ret = np.zeros(self.num_envs)
|
||||
self.gamma = gamma
|
||||
self.epsilon = epsilon
|
||||
|
||||
def step_wait(self):
|
||||
"""
|
||||
Apply sequence of actions to sequence of environments
|
||||
actions -> (observations, rewards, news)
|
||||
|
||||
where 'news' is a boolean vector indicating whether each element is new.
|
||||
"""
|
||||
obs, rews, news, infos = self.venv.step_wait()
|
||||
self.ret = self.ret * self.gamma + rews
|
||||
obs = self._obfilt(obs)
|
||||
if self.ret_rms:
|
||||
self.ret_rms.update(self.ret)
|
||||
rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew)
|
||||
return obs, rews, news, infos
|
||||
|
||||
def _obfilt(self, obs):
|
||||
if self.ob_rms:
|
||||
self.ob_rms.update(obs)
|
||||
obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob)
|
||||
return obs
|
||||
else:
|
||||
return obs
|
||||
|
||||
def reset(self):
|
||||
"""
|
||||
Reset all environments
|
||||
"""
|
||||
obs = self.venv.reset()
|
||||
return self._obfilt(obs)
|
@@ -9,7 +9,8 @@ from baselines import logger
|
||||
from baselines.common.mpi_adam import MpiAdam
|
||||
import baselines.common.tf_util as U
|
||||
from baselines.common.mpi_running_mean_std import RunningMeanStd
|
||||
from mpi4py import MPI
|
||||
from baselines.ddpg.util import reduce_std, mpi_mean
|
||||
|
||||
|
||||
def normalize(x, stats):
|
||||
if stats is None:
|
||||
@@ -22,13 +23,6 @@ def denormalize(x, stats):
|
||||
return x
|
||||
return x * stats.std + stats.mean
|
||||
|
||||
def reduce_std(x, axis=None, keepdims=False):
|
||||
return tf.sqrt(reduce_var(x, axis=axis, keepdims=keepdims))
|
||||
|
||||
def reduce_var(x, axis=None, keepdims=False):
|
||||
m = tf.reduce_mean(x, axis=axis, keep_dims=True)
|
||||
devs_squared = tf.square(x - m)
|
||||
return tf.reduce_mean(devs_squared, axis=axis, keep_dims=keepdims)
|
||||
|
||||
def get_target_updates(vars, target_vars, tau):
|
||||
logger.info('setting up target updates ...')
|
||||
@@ -204,7 +198,7 @@ class DDPG(object):
|
||||
new_std = self.ret_rms.std
|
||||
self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
|
||||
new_mean = self.ret_rms.mean
|
||||
|
||||
|
||||
self.renormalize_Q_outputs_op = []
|
||||
for vs in [self.critic.output_vars, self.target_critic.output_vars]:
|
||||
assert len(vs) == 2
|
||||
@@ -219,15 +213,15 @@ class DDPG(object):
|
||||
def setup_stats(self):
|
||||
ops = []
|
||||
names = []
|
||||
|
||||
|
||||
if self.normalize_returns:
|
||||
ops += [self.ret_rms.mean, self.ret_rms.std]
|
||||
names += ['ret_rms_mean', 'ret_rms_std']
|
||||
|
||||
|
||||
if self.normalize_observations:
|
||||
ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)]
|
||||
names += ['obs_rms_mean', 'obs_rms_std']
|
||||
|
||||
|
||||
ops += [tf.reduce_mean(self.critic_tf)]
|
||||
names += ['reference_Q_mean']
|
||||
ops += [reduce_std(self.critic_tf)]
|
||||
@@ -237,7 +231,7 @@ class DDPG(object):
|
||||
names += ['reference_actor_Q_mean']
|
||||
ops += [reduce_std(self.critic_with_actor_tf)]
|
||||
names += ['reference_actor_Q_std']
|
||||
|
||||
|
||||
ops += [tf.reduce_mean(self.actor_tf)]
|
||||
names += ['reference_action_mean']
|
||||
ops += [reduce_std(self.actor_tf)]
|
||||
@@ -353,7 +347,7 @@ class DDPG(object):
|
||||
def adapt_param_noise(self):
|
||||
if self.param_noise is None:
|
||||
return 0.
|
||||
|
||||
|
||||
# Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
|
||||
batch = self.memory.sample(batch_size=self.batch_size)
|
||||
self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={
|
||||
@@ -364,7 +358,7 @@ class DDPG(object):
|
||||
self.param_noise_stddev: self.param_noise.current_stddev,
|
||||
})
|
||||
|
||||
mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
|
||||
mean_distance = mpi_mean(distance)
|
||||
self.param_noise.adapt(mean_distance)
|
||||
return mean_distance
|
||||
|
||||
|
@@ -25,6 +25,7 @@ def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
|
||||
# Create envs.
|
||||
env = gym.make(env_id)
|
||||
env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
|
||||
gym.logger.setLevel(logging.WARN)
|
||||
|
||||
if evaluation and rank==0:
|
||||
eval_env = gym.make(env_id)
|
||||
|
@@ -4,6 +4,7 @@ from collections import deque
|
||||
import pickle
|
||||
|
||||
from baselines.ddpg.ddpg import DDPG
|
||||
from baselines.ddpg.util import mpi_mean, mpi_std, mpi_max, mpi_sum
|
||||
import baselines.common.tf_util as U
|
||||
|
||||
from baselines import logger
|
||||
@@ -34,7 +35,7 @@ def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, pa
|
||||
saver = tf.train.Saver()
|
||||
else:
|
||||
saver = None
|
||||
|
||||
|
||||
step = 0
|
||||
episode = 0
|
||||
eval_episode_rewards_history = deque(maxlen=100)
|
||||
@@ -109,7 +110,7 @@ def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, pa
|
||||
epoch_adaptive_distances = []
|
||||
for t_train in range(nb_train_steps):
|
||||
# Adapt param noise, if necessary.
|
||||
if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
|
||||
if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
|
||||
distance = agent.adapt_param_noise()
|
||||
epoch_adaptive_distances.append(distance)
|
||||
|
||||
@@ -137,46 +138,42 @@ def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, pa
|
||||
eval_episode_rewards_history.append(eval_episode_reward)
|
||||
eval_episode_reward = 0.
|
||||
|
||||
mpi_size = MPI.COMM_WORLD.Get_size()
|
||||
# Log stats.
|
||||
# XXX shouldn't call np.mean on variable length lists
|
||||
epoch_train_duration = time.time() - epoch_start_time
|
||||
duration = time.time() - start_time
|
||||
stats = agent.get_stats()
|
||||
combined_stats = stats.copy()
|
||||
combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
|
||||
combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
|
||||
combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
|
||||
combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
|
||||
combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
|
||||
combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
|
||||
combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
|
||||
combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
|
||||
combined_stats['total/duration'] = duration
|
||||
combined_stats['total/steps_per_second'] = float(t) / float(duration)
|
||||
combined_stats['total/episodes'] = episodes
|
||||
combined_stats['rollout/episodes'] = epoch_episodes
|
||||
combined_stats['rollout/actions_std'] = np.std(epoch_actions)
|
||||
combined_stats = {}
|
||||
for key in sorted(stats.keys()):
|
||||
combined_stats[key] = mpi_mean(stats[key])
|
||||
|
||||
# Rollout statistics.
|
||||
combined_stats['rollout/return'] = mpi_mean(epoch_episode_rewards)
|
||||
combined_stats['rollout/return_history'] = mpi_mean(np.mean(episode_rewards_history))
|
||||
combined_stats['rollout/episode_steps'] = mpi_mean(epoch_episode_steps)
|
||||
combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes)
|
||||
combined_stats['rollout/actions_mean'] = mpi_mean(epoch_actions)
|
||||
combined_stats['rollout/actions_std'] = mpi_std(epoch_actions)
|
||||
combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs)
|
||||
|
||||
# Train statistics.
|
||||
combined_stats['train/loss_actor'] = mpi_mean(epoch_actor_losses)
|
||||
combined_stats['train/loss_critic'] = mpi_mean(epoch_critic_losses)
|
||||
combined_stats['train/param_noise_distance'] = mpi_mean(epoch_adaptive_distances)
|
||||
|
||||
# Evaluation statistics.
|
||||
if eval_env is not None:
|
||||
combined_stats['eval/return'] = eval_episode_rewards
|
||||
combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
|
||||
combined_stats['eval/Q'] = eval_qs
|
||||
combined_stats['eval/episodes'] = len(eval_episode_rewards)
|
||||
def as_scalar(x):
|
||||
if isinstance(x, np.ndarray):
|
||||
assert x.size == 1
|
||||
return x[0]
|
||||
elif np.isscalar(x):
|
||||
return x
|
||||
else:
|
||||
raise ValueError('expected scalar, got %s'%x)
|
||||
combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()]))
|
||||
combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)}
|
||||
combined_stats['eval/return'] = mpi_mean(eval_episode_rewards)
|
||||
combined_stats['eval/return_history'] = mpi_mean(np.mean(eval_episode_rewards_history))
|
||||
combined_stats['eval/Q'] = mpi_mean(eval_qs)
|
||||
combined_stats['eval/episodes'] = mpi_mean(len(eval_episode_rewards))
|
||||
|
||||
# Total statistics.
|
||||
combined_stats['total/duration'] = mpi_mean(duration)
|
||||
combined_stats['total/steps_per_second'] = mpi_mean(float(t) / float(duration))
|
||||
combined_stats['total/episodes'] = mpi_mean(episodes)
|
||||
combined_stats['total/epochs'] = epoch + 1
|
||||
combined_stats['total/steps'] = t
|
||||
|
||||
|
||||
for key in sorted(combined_stats.keys()):
|
||||
logger.record_tabular(key, combined_stats[key])
|
||||
logger.dump_tabular()
|
||||
@@ -189,3 +186,4 @@ def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, pa
|
||||
if eval_env and hasattr(eval_env, 'get_state'):
|
||||
with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
|
||||
pickle.dump(eval_env.get_state(), f)
|
||||
|
||||
|
44
baselines/ddpg/util.py
Normal file
@@ -0,0 +1,44 @@
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from mpi4py import MPI
|
||||
from baselines.common.mpi_moments import mpi_moments
|
||||
|
||||
|
||||
def reduce_var(x, axis=None, keepdims=False):
|
||||
m = tf.reduce_mean(x, axis=axis, keep_dims=True)
|
||||
devs_squared = tf.square(x - m)
|
||||
return tf.reduce_mean(devs_squared, axis=axis, keep_dims=keepdims)
|
||||
|
||||
|
||||
def reduce_std(x, axis=None, keepdims=False):
|
||||
return tf.sqrt(reduce_var(x, axis=axis, keepdims=keepdims))
|
||||
|
||||
|
||||
def mpi_mean(value):
|
||||
if value == []:
|
||||
value = [0.]
|
||||
if not isinstance(value, list):
|
||||
value = [value]
|
||||
return mpi_moments(np.array(value))[0][0]
|
||||
|
||||
|
||||
def mpi_std(value):
|
||||
if value == []:
|
||||
value = [0.]
|
||||
if not isinstance(value, list):
|
||||
value = [value]
|
||||
return mpi_moments(np.array(value))[1][0]
|
||||
|
||||
|
||||
def mpi_max(value):
|
||||
global_max = np.zeros(1, dtype='float64')
|
||||
local_max = np.max(value).astype('float64')
|
||||
MPI.COMM_WORLD.Reduce(local_max, global_max, op=MPI.MAX)
|
||||
return global_max[0]
|
||||
|
||||
|
||||
def mpi_sum(value):
|
||||
global_sum = np.zeros(1, dtype='float64')
|
||||
local_sum = np.sum(np.array(value)).astype('float64')
|
||||
MPI.COMM_WORLD.Reduce(local_sum, global_sum, op=MPI.SUM)
|
||||
return global_sum[0]
|
@@ -97,37 +97,6 @@ import tensorflow as tf
|
||||
import baselines.common.tf_util as U
|
||||
|
||||
|
||||
def scope_vars(scope, trainable_only=False):
|
||||
"""
|
||||
Get variables inside a scope
|
||||
The scope can be specified as a string
|
||||
Parameters
|
||||
----------
|
||||
scope: str or VariableScope
|
||||
scope in which the variables reside.
|
||||
trainable_only: bool
|
||||
whether or not to return only the variables that were marked as trainable.
|
||||
Returns
|
||||
-------
|
||||
vars: [tf.Variable]
|
||||
list of variables in `scope`.
|
||||
"""
|
||||
return tf.get_collection(
|
||||
tf.GraphKeys.TRAINABLE_VARIABLES if trainable_only else tf.GraphKeys.GLOBAL_VARIABLES,
|
||||
scope=scope if isinstance(scope, str) else scope.name
|
||||
)
|
||||
|
||||
|
||||
def scope_name():
|
||||
"""Returns the name of current scope as a string, e.g. deepq/q_func"""
|
||||
return tf.get_variable_scope().name
|
||||
|
||||
|
||||
def absolute_scope_name(relative_scope_name):
|
||||
"""Appends parent scope name to `relative_scope_name`"""
|
||||
return scope_name() + "/" + relative_scope_name
|
||||
|
||||
|
||||
def default_param_noise_filter(var):
|
||||
if var not in tf.trainable_variables():
|
||||
# We never perturb non-trainable vars.
|
||||
@@ -174,7 +143,7 @@ def build_act(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None):
|
||||
` See the top of the file for details.
|
||||
"""
|
||||
with tf.variable_scope(scope, reuse=reuse):
|
||||
observations_ph = make_obs_ph("observation")
|
||||
observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
|
||||
stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
|
||||
update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")
|
||||
|
||||
@@ -190,12 +159,10 @@ def build_act(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None):
|
||||
|
||||
output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions)
|
||||
update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))
|
||||
_act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph],
|
||||
act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph],
|
||||
outputs=output_actions,
|
||||
givens={update_eps_ph: -1.0, stochastic_ph: True},
|
||||
updates=[update_eps_expr])
|
||||
def act(ob, stochastic=True, update_eps=-1):
|
||||
return _act(ob, stochastic, update_eps)
|
||||
return act
|
||||
|
||||
|
||||
@@ -236,7 +203,7 @@ def build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope="deepq",
|
||||
param_noise_filter_func = default_param_noise_filter
|
||||
|
||||
with tf.variable_scope(scope, reuse=reuse):
|
||||
observations_ph = make_obs_ph("observation")
|
||||
observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
|
||||
stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
|
||||
update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")
|
||||
update_param_noise_threshold_ph = tf.placeholder(tf.float32, (), name="update_param_noise_threshold")
|
||||
@@ -256,8 +223,8 @@ def build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope="deepq",
|
||||
# https://stackoverflow.com/questions/37063952/confused-by-the-behavior-of-tf-cond for
|
||||
# a more detailed discussion.
|
||||
def perturb_vars(original_scope, perturbed_scope):
|
||||
all_vars = scope_vars(absolute_scope_name(original_scope))
|
||||
all_perturbed_vars = scope_vars(absolute_scope_name(perturbed_scope))
|
||||
all_vars = U.scope_vars(U.absolute_scope_name("q_func"))
|
||||
all_perturbed_vars = U.scope_vars(U.absolute_scope_name("perturbed_q_func"))
|
||||
assert len(all_vars) == len(all_perturbed_vars)
|
||||
perturb_ops = []
|
||||
for var, perturbed_var in zip(all_vars, all_perturbed_vars):
|
||||
@@ -305,12 +272,10 @@ def build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope="deepq",
|
||||
tf.cond(update_param_noise_scale_ph, lambda: update_scale(), lambda: tf.Variable(0., trainable=False)),
|
||||
update_param_noise_threshold_expr,
|
||||
]
|
||||
_act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph, reset_ph, update_param_noise_threshold_ph, update_param_noise_scale_ph],
|
||||
act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph, reset_ph, update_param_noise_threshold_ph, update_param_noise_scale_ph],
|
||||
outputs=output_actions,
|
||||
givens={update_eps_ph: -1.0, stochastic_ph: True, reset_ph: False, update_param_noise_threshold_ph: False, update_param_noise_scale_ph: False},
|
||||
updates=updates)
|
||||
def act(ob, reset, update_param_noise_threshold, update_param_noise_scale, stochastic=True, update_eps=-1):
|
||||
return _act(ob, stochastic, update_eps, reset, update_param_noise_threshold, update_param_noise_scale)
|
||||
return act
|
||||
|
||||
|
||||
@@ -377,20 +342,20 @@ def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=
|
||||
|
||||
with tf.variable_scope(scope, reuse=reuse):
|
||||
# set up placeholders
|
||||
obs_t_input = make_obs_ph("obs_t")
|
||||
obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
|
||||
act_t_ph = tf.placeholder(tf.int32, [None], name="action")
|
||||
rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
|
||||
obs_tp1_input = make_obs_ph("obs_tp1")
|
||||
obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
|
||||
done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
|
||||
importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")
|
||||
|
||||
# q network evaluation
|
||||
q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act
|
||||
q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func")
|
||||
q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))
|
||||
|
||||
# target q network evaluation
|
||||
q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
|
||||
target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func")
|
||||
target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))
|
||||
|
||||
# q scores for actions which we know were selected in the given state.
|
||||
q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)
|
||||
@@ -398,7 +363,7 @@ def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=
|
||||
# compute estimate of best possible value starting from state at t + 1
|
||||
if double_q:
|
||||
q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True)
|
||||
q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
|
||||
q_tp1_best_using_online_net = tf.arg_max(q_tp1_using_online_net, 1)
|
||||
q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
|
||||
else:
|
||||
q_tp1_best = tf.reduce_max(q_tp1, 1)
|
||||
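The branch above is the double Q-learning target: the online network picks the next action and the target network evaluates it, instead of taking the target network's own maximum. A numpy sketch of the difference, with made-up Q-values:

import numpy as np

q_tp1_online = np.array([[1.0, 3.0], [2.0, 0.5]])      # online net, batch of 2 states
q_tp1_target = np.array([[0.9, 2.5], [0.7, 1.8]])      # target net, same states
best_online = q_tp1_online.argmax(axis=1)               # actions chosen by the online net -> [1, 0]
double_q = q_tp1_target[np.arange(2), best_online]      # double DQN estimate -> [2.5, 0.7]
plain_max = q_tp1_target.max(axis=1)                    # vanilla DQN estimate -> [2.5, 1.8]
assert not np.allclose(double_q, plain_max)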
@@ -414,11 +379,10 @@ def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=
|
||||
|
||||
# compute optimization op (potentially with gradient clipping)
|
||||
if grad_norm_clipping is not None:
|
||||
gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars)
|
||||
for i, (grad, var) in enumerate(gradients):
|
||||
if grad is not None:
|
||||
gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)
|
||||
optimize_expr = optimizer.apply_gradients(gradients)
|
||||
optimize_expr = U.minimize_and_clip(optimizer,
|
||||
weighted_error,
|
||||
var_list=q_func_vars,
|
||||
clip_val=grad_norm_clipping)
|
||||
else:
|
||||
optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)
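As a reading aid for the hunk above: with `double_q` enabled, the bootstrap action is selected by the online network and evaluated by the target network, otherwise the target network's maximum is used directly. Writing $Q_{\theta}$ for `q_func` and $Q_{\theta^-}$ for `target_q_func`:

```latex
a^{*} = \arg\max_{a} Q_{\theta}(s_{t+1}, a), \qquad
\hat{q}_{t+1} =
\begin{cases}
Q_{\theta^{-}}(s_{t+1}, a^{*}) & \text{double Q} \\
\max_{a} Q_{\theta^{-}}(s_{t+1}, a) & \text{otherwise}
\end{cases}
```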
|
||||
|
||||
|
51
baselines/deepq/experiments/atari/download_model.py
Normal file
@@ -0,0 +1,51 @@
|
||||
import argparse
|
||||
import progressbar
|
||||
|
||||
from baselines.common.azure_utils import Container
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser("Download a pretrained model from Azure.")
|
||||
# Environment
|
||||
parser.add_argument("--model-dir", type=str, default=None,
|
||||
help="save model in this directory this directory. ")
|
||||
parser.add_argument("--account-name", type=str, default="openaisciszymon",
|
||||
help="account name for Azure Blob Storage")
|
||||
parser.add_argument("--account-key", type=str, default=None,
|
||||
help="account key for Azure Blob Storage")
|
||||
parser.add_argument("--container", type=str, default="dqn-blogpost",
|
||||
help="container name and blob name separated by colon serparated by colon")
|
||||
parser.add_argument("--blob", type=str, default=None, help="blob with the model")
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
c = Container(account_name=args.account_name,
|
||||
account_key=args.account_key,
|
||||
container_name=args.container)
|
||||
|
||||
if args.blob is None:
|
||||
print("Listing available models:")
|
||||
print()
|
||||
for blob in sorted(c.list(prefix="model-")):
|
||||
print(blob)
|
||||
else:
|
||||
print("Downloading {} to {}...".format(args.blob, args.model_dir))
|
||||
bar = None
|
||||
|
||||
def callback(current, total):
|
||||
nonlocal bar
|
||||
if bar is None:
|
||||
bar = progressbar.ProgressBar(max_value=total)
|
||||
bar.update(current)
|
||||
|
||||
assert c.exists(args.blob), "model {} does not exist".format(args.blob)
|
||||
|
||||
assert args.model_dir is not None
|
||||
|
||||
c.get(args.model_dir, args.blob, callback=callback)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
70
baselines/deepq/experiments/atari/enjoy.py
Normal file
@@ -0,0 +1,70 @@
|
||||
import argparse
|
||||
import gym
|
||||
import os
|
||||
import numpy as np
|
||||
|
||||
from gym.monitoring import VideoRecorder
|
||||
|
||||
import baselines.common.tf_util as U
|
||||
|
||||
from baselines import deepq
|
||||
from baselines.common.misc_util import (
|
||||
boolean_flag,
|
||||
)
|
||||
from baselines import bench
|
||||
from baselines.common.atari_wrappers_deprecated import wrap_dqn
|
||||
from baselines.deepq.experiments.atari.model import model, dueling_model
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser("Run an already learned DQN model.")
|
||||
# Environment
|
||||
parser.add_argument("--env", type=str, required=True, help="name of the game")
|
||||
parser.add_argument("--model-dir", type=str, default=None, help="load model from this directory. ")
|
||||
parser.add_argument("--video", type=str, default=None, help="Path to mp4 file where the video of first episode will be recorded.")
|
||||
boolean_flag(parser, "stochastic", default=True, help="whether or not to use stochastic actions according to models eps value")
|
||||
boolean_flag(parser, "dueling", default=False, help="whether or not to use dueling model")
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def make_env(game_name):
|
||||
env = gym.make(game_name + "NoFrameskip-v4")
|
||||
env = bench.Monitor(env, None)
|
||||
env = wrap_dqn(env)
|
||||
return env
|
||||
|
||||
|
||||
def play(env, act, stochastic, video_path):
|
||||
num_episodes = 0
|
||||
video_recorder = None
|
||||
video_recorder = VideoRecorder(
|
||||
env, video_path, enabled=video_path is not None)
|
||||
obs = env.reset()
|
||||
while True:
|
||||
env.unwrapped.render()
|
||||
video_recorder.capture_frame()
|
||||
action = act(np.array(obs)[None], stochastic=stochastic)[0]
|
||||
obs, rew, done, info = env.step(action)
|
||||
if done:
|
||||
obs = env.reset()
|
||||
if len(info["rewards"]) > num_episodes:
|
||||
if len(info["rewards"]) == 1 and video_recorder.enabled:
|
||||
# save video of first episode
|
||||
print("Saved video.")
|
||||
video_recorder.close()
|
||||
video_recorder.enabled = False
|
||||
print(info["rewards"][-1])
|
||||
num_episodes = len(info["rewards"])
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
with U.make_session(4) as sess:
|
||||
args = parse_args()
|
||||
env = make_env(args.env)
|
||||
act = deepq.build_act(
|
||||
make_obs_ph=lambda name: U.Uint8Input(env.observation_space.shape, name=name),
|
||||
q_func=dueling_model if args.dueling else model,
|
||||
num_actions=env.action_space.n)
|
||||
U.load_state(os.path.join(args.model_dir, "saved"))
|
||||
play(env, act, args.stochastic, args.video)
|
60
baselines/deepq/experiments/atari/model.py
Normal file
@@ -0,0 +1,60 @@
|
||||
import tensorflow as tf
|
||||
import tensorflow.contrib.layers as layers
|
||||
|
||||
|
||||
def layer_norm_fn(x, relu=True):
|
||||
x = layers.layer_norm(x, scale=True, center=True)
|
||||
if relu:
|
||||
x = tf.nn.relu(x)
|
||||
return x
|
||||
|
||||
|
||||
def model(img_in, num_actions, scope, reuse=False, layer_norm=False):
|
||||
"""As described in https://storage.googleapis.com/deepmind-data/assets/papers/DeepMindNature14236Paper.pdf"""
|
||||
with tf.variable_scope(scope, reuse=reuse):
|
||||
out = img_in
|
||||
with tf.variable_scope("convnet"):
|
||||
# original architecture
|
||||
out = layers.convolution2d(out, num_outputs=32, kernel_size=8, stride=4, activation_fn=tf.nn.relu)
|
||||
out = layers.convolution2d(out, num_outputs=64, kernel_size=4, stride=2, activation_fn=tf.nn.relu)
|
||||
out = layers.convolution2d(out, num_outputs=64, kernel_size=3, stride=1, activation_fn=tf.nn.relu)
|
||||
conv_out = layers.flatten(out)
|
||||
|
||||
with tf.variable_scope("action_value"):
|
||||
value_out = layers.fully_connected(conv_out, num_outputs=512, activation_fn=None)
|
||||
if layer_norm:
|
||||
value_out = layer_norm_fn(value_out, relu=True)
|
||||
else:
|
||||
value_out = tf.nn.relu(value_out)
|
||||
value_out = layers.fully_connected(value_out, num_outputs=num_actions, activation_fn=None)
|
||||
return value_out
|
||||
|
||||
|
||||
def dueling_model(img_in, num_actions, scope, reuse=False, layer_norm=False):
|
||||
"""As described in https://arxiv.org/abs/1511.06581"""
|
||||
with tf.variable_scope(scope, reuse=reuse):
|
||||
out = img_in
|
||||
with tf.variable_scope("convnet"):
|
||||
# original architecture
|
||||
out = layers.convolution2d(out, num_outputs=32, kernel_size=8, stride=4, activation_fn=tf.nn.relu)
|
||||
out = layers.convolution2d(out, num_outputs=64, kernel_size=4, stride=2, activation_fn=tf.nn.relu)
|
||||
out = layers.convolution2d(out, num_outputs=64, kernel_size=3, stride=1, activation_fn=tf.nn.relu)
|
||||
conv_out = layers.flatten(out)
|
||||
|
||||
with tf.variable_scope("state_value"):
|
||||
state_hidden = layers.fully_connected(conv_out, num_outputs=512, activation_fn=None)
|
||||
if layer_norm:
|
||||
state_hidden = layer_norm_fn(state_hidden, relu=True)
|
||||
else:
|
||||
state_hidden = tf.nn.relu(state_hidden)
|
||||
state_score = layers.fully_connected(state_hidden, num_outputs=1, activation_fn=None)
|
||||
with tf.variable_scope("action_value"):
|
||||
actions_hidden = layers.fully_connected(conv_out, num_outputs=512, activation_fn=None)
|
||||
if layer_norm:
|
||||
actions_hidden = layer_norm_fn(actions_hidden, relu=True)
|
||||
else:
|
||||
actions_hidden = tf.nn.relu(actions_hidden)
|
||||
action_scores = layers.fully_connected(actions_hidden, num_outputs=num_actions, activation_fn=None)
|
||||
action_scores_mean = tf.reduce_mean(action_scores, 1)
|
||||
action_scores = action_scores - tf.expand_dims(action_scores_mean, 1)
|
||||
return state_score + action_scores
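The mean-subtracted combination at the end of `dueling_model` above is the standard aggregation from the dueling-architecture paper (https://arxiv.org/abs/1511.06581); written out, with $V$ = `state_score` and $A$ = `action_scores` before centering:

```latex
Q(s, a) = V(s) + \Big( A(s, a) - \frac{1}{|\mathcal{A}|} \sum_{a'} A(s, a') \Big)
```

Subtracting the mean advantage keeps the value/advantage split identifiable without changing which action is greedy.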
|
273
baselines/deepq/experiments/atari/train.py
Normal file
@@ -0,0 +1,273 @@
|
||||
import argparse
|
||||
import gym
|
||||
import numpy as np
|
||||
import os
|
||||
import tensorflow as tf
|
||||
import tempfile
|
||||
import time
|
||||
import json
|
||||
|
||||
import baselines.common.tf_util as U
|
||||
|
||||
from baselines import logger
|
||||
from baselines import deepq
|
||||
from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer
|
||||
from baselines.common.misc_util import (
|
||||
boolean_flag,
|
||||
pickle_load,
|
||||
pretty_eta,
|
||||
relatively_safe_pickle_dump,
|
||||
set_global_seeds,
|
||||
RunningAvg,
|
||||
)
|
||||
from baselines.common.schedules import LinearSchedule, PiecewiseSchedule
|
||||
from baselines import bench
|
||||
from baselines.common.atari_wrappers_deprecated import wrap_dqn
|
||||
from baselines.common.azure_utils import Container
|
||||
from .model import model, dueling_model
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser("DQN experiments for Atari games")
|
||||
# Environment
|
||||
parser.add_argument("--env", type=str, default="Pong", help="name of the game")
|
||||
parser.add_argument("--seed", type=int, default=42, help="which seed to use")
|
||||
# Core DQN parameters
|
||||
parser.add_argument("--replay-buffer-size", type=int, default=int(1e6), help="replay buffer size")
|
||||
parser.add_argument("--lr", type=float, default=1e-4, help="learning rate for Adam optimizer")
|
||||
parser.add_argument("--num-steps", type=int, default=int(2e8), help="total number of steps to run the environment for")
|
||||
parser.add_argument("--batch-size", type=int, default=32, help="number of transitions to optimize at the same time")
|
||||
parser.add_argument("--learning-freq", type=int, default=4, help="number of iterations between every optimization step")
|
||||
parser.add_argument("--target-update-freq", type=int, default=40000, help="number of iterations between every target network update")
|
||||
parser.add_argument("--param-noise-update-freq", type=int, default=50, help="number of iterations between every re-scaling of the parameter noise")
|
||||
parser.add_argument("--param-noise-reset-freq", type=int, default=10000, help="maximum number of steps to take per episode before re-perturbing the exploration policy")
|
||||
# Bells and whistles
|
||||
boolean_flag(parser, "double-q", default=True, help="whether or not to use double q learning")
|
||||
boolean_flag(parser, "dueling", default=False, help="whether or not to use dueling model")
|
||||
boolean_flag(parser, "prioritized", default=False, help="whether or not to use prioritized replay buffer")
|
||||
boolean_flag(parser, "param-noise", default=False, help="whether or not to use parameter space noise for exploration")
|
||||
boolean_flag(parser, "layer-norm", default=False, help="whether or not to use layer norm (should be True if param_noise is used)")
|
||||
boolean_flag(parser, "gym-monitor", default=False, help="whether or not to use a OpenAI Gym monitor (results in slower training due to video recording)")
|
||||
parser.add_argument("--prioritized-alpha", type=float, default=0.6, help="alpha parameter for prioritized replay buffer")
|
||||
parser.add_argument("--prioritized-beta0", type=float, default=0.4, help="initial value of beta parameters for prioritized replay")
|
||||
parser.add_argument("--prioritized-eps", type=float, default=1e-6, help="eps parameter for prioritized replay buffer")
|
||||
# Checkpointing
|
||||
parser.add_argument("--save-dir", type=str, default=None, help="directory in which training state and model should be saved.")
|
||||
parser.add_argument("--save-azure-container", type=str, default=None,
|
||||
help="It present data will saved/loaded from Azure. Should be in format ACCOUNT_NAME:ACCOUNT_KEY:CONTAINER")
|
||||
parser.add_argument("--save-freq", type=int, default=1e6, help="save model once every time this many iterations are completed")
|
||||
boolean_flag(parser, "load-on-start", default=True, help="if true and model was previously saved then training will be resumed")
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def make_env(game_name):
|
||||
env = gym.make(game_name + "NoFrameskip-v4")
|
||||
monitored_env = bench.Monitor(env, logger.get_dir()) # puts rewards and number of steps in info, before environment is wrapped
|
||||
env = wrap_dqn(monitored_env)  # applies a bunch of modifications to simplify the observation space (downsample, make b/w)
|
||||
return env, monitored_env
|
||||
|
||||
|
||||
def maybe_save_model(savedir, container, state):
|
||||
"""This function checkpoints the model and state of the training algorithm."""
|
||||
if savedir is None:
|
||||
return
|
||||
start_time = time.time()
|
||||
model_dir = "model-{}".format(state["num_iters"])
|
||||
U.save_state(os.path.join(savedir, model_dir, "saved"))
|
||||
if container is not None:
|
||||
container.put(os.path.join(savedir, model_dir), model_dir)
|
||||
relatively_safe_pickle_dump(state, os.path.join(savedir, 'training_state.pkl.zip'), compression=True)
|
||||
if container is not None:
|
||||
container.put(os.path.join(savedir, 'training_state.pkl.zip'), 'training_state.pkl.zip')
|
||||
relatively_safe_pickle_dump(state["monitor_state"], os.path.join(savedir, 'monitor_state.pkl'))
|
||||
if container is not None:
|
||||
container.put(os.path.join(savedir, 'monitor_state.pkl'), 'monitor_state.pkl')
|
||||
logger.log("Saved model in {} seconds\n".format(time.time() - start_time))
|
||||
|
||||
|
||||
def maybe_load_model(savedir, container):
|
||||
"""Load model if present at the specified path."""
|
||||
if savedir is None:
|
||||
return
|
||||
|
||||
state_path = os.path.join(os.path.join(savedir, 'training_state.pkl.zip'))
|
||||
if container is not None:
|
||||
logger.log("Attempting to download model from Azure")
|
||||
found_model = container.get(savedir, 'training_state.pkl.zip')
|
||||
else:
|
||||
found_model = os.path.exists(state_path)
|
||||
if found_model:
|
||||
state = pickle_load(state_path, compression=True)
|
||||
model_dir = "model-{}".format(state["num_iters"])
|
||||
if container is not None:
|
||||
container.get(savedir, model_dir)
|
||||
U.load_state(os.path.join(savedir, model_dir, "saved"))
|
||||
logger.log("Loaded models checkpoint at {} iterations".format(state["num_iters"]))
|
||||
return state
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = parse_args()
|
||||
|
||||
# Parse savedir and azure container.
|
||||
savedir = args.save_dir
|
||||
if savedir is None:
|
||||
savedir = os.getenv('OPENAI_LOGDIR', None)
|
||||
if args.save_azure_container is not None:
|
||||
account_name, account_key, container_name = args.save_azure_container.split(":")
|
||||
container = Container(account_name=account_name,
|
||||
account_key=account_key,
|
||||
container_name=container_name,
|
||||
maybe_create=True)
|
||||
if savedir is None:
|
||||
# Careful! This will not get cleaned up. Docker spoils the developers.
|
||||
savedir = tempfile.TemporaryDirectory().name
|
||||
else:
|
||||
container = None
|
||||
# Create and seed the env.
|
||||
env, monitored_env = make_env(args.env)
|
||||
if args.seed > 0:
|
||||
set_global_seeds(args.seed)
|
||||
env.unwrapped.seed(args.seed)
|
||||
|
||||
if args.gym_monitor and savedir:
|
||||
env = gym.wrappers.Monitor(env, os.path.join(savedir, 'gym_monitor'), force=True)
|
||||
|
||||
if savedir:
|
||||
with open(os.path.join(savedir, 'args.json'), 'w') as f:
|
||||
json.dump(vars(args), f)
|
||||
|
||||
with U.make_session(4) as sess:
|
||||
# Create training graph and replay buffer
|
||||
def model_wrapper(img_in, num_actions, scope, **kwargs):
|
||||
actual_model = dueling_model if args.dueling else model
|
||||
return actual_model(img_in, num_actions, scope, layer_norm=args.layer_norm, **kwargs)
|
||||
act, train, update_target, debug = deepq.build_train(
|
||||
make_obs_ph=lambda name: U.Uint8Input(env.observation_space.shape, name=name),
|
||||
q_func=model_wrapper,
|
||||
num_actions=env.action_space.n,
|
||||
optimizer=tf.train.AdamOptimizer(learning_rate=args.lr, epsilon=1e-4),
|
||||
gamma=0.99,
|
||||
grad_norm_clipping=10,
|
||||
double_q=args.double_q,
|
||||
param_noise=args.param_noise
|
||||
)
|
||||
|
||||
approximate_num_iters = args.num_steps / 4
|
||||
exploration = PiecewiseSchedule([
|
||||
(0, 1.0),
|
||||
(approximate_num_iters / 50, 0.1),
|
||||
(approximate_num_iters / 5, 0.01)
|
||||
], outside_value=0.01)
|
||||
|
||||
if args.prioritized:
|
||||
replay_buffer = PrioritizedReplayBuffer(args.replay_buffer_size, args.prioritized_alpha)
|
||||
beta_schedule = LinearSchedule(approximate_num_iters, initial_p=args.prioritized_beta0, final_p=1.0)
|
||||
else:
|
||||
replay_buffer = ReplayBuffer(args.replay_buffer_size)
|
||||
|
||||
U.initialize()
|
||||
update_target()
|
||||
num_iters = 0
|
||||
|
||||
# Load the model
|
||||
state = maybe_load_model(savedir, container)
|
||||
if state is not None:
|
||||
num_iters, replay_buffer = state["num_iters"], state["replay_buffer"],
|
||||
monitored_env.set_state(state["monitor_state"])
|
||||
|
||||
start_time, start_steps = None, None
|
||||
steps_per_iter = RunningAvg(0.999)
|
||||
iteration_time_est = RunningAvg(0.999)
|
||||
obs = env.reset()
|
||||
num_iters_since_reset = 0
|
||||
reset = True
|
||||
|
||||
# Main training loop
|
||||
while True:
|
||||
num_iters += 1
|
||||
num_iters_since_reset += 1
|
||||
|
||||
# Take action and store transition in the replay buffer.
|
||||
kwargs = {}
|
||||
if not args.param_noise:
|
||||
update_eps = exploration.value(num_iters)
|
||||
update_param_noise_threshold = 0.
|
||||
else:
|
||||
if args.param_noise_reset_freq > 0 and num_iters_since_reset > args.param_noise_reset_freq:
|
||||
# Reset param noise policy since we have exceeded the maximum number of steps without a reset.
|
||||
reset = True
|
||||
|
||||
update_eps = 0.01 # ensures that we cannot get stuck completely
|
||||
# Compute the threshold such that the KL divergence between perturbed and non-perturbed
|
||||
# policy is comparable to eps-greedy exploration with eps = exploration.value(t).
|
||||
# See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
|
||||
# for detailed explanation.
|
||||
update_param_noise_threshold = -np.log(1. - exploration.value(num_iters) + exploration.value(num_iters) / float(env.action_space.n))
|
||||
kwargs['reset'] = reset
|
||||
kwargs['update_param_noise_threshold'] = update_param_noise_threshold
|
||||
kwargs['update_param_noise_scale'] = (num_iters % args.param_noise_update_freq == 0)
|
||||
|
||||
action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
|
||||
reset = False
|
||||
new_obs, rew, done, info = env.step(action)
|
||||
replay_buffer.add(obs, action, rew, new_obs, float(done))
|
||||
obs = new_obs
|
||||
if done:
|
||||
num_iters_since_reset = 0
|
||||
obs = env.reset()
|
||||
reset = True
|
||||
|
||||
if (num_iters > max(5 * args.batch_size, args.replay_buffer_size // 20) and
|
||||
num_iters % args.learning_freq == 0):
|
||||
# Sample a bunch of transitions from replay buffer
|
||||
if args.prioritized:
|
||||
experience = replay_buffer.sample(args.batch_size, beta=beta_schedule.value(num_iters))
|
||||
(obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
|
||||
else:
|
||||
obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(args.batch_size)
|
||||
weights = np.ones_like(rewards)
|
||||
# Minimize the error in Bellman's equation and compute TD-error
|
||||
td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
|
||||
# Update the priorities in the replay buffer
|
||||
if args.prioritized:
|
||||
new_priorities = np.abs(td_errors) + args.prioritized_eps
|
||||
replay_buffer.update_priorities(batch_idxes, new_priorities)
|
||||
# Update target network.
|
||||
if num_iters % args.target_update_freq == 0:
|
||||
update_target()
|
||||
|
||||
if start_time is not None:
|
||||
steps_per_iter.update(info['steps'] - start_steps)
|
||||
iteration_time_est.update(time.time() - start_time)
|
||||
start_time, start_steps = time.time(), info["steps"]
|
||||
|
||||
# Save the model and training state.
|
||||
if num_iters > 0 and (num_iters % args.save_freq == 0 or info["steps"] > args.num_steps):
|
||||
maybe_save_model(savedir, container, {
|
||||
'replay_buffer': replay_buffer,
|
||||
'num_iters': num_iters,
|
||||
'monitor_state': monitored_env.get_state(),
|
||||
})
|
||||
|
||||
if info["steps"] > args.num_steps:
|
||||
break
|
||||
|
||||
if done:
|
||||
steps_left = args.num_steps - info["steps"]
|
||||
completion = np.round(info["steps"] / args.num_steps, 1)
|
||||
|
||||
logger.record_tabular("% completion", completion)
|
||||
logger.record_tabular("steps", info["steps"])
|
||||
logger.record_tabular("iters", num_iters)
|
||||
logger.record_tabular("episodes", len(info["rewards"]))
|
||||
logger.record_tabular("reward (100 epi mean)", np.mean(info["rewards"][-100:]))
|
||||
logger.record_tabular("exploration", exploration.value(num_iters))
|
||||
if args.prioritized:
|
||||
logger.record_tabular("max priority", replay_buffer._max_priority)
|
||||
fps_estimate = (float(steps_per_iter) / (float(iteration_time_est) + 1e-6)
|
||||
if steps_per_iter._value is not None else "calculating...")
|
||||
logger.dump_tabular()
|
||||
logger.log()
|
||||
logger.log("ETA: " + pretty_eta(int(steps_left / fps_estimate)))
|
||||
logger.log()
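For reference, the parameter-noise threshold computed in the training loop above (see the comment pointing to Appendix C.1 of Plappert et al., 2017) is, writing $\varepsilon$ for `exploration.value(num_iters)` and $|\mathcal{A}|$ for `env.action_space.n`:

```latex
\delta = -\log\!\Big(1 - \varepsilon + \frac{\varepsilon}{|\mathcal{A}|}\Big)
```

The noise scale is then grown or shrunk (every `--param-noise-update-freq` iterations) until the perturbed policy differs from the unperturbed one by roughly as much as $\varepsilon$-greedy exploration would.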
|
81
baselines/deepq/experiments/atari/wang2015_eval.py
Normal file
@@ -0,0 +1,81 @@
|
||||
import argparse
|
||||
import gym
|
||||
import numpy as np
|
||||
import os
|
||||
|
||||
import baselines.common.tf_util as U
|
||||
|
||||
from baselines import deepq, bench
|
||||
from baselines.common.misc_util import get_wrapper_by_name, boolean_flag, set_global_seeds
|
||||
from baselines.common.atari_wrappers_deprecated import wrap_dqn
|
||||
from baselines.deepq.experiments.atari.model import model, dueling_model
|
||||
|
||||
|
||||
def make_env(game_name):
|
||||
env = gym.make(game_name + "NoFrameskip-v4")
|
||||
env_monitored = bench.Monitor(env, None)
|
||||
env = wrap_dqn(env_monitored)
|
||||
return env_monitored, env
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser("Evaluate an already learned DQN model.")
|
||||
# Environment
|
||||
parser.add_argument("--env", type=str, required=True, help="name of the game")
|
||||
parser.add_argument("--model-dir", type=str, default=None, help="load model from this directory. ")
|
||||
boolean_flag(parser, "stochastic", default=True, help="whether or not to use stochastic actions according to models eps value")
|
||||
boolean_flag(parser, "dueling", default=False, help="whether or not to use dueling model")
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def wang2015_eval(game_name, act, stochastic):
|
||||
print("==================== wang2015 evaluation ====================")
|
||||
episode_rewards = []
|
||||
|
||||
for num_noops in range(1, 31):
|
||||
env_monitored, eval_env = make_env(game_name)
|
||||
eval_env.unwrapped.seed(1)
|
||||
|
||||
get_wrapper_by_name(eval_env, "NoopResetEnv").override_num_noops = num_noops
|
||||
|
||||
eval_episode_steps = 0
|
||||
done = True
|
||||
while True:
|
||||
if done:
|
||||
obs = eval_env.reset()
|
||||
eval_episode_steps += 1
|
||||
action = act(np.array(obs)[None], stochastic=stochastic)[0]
|
||||
|
||||
obs, _reward, done, info = eval_env.step(action)
|
||||
if done:
|
||||
obs = eval_env.reset()
|
||||
if len(info["rewards"]) > 0:
|
||||
episode_rewards.append(info["rewards"][0])
|
||||
break
|
||||
if info["steps"] > 108000: # 5 minutes of gameplay
|
||||
episode_rewards.append(sum(env_monitored.rewards))
|
||||
break
|
||||
print("Num steps in episode {} was {} yielding {} reward".format(
|
||||
num_noops, eval_episode_steps, episode_rewards[-1]), flush=True)
|
||||
print("Evaluation results: " + str(np.mean(episode_rewards)))
|
||||
print("=============================================================")
|
||||
return np.mean(episode_rewards)
|
||||
|
||||
|
||||
def main():
|
||||
set_global_seeds(1)
|
||||
args = parse_args()
|
||||
with U.make_session(4): # noqa
|
||||
_, env = make_env(args.env)
|
||||
act = deepq.build_act(
|
||||
make_obs_ph=lambda name: U.Uint8Input(env.observation_space.shape, name=name),
|
||||
q_func=dueling_model if args.dueling else model,
|
||||
num_actions=env.action_space.n)
|
||||
|
||||
U.load_state(os.path.join(args.model_dir, "saved"))
|
||||
wang2015_eval(args.env, act, stochastic=args.stochastic)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
@@ -9,7 +9,6 @@ import baselines.common.tf_util as U
|
||||
from baselines import logger
|
||||
from baselines import deepq
|
||||
from baselines.deepq.replay_buffer import ReplayBuffer
|
||||
from baselines.deepq.utils import BatchInput
|
||||
from baselines.common.schedules import LinearSchedule
|
||||
|
||||
|
||||
@@ -28,7 +27,7 @@ if __name__ == '__main__':
|
||||
env = gym.make("CartPole-v0")
|
||||
# Create all the functions necessary to train the model
|
||||
act, train, update_target, debug = deepq.build_train(
|
||||
make_obs_ph=lambda name: BatchInput(env.observation_space.shape, name=name),
|
||||
make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape, name=name),
|
||||
q_func=model,
|
||||
num_actions=env.action_space.n,
|
||||
optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
|
||||
|
@@ -1,3 +1,5 @@
|
||||
import gym
|
||||
|
||||
from baselines import deepq
|
||||
from baselines.common import set_global_seeds
|
||||
from baselines import bench
|
||||
|
@@ -3,7 +3,7 @@ import gym
|
||||
from baselines import deepq
|
||||
|
||||
|
||||
def callback(lcl, _glb):
|
||||
def callback(lcl, glb):
|
||||
# stop training if reward exceeds 199
|
||||
is_solved = lcl['t'] > 100 and sum(lcl['episode_rewards'][-101:-1]) / 100 >= 199
|
||||
return is_solved
|
||||
|
@@ -6,7 +6,7 @@ from baselines.common.segment_tree import SumSegmentTree, MinSegmentTree
|
||||
|
||||
class ReplayBuffer(object):
|
||||
def __init__(self, size):
|
||||
"""Create Replay buffer.
|
||||
"""Create Prioritized Replay buffer.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
@@ -12,7 +12,6 @@ from baselines import logger
|
||||
from baselines.common.schedules import LinearSchedule
|
||||
from baselines import deepq
|
||||
from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer
|
||||
from baselines.deepq.utils import BatchInput, load_state, save_state
|
||||
|
||||
|
||||
class ActWrapper(object):
|
||||
@@ -33,7 +32,7 @@ class ActWrapper(object):
|
||||
f.write(model_data)
|
||||
|
||||
zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td)
|
||||
load_state(os.path.join(td, "model"))
|
||||
U.load_state(os.path.join(td, "model"))
|
||||
|
||||
return ActWrapper(act, act_params)
|
||||
|
||||
@@ -46,7 +45,7 @@ class ActWrapper(object):
|
||||
path = os.path.join(logger.get_dir(), "model.pkl")
|
||||
|
||||
with tempfile.TemporaryDirectory() as td:
|
||||
save_state(os.path.join(td, "model"))
|
||||
U.save_state(os.path.join(td, "model"))
|
||||
arc_name = os.path.join(td, "packed.zip")
|
||||
with zipfile.ZipFile(arc_name, 'w') as zipf:
|
||||
for root, dirs, files in os.walk(td):
|
||||
@@ -172,7 +171,7 @@ def learn(env,
|
||||
# by cloudpickle when serializing make_obs_ph
|
||||
observation_space_shape = env.observation_space.shape
|
||||
def make_obs_ph(name):
|
||||
return BatchInput(observation_space_shape, name=name)
|
||||
return U.BatchInput(observation_space_shape, name=name)
|
||||
|
||||
act, train, update_target, debug = deepq.build_train(
|
||||
make_obs_ph=make_obs_ph,
|
||||
@@ -239,7 +238,11 @@ def learn(env,
|
||||
kwargs['update_param_noise_threshold'] = update_param_noise_threshold
|
||||
kwargs['update_param_noise_scale'] = True
|
||||
action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
|
||||
env_action = action
|
||||
if isinstance(env.action_space, gym.spaces.MultiBinary):
|
||||
env_action = np.zeros(env.action_space.n)
|
||||
env_action[action] = 1
|
||||
else:
|
||||
env_action = action
|
||||
reset = False
|
||||
new_obs, rew, done, _ = env.step(env_action)
|
||||
# Store transition in the replay buffer.
|
||||
@@ -284,12 +287,12 @@ def learn(env,
|
||||
if print_freq is not None:
|
||||
logger.log("Saving model due to mean reward increase: {} -> {}".format(
|
||||
saved_mean_reward, mean_100ep_reward))
|
||||
save_state(model_file)
|
||||
U.save_state(model_file)
|
||||
model_saved = True
|
||||
saved_mean_reward = mean_100ep_reward
|
||||
if model_saved:
|
||||
if print_freq is not None:
|
||||
logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
|
||||
load_state(model_file)
|
||||
U.load_state(model_file)
|
||||
|
||||
return act
|
||||
|
@@ -1,88 +0,0 @@
|
||||
import os
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
# ================================================================
|
||||
# Saving variables
|
||||
# ================================================================
|
||||
|
||||
def load_state(fname):
|
||||
saver = tf.train.Saver()
|
||||
saver.restore(tf.get_default_session(), fname)
|
||||
|
||||
def save_state(fname):
|
||||
os.makedirs(os.path.dirname(fname), exist_ok=True)
|
||||
saver = tf.train.Saver()
|
||||
saver.save(tf.get_default_session(), fname)
|
||||
|
||||
# ================================================================
|
||||
# Placeholders
|
||||
# ================================================================
|
||||
|
||||
class TfInput(object):
|
||||
def __init__(self, name="(unnamed)"):
|
||||
"""Generalized Tensorflow placeholder. The main differences are:
|
||||
- possibly uses multiple placeholders internally and returns multiple values
|
||||
- can apply light postprocessing to the value fed to the placeholder.
|
||||
"""
|
||||
self.name = name
|
||||
|
||||
def get(self):
|
||||
"""Return the tf variable(s) representing the possibly postprocessed value
|
||||
of placeholder(s).
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def make_feed_dict(data):
|
||||
"""Given data input it to the placeholder(s)."""
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
class PlaceholderTfInput(TfInput):
|
||||
def __init__(self, placeholder):
|
||||
"""Wrapper for regular tensorflow placeholder."""
|
||||
super().__init__(placeholder.name)
|
||||
self._placeholder = placeholder
|
||||
|
||||
def get(self):
|
||||
return self._placeholder
|
||||
|
||||
def make_feed_dict(self, data):
|
||||
return {self._placeholder: data}
|
||||
|
||||
class BatchInput(PlaceholderTfInput):
|
||||
def __init__(self, shape, dtype=tf.float32, name=None):
|
||||
"""Creates a placeholder for a batch of tensors of a given shape and dtype
|
||||
|
||||
Parameters
|
||||
----------
|
||||
shape: [int]
|
||||
shape of a single element of the batch
|
||||
dtype: tf.dtype
|
||||
number representation used for tensor contents
|
||||
name: str
|
||||
name of the underlying placeholder
|
||||
"""
|
||||
super().__init__(tf.placeholder(dtype, [None] + list(shape), name=name))
|
||||
|
||||
class Uint8Input(PlaceholderTfInput):
|
||||
def __init__(self, shape, name=None):
|
||||
"""Takes input in uint8 format which is cast to float32 and divided by 255
|
||||
before passing it to the model.
|
||||
|
||||
On GPU this ensures lower data transfer times.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
shape: [int]
|
||||
shape of the tensor.
|
||||
name: str
|
||||
name of the underlying placeholder
|
||||
"""
|
||||
|
||||
super().__init__(tf.placeholder(tf.uint8, [None] + list(shape), name=name))
|
||||
self._shape = shape
|
||||
self._output = tf.cast(super().get(), tf.float32) / 255.0
|
||||
|
||||
def get(self):
|
||||
return self._output
|
@@ -1,52 +0,0 @@
|
||||
# Generative Adversarial Imitation Learning (GAIL)

- Original paper: https://arxiv.org/abs/1606.03476

For MuJoCo benchmarking results, see [here](result/gail-result.md).

## If you want to train an imitation learning agent

### Step 1: Download expert data

Download the expert data into `./data`: [download link](https://drive.google.com/drive/folders/1h3H4AY_ZBx08hz-Ct0Nxxus-V1melu1U?usp=sharing)

### Step 2: Run GAIL

Run with a single thread:

```bash
python -m baselines.gail.run_mujoco
```

Run with multiple threads:

```bash
mpirun -np 16 python -m baselines.gail.run_mujoco
```

See help (`-h`) for more options.

#### In case you want to run Behavior Cloning (BC)

```bash
python -m baselines.gail.behavior_clone
```

See help (`-h`) for more options.

## Contributing

Bug reports and pull requests are welcome on GitHub at https://github.com/openai/baselines/pulls.

## Maintainers

- Yuan-Hong Liao, andrewliao11_at_gmail_dot_com
- Ryan Julian, ryanjulian_at_gmail_dot_com

## Others

Thanks to the following open source projects:

- @openai/imitation
- @carpedm20/deep-rl-tensorflow
@@ -1,87 +0,0 @@
|
||||
'''
|
||||
Reference: https://github.com/openai/imitation
|
||||
I follow the architecture from the official repository
|
||||
'''
|
||||
import tensorflow as tf
|
||||
import numpy as np
|
||||
|
||||
from baselines.common.mpi_running_mean_std import RunningMeanStd
|
||||
from baselines.common import tf_util as U
|
||||
|
||||
def logsigmoid(a):
|
||||
'''Equivalent to tf.log(tf.sigmoid(a))'''
|
||||
return -tf.nn.softplus(-a)
|
||||
|
||||
""" Reference: https://github.com/openai/imitation/blob/99fbccf3e060b6e6c739bdf209758620fcdefd3c/policyopt/thutil.py#L48-L51"""
|
||||
def logit_bernoulli_entropy(logits):
|
||||
ent = (1.-tf.nn.sigmoid(logits))*logits - logsigmoid(logits)
|
||||
return ent
|
||||
|
||||
class TransitionClassifier(object):
|
||||
def __init__(self, env, hidden_size, entcoeff=0.001, lr_rate=1e-3, scope="adversary"):
|
||||
self.scope = scope
|
||||
self.observation_shape = env.observation_space.shape
|
||||
self.actions_shape = env.action_space.shape
|
||||
self.input_shape = tuple([o+a for o, a in zip(self.observation_shape, self.actions_shape)])
|
||||
self.num_actions = env.action_space.shape[0]
|
||||
self.hidden_size = hidden_size
|
||||
self.build_ph()
|
||||
# Build graph
|
||||
generator_logits = self.build_graph(self.generator_obs_ph, self.generator_acs_ph, reuse=False)
|
||||
expert_logits = self.build_graph(self.expert_obs_ph, self.expert_acs_ph, reuse=True)
|
||||
# Build accuracy
|
||||
generator_acc = tf.reduce_mean(tf.to_float(tf.nn.sigmoid(generator_logits) < 0.5))
|
||||
expert_acc = tf.reduce_mean(tf.to_float(tf.nn.sigmoid(expert_logits) > 0.5))
|
||||
# Build regression loss
|
||||
# let x = logits, z = targets.
|
||||
# z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
|
||||
generator_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=generator_logits, labels=tf.zeros_like(generator_logits))
|
||||
generator_loss = tf.reduce_mean(generator_loss)
|
||||
expert_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=expert_logits, labels=tf.ones_like(expert_logits))
|
||||
expert_loss = tf.reduce_mean(expert_loss)
|
||||
# Build entropy loss
|
||||
logits = tf.concat([generator_logits, expert_logits], 0)
|
||||
entropy = tf.reduce_mean(logit_bernoulli_entropy(logits))
|
||||
entropy_loss = -entcoeff*entropy
|
||||
# Loss + Accuracy terms
|
||||
self.losses = [generator_loss, expert_loss, entropy, entropy_loss, generator_acc, expert_acc]
|
||||
self.loss_name = ["generator_loss", "expert_loss", "entropy", "entropy_loss", "generator_acc", "expert_acc"]
|
||||
self.total_loss = generator_loss + expert_loss + entropy_loss
|
||||
# Build Reward for policy
|
||||
self.reward_op = -tf.log(1-tf.nn.sigmoid(generator_logits)+1e-8)
|
||||
var_list = self.get_trainable_variables()
|
||||
self.lossandgrad = U.function([self.generator_obs_ph, self.generator_acs_ph, self.expert_obs_ph, self.expert_acs_ph],
|
||||
self.losses + [U.flatgrad(self.total_loss, var_list)])
|
||||
|
||||
def build_ph(self):
|
||||
self.generator_obs_ph = tf.placeholder(tf.float32, (None, ) + self.observation_shape, name="observations_ph")
|
||||
self.generator_acs_ph = tf.placeholder(tf.float32, (None, ) + self.actions_shape, name="actions_ph")
|
||||
self.expert_obs_ph = tf.placeholder(tf.float32, (None, ) + self.observation_shape, name="expert_observations_ph")
|
||||
self.expert_acs_ph = tf.placeholder(tf.float32, (None, ) + self.actions_shape, name="expert_actions_ph")
|
||||
|
||||
def build_graph(self, obs_ph, acs_ph, reuse=False):
|
||||
with tf.variable_scope(self.scope):
|
||||
if reuse:
|
||||
tf.get_variable_scope().reuse_variables()
|
||||
|
||||
with tf.variable_scope("obfilter"):
|
||||
self.obs_rms = RunningMeanStd(shape=self.observation_shape)
|
||||
obs = (obs_ph - self.obs_rms.mean) / self.obs_rms.std
|
||||
_input = tf.concat([obs, acs_ph], axis=1) # concatenate the two input -> form a transition
|
||||
p_h1 = tf.contrib.layers.fully_connected(_input, self.hidden_size, activation_fn=tf.nn.tanh)
|
||||
p_h2 = tf.contrib.layers.fully_connected(p_h1, self.hidden_size, activation_fn=tf.nn.tanh)
|
||||
logits = tf.contrib.layers.fully_connected(p_h2, 1, activation_fn=tf.identity)
|
||||
return logits
|
||||
|
||||
def get_trainable_variables(self):
|
||||
return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
|
||||
|
||||
def get_reward(self, obs, acs):
|
||||
sess = tf.get_default_session()
|
||||
if len(obs.shape) == 1:
|
||||
obs = np.expand_dims(obs, 0)
|
||||
if len(acs.shape) == 1:
|
||||
acs = np.expand_dims(acs, 0)
|
||||
feed_dict = {self.generator_obs_ph: obs, self.generator_acs_ph: acs}
|
||||
reward = sess.run(self.reward_op, feed_dict)
|
||||
return reward
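A quick sanity check of `logit_bernoulli_entropy` above: with $p = \sigma(x)$ and $\log\sigma(x) = -\mathrm{softplus}(-x)$, the Bernoulli entropy simplifies to exactly the expression used in the code:

```latex
H(\sigma(x)) = -\sigma(x)\log\sigma(x) - (1-\sigma(x))\log(1-\sigma(x))
             = (1 - \sigma(x))\,x - \log\sigma(x)
```

which matches `ent = (1. - tf.nn.sigmoid(logits)) * logits - logsigmoid(logits)`. The policy reward in `reward_op`, $-\log(1 - D(s,a) + 10^{-8})$, grows as the discriminator assigns the transition a higher probability of being expert.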
|
@@ -1,124 +0,0 @@
|
||||
'''
|
||||
The code is used to train BC imitator, or pretrained GAIL imitator
|
||||
'''
|
||||
|
||||
import argparse
|
||||
import tempfile
|
||||
import os.path as osp
|
||||
import gym
|
||||
import logging
|
||||
from tqdm import tqdm
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from baselines.gail import mlp_policy
|
||||
from baselines import bench
|
||||
from baselines import logger
|
||||
from baselines.common import set_global_seeds, tf_util as U
|
||||
from baselines.common.misc_util import boolean_flag
|
||||
from baselines.common.mpi_adam import MpiAdam
|
||||
from baselines.gail.run_mujoco import runner
|
||||
from baselines.gail.dataset.mujoco_dset import Mujoco_Dset
|
||||
|
||||
|
||||
def argsparser():
|
||||
parser = argparse.ArgumentParser("Tensorflow Implementation of Behavior Cloning")
|
||||
parser.add_argument('--env_id', help='environment ID', default='Hopper-v1')
|
||||
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
|
||||
parser.add_argument('--expert_path', type=str, default='data/deterministic.trpo.Hopper.0.00.npz')
|
||||
parser.add_argument('--checkpoint_dir', help='the directory to save model', default='checkpoint')
|
||||
parser.add_argument('--log_dir', help='the directory to save log file', default='log')
|
||||
# Mujoco Dataset Configuration
|
||||
parser.add_argument('--traj_limitation', type=int, default=-1)
|
||||
# Network Configuration (Using MLP Policy)
|
||||
parser.add_argument('--policy_hidden_size', type=int, default=100)
|
||||
# for evaluation
|
||||
boolean_flag(parser, 'stochastic_policy', default=False, help='use stochastic/deterministic policy to evaluate')
|
||||
boolean_flag(parser, 'save_sample', default=False, help='save the trajectories or not')
|
||||
parser.add_argument('--BC_max_iter', help='Max iteration for training BC', type=int, default=1e5)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def learn(env, policy_func, dataset, optim_batch_size=128, max_iters=1e4,
|
||||
adam_epsilon=1e-5, optim_stepsize=3e-4,
|
||||
ckpt_dir=None, log_dir=None, task_name=None,
|
||||
verbose=False):
|
||||
|
||||
val_per_iter = int(max_iters/10)
|
||||
ob_space = env.observation_space
|
||||
ac_space = env.action_space
|
||||
pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy
|
||||
# placeholder
|
||||
ob = U.get_placeholder_cached(name="ob")
|
||||
ac = pi.pdtype.sample_placeholder([None])
|
||||
stochastic = U.get_placeholder_cached(name="stochastic")
|
||||
loss = tf.reduce_mean(tf.square(ac-pi.ac))
|
||||
var_list = pi.get_trainable_variables()
|
||||
adam = MpiAdam(var_list, epsilon=adam_epsilon)
|
||||
lossandgrad = U.function([ob, ac, stochastic], [loss]+[U.flatgrad(loss, var_list)])
|
||||
|
||||
U.initialize()
|
||||
adam.sync()
|
||||
logger.log("Pretraining with Behavior Cloning...")
|
||||
for iter_so_far in tqdm(range(int(max_iters))):
|
||||
ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size, 'train')
|
||||
train_loss, g = lossandgrad(ob_expert, ac_expert, True)
|
||||
adam.update(g, optim_stepsize)
|
||||
if verbose and iter_so_far % val_per_iter == 0:
|
||||
ob_expert, ac_expert = dataset.get_next_batch(-1, 'val')
|
||||
val_loss, _ = lossandgrad(ob_expert, ac_expert, True)
|
||||
logger.log("Training loss: {}, Validation loss: {}".format(train_loss, val_loss))
|
||||
|
||||
if ckpt_dir is None:
|
||||
savedir_fname = tempfile.TemporaryDirectory().name
|
||||
else:
|
||||
savedir_fname = osp.join(ckpt_dir, task_name)
|
||||
U.save_state(savedir_fname, var_list=pi.get_variables())
|
||||
return savedir_fname
|
||||
|
||||
|
||||
def get_task_name(args):
|
||||
task_name = 'BC'
|
||||
task_name += '.{}'.format(args.env_id.split("-")[0])
|
||||
task_name += '.traj_limitation_{}'.format(args.traj_limitation)
|
||||
task_name += ".seed_{}".format(args.seed)
|
||||
return task_name
|
||||
|
||||
|
||||
def main(args):
|
||||
U.make_session(num_cpu=1).__enter__()
|
||||
set_global_seeds(args.seed)
|
||||
env = gym.make(args.env_id)
|
||||
|
||||
def policy_fn(name, ob_space, ac_space, reuse=False):
|
||||
return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
|
||||
reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2)
|
||||
env = bench.Monitor(env, logger.get_dir() and
|
||||
osp.join(logger.get_dir(), "monitor.json"))
|
||||
env.seed(args.seed)
|
||||
gym.logger.setLevel(logging.WARN)
|
||||
task_name = get_task_name(args)
|
||||
args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
|
||||
args.log_dir = osp.join(args.log_dir, task_name)
|
||||
dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)
|
||||
savedir_fname = learn(env,
|
||||
policy_fn,
|
||||
dataset,
|
||||
max_iters=args.BC_max_iter,
|
||||
ckpt_dir=args.checkpoint_dir,
|
||||
log_dir=args.log_dir,
|
||||
task_name=task_name,
|
||||
verbose=True)
|
||||
avg_len, avg_ret = runner(env,
|
||||
policy_fn,
|
||||
savedir_fname,
|
||||
timesteps_per_batch=1024,
|
||||
number_trajs=10,
|
||||
stochastic_policy=args.stochastic_policy,
|
||||
save=args.save_sample,
|
||||
reuse=True)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = argsparser()
|
||||
main(args)
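The pretraining objective in `learn` above is plain behavior cloning: a least-squares regression of the policy's action onto the expert action, matching `loss = tf.reduce_mean(tf.square(ac - pi.ac))` up to a constant factor. In the obvious notation:

```latex
\mathcal{L}_{\mathrm{BC}}(\theta) = \mathbb{E}_{(s,\,a^{E}) \sim \mathcal{D}_{\mathrm{expert}}}\big[\, \lVert \pi_\theta(s) - a^{E} \rVert_2^2 \,\big]
```

It is minimized with `MpiAdam` over `pi.get_trainable_variables()`; the validation split is only used for logging every `val_per_iter` iterations.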
|
@@ -1,116 +0,0 @@
|
||||
'''
|
||||
Data structure of the input .npz:
|
||||
the data is saved in python dictionary format with keys: 'acs', 'ep_rets', 'rews', 'obs'
|
||||
the value of each key is a list storing the expert trajectories sequentially
|
||||
a transition can be: (data['obs'][t], data['acs'][t], data['obs'][t+1]) and get reward data['rews'][t]
|
||||
'''
|
||||
|
||||
from baselines import logger
|
||||
import numpy as np
|
||||
|
||||
|
||||
class Dset(object):
|
||||
def __init__(self, inputs, labels, randomize):
|
||||
self.inputs = inputs
|
||||
self.labels = labels
|
||||
assert len(self.inputs) == len(self.labels)
|
||||
self.randomize = randomize
|
||||
self.num_pairs = len(inputs)
|
||||
self.init_pointer()
|
||||
|
||||
def init_pointer(self):
|
||||
self.pointer = 0
|
||||
if self.randomize:
|
||||
idx = np.arange(self.num_pairs)
|
||||
np.random.shuffle(idx)
|
||||
self.inputs = self.inputs[idx, :]
|
||||
self.labels = self.labels[idx, :]
|
||||
|
||||
def get_next_batch(self, batch_size):
|
||||
# if batch_size is negative -> return all
|
||||
if batch_size < 0:
|
||||
return self.inputs, self.labels
|
||||
if self.pointer + batch_size >= self.num_pairs:
|
||||
self.init_pointer()
|
||||
end = self.pointer + batch_size
|
||||
inputs = self.inputs[self.pointer:end, :]
|
||||
labels = self.labels[self.pointer:end, :]
|
||||
self.pointer = end
|
||||
return inputs, labels
|
||||
|
||||
|
||||
class Mujoco_Dset(object):
|
||||
def __init__(self, expert_path, train_fraction=0.7, traj_limitation=-1, randomize=True):
|
||||
traj_data = np.load(expert_path)
|
||||
if traj_limitation < 0:
|
||||
traj_limitation = len(traj_data['obs'])
|
||||
obs = traj_data['obs'][:traj_limitation]
|
||||
acs = traj_data['acs'][:traj_limitation]
|
||||
|
||||
def flatten(x):
|
||||
# x.shape = (E,), or (E, L, D)
|
||||
_, size = x[0].shape
|
||||
episode_length = [len(i) for i in x]
|
||||
y = np.zeros((sum(episode_length), size))
|
||||
start_idx = 0
|
||||
for l, x_i in zip(episode_length, x):
|
||||
y[start_idx:(start_idx+l)] = x_i
|
||||
start_idx += l
|
||||
return y
|
||||
self.obs = np.array(flatten(obs))
|
||||
self.acs = np.array(flatten(acs))
|
||||
self.rets = traj_data['ep_rets'][:traj_limitation]
|
||||
self.avg_ret = sum(self.rets)/len(self.rets)
|
||||
self.std_ret = np.std(np.array(self.rets))
|
||||
if len(self.acs) > 2:
|
||||
self.acs = np.squeeze(self.acs)
|
||||
assert len(self.obs) == len(self.acs)
|
||||
self.num_traj = min(traj_limitation, len(traj_data['obs']))
|
||||
self.num_transition = len(self.obs)
|
||||
self.randomize = randomize
|
||||
self.dset = Dset(self.obs, self.acs, self.randomize)
|
||||
# for behavior cloning
|
||||
self.train_set = Dset(self.obs[:int(self.num_transition*train_fraction), :],
|
||||
self.acs[:int(self.num_transition*train_fraction), :],
|
||||
self.randomize)
|
||||
self.val_set = Dset(self.obs[int(self.num_transition*train_fraction):, :],
|
||||
self.acs[int(self.num_transition*train_fraction):, :],
|
||||
self.randomize)
|
||||
self.log_info()
|
||||
|
||||
def log_info(self):
|
||||
logger.log("Total trajectorues: %d" % self.num_traj)
|
||||
logger.log("Total transitions: %d" % self.num_transition)
|
||||
logger.log("Average returns: %f" % self.avg_ret)
|
||||
logger.log("Std for returns: %f" % self.std_ret)
|
||||
|
||||
def get_next_batch(self, batch_size, split=None):
|
||||
if split is None:
|
||||
return self.dset.get_next_batch(batch_size)
|
||||
elif split == 'train':
|
||||
return self.train_set.get_next_batch(batch_size)
|
||||
elif split == 'val':
|
||||
return self.val_set.get_next_batch(batch_size)
|
||||
else:
|
||||
raise NotImplementedError
|
||||
|
||||
def plot(self):
|
||||
import matplotlib.pyplot as plt
|
||||
plt.hist(self.rets)
|
||||
plt.savefig("histogram_rets.png")
|
||||
plt.close()
|
||||
|
||||
|
||||
def test(expert_path, traj_limitation, plot):
|
||||
dset = Mujoco_Dset(expert_path, traj_limitation=traj_limitation)
|
||||
if plot:
|
||||
dset.plot()
|
||||
|
||||
if __name__ == '__main__':
|
||||
import argparse
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--expert_path", type=str, default="../data/deterministic.trpo.Hopper.0.00.npz")
|
||||
parser.add_argument("--traj_limitation", type=int, default=None)
|
||||
parser.add_argument("--plot", type=bool, default=False)
|
||||
args = parser.parse_args()
|
||||
test(args.expert_path, args.traj_limitation, args.plot)
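A minimal sketch of inspecting an expert file with the layout described in the `mujoco_dset` module docstring above. The path is a hypothetical example (the default `--expert_path` from `behavior_clone.py`); the key names come from the docstring.

```python
import numpy as np

# Hypothetical example path -- matches the default --expert_path in behavior_clone.py.
traj_data = np.load("data/deterministic.trpo.Hopper.0.00.npz")
# Note: if the episodes were stored as variable-length object arrays,
# newer NumPy versions may require np.load(..., allow_pickle=True).

print(traj_data.files)               # expected keys: 'obs', 'acs', 'rews', 'ep_rets'
obs, acs = traj_data["obs"], traj_data["acs"]
print(len(obs))                      # number of stored episodes
print(obs[0].shape, acs[0].shape)    # (episode_length, obs_dim), (episode_length, act_dim)
print(traj_data["ep_rets"][:5])      # per-episode returns (used for the upper bound in gail-eval.py)
```

`Mujoco_Dset.flatten` then concatenates the per-episode arrays into single `(num_transitions, dim)` matrices before building the train/validation `Dset`s.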
|
@@ -1,147 +0,0 @@
|
||||
'''
|
||||
This code is used to evaluate the imitators trained with different numbers of trajectories
|
||||
and plot the results in the same figure for easy comparison.
|
||||
'''
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import glob
|
||||
import gym
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
|
||||
from baselines.gail import run_mujoco
|
||||
from baselines.gail import mlp_policy
|
||||
from baselines.common import set_global_seeds, tf_util as U
|
||||
from baselines.common.misc_util import boolean_flag
|
||||
from baselines.gail.dataset.mujoco_dset import Mujoco_Dset
|
||||
|
||||
|
||||
plt.style.use('ggplot')
|
||||
CONFIG = {
|
||||
'traj_limitation': [1, 5, 10, 50],
|
||||
}
|
||||
|
||||
|
||||
def load_dataset(expert_path):
|
||||
dataset = Mujoco_Dset(expert_path=expert_path)
|
||||
return dataset
|
||||
|
||||
|
||||
def argsparser():
|
||||
parser = argparse.ArgumentParser('Do evaluation')
|
||||
parser.add_argument('--seed', type=int, default=0)
|
||||
parser.add_argument('--policy_hidden_size', type=int, default=100)
|
||||
parser.add_argument('--env', type=str, choices=['Hopper', 'Walker2d', 'HalfCheetah',
|
||||
'Humanoid', 'HumanoidStandup'])
|
||||
boolean_flag(parser, 'stochastic_policy', default=False, help='use stochastic/deterministic policy to evaluate')
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def evaluate_env(env_name, seed, policy_hidden_size, stochastic, reuse, prefix):
|
||||
|
||||
def get_checkpoint_dir(checkpoint_list, limit, prefix):
|
||||
for checkpoint in checkpoint_list:
|
||||
if ('limitation_'+str(limit) in checkpoint) and (prefix in checkpoint):
|
||||
return checkpoint
|
||||
return None
|
||||
|
||||
def policy_fn(name, ob_space, ac_space, reuse=False):
|
||||
return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
|
||||
reuse=reuse, hid_size=policy_hidden_size, num_hid_layers=2)
|
||||
|
||||
data_path = os.path.join('data', 'deterministic.trpo.' + env_name + '.0.00.npz')
|
||||
dataset = load_dataset(data_path)
|
||||
checkpoint_list = glob.glob(os.path.join('checkpoint', '*' + env_name + ".*"))
|
||||
log = {
|
||||
'traj_limitation': [],
|
||||
'upper_bound': [],
|
||||
'avg_ret': [],
|
||||
'avg_len': [],
|
||||
'normalized_ret': []
|
||||
}
|
||||
for i, limit in enumerate(CONFIG['traj_limitation']):
|
||||
# Do one evaluation
|
||||
upper_bound = sum(dataset.rets[:limit])/limit
|
||||
checkpoint_dir = get_checkpoint_dir(checkpoint_list, limit, prefix=prefix)
|
||||
checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)
|
||||
env = gym.make(env_name + '-v1')
|
||||
env.seed(seed)
|
||||
print('Trajectory limitation: {}, Load checkpoint: {}, '.format(limit, checkpoint_path))
|
||||
avg_len, avg_ret = run_mujoco.runner(env,
|
||||
policy_fn,
|
||||
checkpoint_path,
|
||||
timesteps_per_batch=1024,
|
||||
number_trajs=10,
|
||||
stochastic_policy=stochastic,
|
||||
reuse=((i != 0) or reuse))
|
||||
normalized_ret = avg_ret/upper_bound
|
||||
print('Upper bound: {}, evaluation returns: {}, normalized scores: {}'.format(
|
||||
upper_bound, avg_ret, normalized_ret))
|
||||
log['traj_limitation'].append(limit)
|
||||
log['upper_bound'].append(upper_bound)
|
||||
log['avg_ret'].append(avg_ret)
|
||||
log['avg_len'].append(avg_len)
|
||||
log['normalized_ret'].append(normalized_ret)
|
||||
env.close()
|
||||
return log
|
||||
|
||||
|
||||
def plot(env_name, bc_log, gail_log, stochastic):
|
||||
upper_bound = bc_log['upper_bound']
|
||||
bc_avg_ret = bc_log['avg_ret']
|
||||
gail_avg_ret = gail_log['avg_ret']
|
||||
plt.plot(CONFIG['traj_limitation'], upper_bound)
|
||||
plt.plot(CONFIG['traj_limitation'], bc_avg_ret)
|
||||
plt.plot(CONFIG['traj_limitation'], gail_avg_ret)
|
||||
plt.xlabel('Number of expert trajectories')
|
||||
plt.ylabel('Accumulated reward')
|
||||
plt.title('{} unnormalized scores'.format(env_name))
|
||||
plt.legend(['expert', 'bc-imitator', 'gail-imitator'], loc='lower right')
|
||||
plt.grid(b=True, which='major', color='gray', linestyle='--')
|
||||
if stochastic:
|
||||
title_name = 'result/{}-unnormalized-stochastic-scores.png'.format(env_name)
|
||||
else:
|
||||
title_name = 'result/{}-unnormalized-deterministic-scores.png'.format(env_name)
|
||||
plt.savefig(title_name)
|
||||
plt.close()
|
||||
|
||||
bc_normalized_ret = bc_log['normalized_ret']
|
||||
gail_normalized_ret = gail_log['normalized_ret']
|
||||
plt.plot(CONFIG['traj_limitation'], np.ones(len(CONFIG['traj_limitation'])))
|
||||
plt.plot(CONFIG['traj_limitation'], bc_normalized_ret)
|
||||
plt.plot(CONFIG['traj_limitation'], gail_normalized_ret)
|
||||
plt.xlabel('Number of expert trajectories')
|
||||
plt.ylabel('Normalized performance')
|
||||
plt.title('{} normalized scores'.format(env_name))
|
||||
plt.legend(['expert', 'bc-imitator', 'gail-imitator'], loc='lower right')
|
||||
plt.grid(b=True, which='major', color='gray', linestyle='--')
|
||||
if stochastic:
|
||||
title_name = 'result/{}-normalized-stochastic-scores.png'.format(env_name)
|
||||
else:
|
||||
title_name = 'result/{}-normalized-deterministic-scores.png'.format(env_name)
|
||||
plt.ylim(0, 1.6)
|
||||
plt.savefig(title_name)
|
||||
plt.close()
|
||||
|
||||
|
||||
def main(args):
|
||||
U.make_session(num_cpu=1).__enter__()
|
||||
set_global_seeds(args.seed)
|
||||
print('Evaluating {}'.format(args.env))
|
||||
bc_log = evaluate_env(args.env, args.seed, args.policy_hidden_size,
|
||||
args.stochastic_policy, False, 'BC')
|
||||
print('Evaluation for {}'.format(args.env))
|
||||
print(bc_log)
|
||||
gail_log = evaluate_env(args.env, args.seed, args.policy_hidden_size,
|
||||
args.stochastic_policy, True, 'gail')
|
||||
print('Evaluation for {}'.format(args.env))
|
||||
print(gail_log)
|
||||
plot(args.env, bc_log, gail_log, args.stochastic_policy)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = argsparser()
|
||||
main(args)
|
@@ -1,75 +0,0 @@
|
||||
'''
|
||||
Adapted from baselines/ppo1/mlp_policy.py with two simple modifications:
|
||||
(1) add reuse argument
|
||||
(2) cache the `stochastic` placeholder
|
||||
'''
|
||||
import tensorflow as tf
|
||||
import gym
|
||||
|
||||
import baselines.common.tf_util as U
|
||||
from baselines.common.mpi_running_mean_std import RunningMeanStd
|
||||
from baselines.common.distributions import make_pdtype
|
||||
from baselines.acktr.utils import dense
|
||||
|
||||
|
||||
class MlpPolicy(object):
|
||||
recurrent = False
|
||||
|
||||
def __init__(self, name, reuse=False, *args, **kwargs):
|
||||
with tf.variable_scope(name):
|
||||
if reuse:
|
||||
tf.get_variable_scope().reuse_variables()
|
||||
self._init(*args, **kwargs)
|
||||
self.scope = tf.get_variable_scope().name
|
||||
|
||||
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
|
||||
assert isinstance(ob_space, gym.spaces.Box)
|
||||
|
||||
self.pdtype = pdtype = make_pdtype(ac_space)
|
||||
sequence_length = None
|
||||
|
||||
ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
|
||||
|
||||
with tf.variable_scope("obfilter"):
|
||||
self.ob_rms = RunningMeanStd(shape=ob_space.shape)
|
||||
|
||||
obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
|
||||
last_out = obz
|
||||
for i in range(num_hid_layers):
|
||||
last_out = tf.nn.tanh(dense(last_out, hid_size, "vffc%i" % (i+1), weight_init=U.normc_initializer(1.0)))
|
||||
self.vpred = dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]
|
||||
|
||||
last_out = obz
|
||||
for i in range(num_hid_layers):
|
||||
last_out = tf.nn.tanh(dense(last_out, hid_size, "polfc%i" % (i+1), weight_init=U.normc_initializer(1.0)))
|
||||
|
||||
if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
|
||||
mean = dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01))
|
||||
logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
|
||||
pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
|
||||
else:
|
||||
pdparam = dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))
|
||||
|
||||
self.pd = pdtype.pdfromflat(pdparam)
|
||||
|
||||
self.state_in = []
|
||||
self.state_out = []
|
||||
|
||||
# change for BC
|
||||
stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
|
||||
ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
|
||||
self.ac = ac
|
||||
self._act = U.function([stochastic, ob], [ac, self.vpred])
|
||||
|
||||
def act(self, stochastic, ob):
|
||||
ac1, vpred1 = self._act(stochastic, ob[None])
|
||||
return ac1[0], vpred1[0]
|
||||
|
||||
def get_variables(self):
|
||||
return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)
|
||||
|
||||
def get_trainable_variables(self):
|
||||
return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
|
||||
|
||||
def get_initial_state(self):
|
||||
return []
|
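For context, a minimal usage sketch of this policy class, following the same pattern as `policy_fn` and `runner` in the GAIL training script below. The environment id and hidden sizes are illustrative, not prescribed by this diff:

```python
# Illustrative only: build the MLP policy and query one action, mirroring how
# policy_fn/runner in run_mujoco.py use it (env id and sizes are examples).
import gym
import baselines.common.tf_util as U
from baselines.gail import mlp_policy

env = gym.make('Hopper-v2')
with U.make_session(num_cpu=1):
    pi = mlp_policy.MlpPolicy(name='pi',
                              ob_space=env.observation_space,
                              ac_space=env.action_space,
                              hid_size=100, num_hid_layers=2)
    U.initialize()
    ob = env.reset()
    ac, vpred = pi.act(stochastic=True, ob=ob)  # sampled action and V(s)
```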
(20 deleted image files: the per-environment normalized/un-normalized deterministic and stochastic score plots referenced in the results page below.)
@@ -1,53 +0,0 @@
# Results of GAIL/BC on Mujoco

Here are extensive experimental results of applying GAIL/BC to Mujoco environments, including
Hopper-v1, Walker2d-v1, HalfCheetah-v1, Humanoid-v1, and HumanoidStandup-v1. Every imitator is evaluated with seed 0.

## Results

### Training through iterations

- Hopper-v1
<img src='hopper-training.png'>

- HalfCheetah-v1
<img src='halfcheetah-training.png'>

- Walker2d-v1
<img src='walker2d-training.png'>

- Humanoid-v1
<img src='humanoid-training.png'>

- HumanoidStandup-v1
<img src='humanoidstandup-training.png'>

For details about GAIL training (e.g., adversarial loss, discriminator accuracy, etc.), please see [here](https://drive.google.com/drive/folders/1nnU8dqAV9i37-_5_vWIspyFUJFQLCsDD?usp=sharing)

### Deterministic Policy (set std=0)
| | Un-normalized | Normalized |
|---|---|---|
| Hopper-v1 | <img src='Hopper-unnormalized-deterministic-scores.png'> | <img src='Hopper-normalized-deterministic-scores.png'> |
| HalfCheetah-v1 | <img src='HalfCheetah-unnormalized-deterministic-scores.png'> | <img src='HalfCheetah-normalized-deterministic-scores.png'> |
| Walker2d-v1 | <img src='Walker2d-unnormalized-deterministic-scores.png'> | <img src='Walker2d-normalized-deterministic-scores.png'> |
| Humanoid-v1 | <img src='Humanoid-unnormalized-deterministic-scores.png'> | <img src='Humanoid-normalized-deterministic-scores.png'> |
| HumanoidStandup-v1 | <img src='HumanoidStandup-unnormalized-deterministic-scores.png'> | <img src='HumanoidStandup-normalized-deterministic-scores.png'> |

### Stochastic Policy
| | Un-normalized | Normalized |
|---|---|---|
| Hopper-v1 | <img src='Hopper-unnormalized-stochastic-scores.png'> | <img src='Hopper-normalized-stochastic-scores.png'> |
| HalfCheetah-v1 | <img src='HalfCheetah-unnormalized-stochastic-scores.png'> | <img src='HalfCheetah-normalized-stochastic-scores.png'> |
| Walker2d-v1 | <img src='Walker2d-unnormalized-stochastic-scores.png'> | <img src='Walker2d-normalized-stochastic-scores.png'> |
| Humanoid-v1 | <img src='Humanoid-unnormalized-stochastic-scores.png'> | <img src='Humanoid-normalized-stochastic-scores.png'> |
| HumanoidStandup-v1 | <img src='HumanoidStandup-unnormalized-stochastic-scores.png'> | <img src='HumanoidStandup-normalized-stochastic-scores.png'> |

### Details about the GAIL imitators

For every environment, imitators are trained with 1, 5, 10, and 50 trajectories, where each trajectory contains at most
1024 transitions, using seeds 0, 1, 2, and 3, respectively.

### Details about the BC imitators

All BC imitators are trained with seed 0.
(5 deleted image files: the per-environment training-curve plots referenced above.)
@@ -1,239 +0,0 @@
'''
Disclaimer: this code is based heavily on trpo_mpi at @openai/baselines and @openai/imitation
'''

import argparse
import os.path as osp
import logging
from mpi4py import MPI
from tqdm import tqdm

import numpy as np
import gym

from baselines.gail import mlp_policy
from baselines.common import set_global_seeds, tf_util as U
from baselines.common.misc_util import boolean_flag
from baselines import bench
from baselines import logger
from baselines.gail.dataset.mujoco_dset import Mujoco_Dset
from baselines.gail.adversary import TransitionClassifier


def argsparser():
    parser = argparse.ArgumentParser("Tensorflow Implementation of GAIL")
    parser.add_argument('--env_id', help='environment ID', default='Hopper-v2')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--expert_path', type=str, default='data/deterministic.trpo.Hopper.0.00.npz')
    parser.add_argument('--checkpoint_dir', help='the directory to save model', default='checkpoint')
    parser.add_argument('--log_dir', help='the directory to save log file', default='log')
    parser.add_argument('--load_model_path', help='if provided, load the model', type=str, default=None)
    # Task
    parser.add_argument('--task', type=str, choices=['train', 'evaluate', 'sample'], default='train')
    # for evaluation
    boolean_flag(parser, 'stochastic_policy', default=False, help='use stochastic/deterministic policy to evaluate')
    boolean_flag(parser, 'save_sample', default=False, help='save the trajectories or not')
    # Mujoco Dataset Configuration
    parser.add_argument('--traj_limitation', type=int, default=-1)
    # Optimization Configuration
    parser.add_argument('--g_step', help='number of steps to train policy in each epoch', type=int, default=3)
    parser.add_argument('--d_step', help='number of steps to train discriminator in each epoch', type=int, default=1)
    # Network Configuration (Using MLP Policy)
    parser.add_argument('--policy_hidden_size', type=int, default=100)
    parser.add_argument('--adversary_hidden_size', type=int, default=100)
    # Algorithms Configuration
    parser.add_argument('--algo', type=str, choices=['trpo', 'ppo'], default='trpo')
    parser.add_argument('--max_kl', type=float, default=0.01)
    parser.add_argument('--policy_entcoeff', help='entropy coefficient of policy', type=float, default=0)
    parser.add_argument('--adversary_entcoeff', help='entropy coefficient of discriminator', type=float, default=1e-3)
    # Training Configuration
    parser.add_argument('--save_per_iter', help='save model every xx iterations', type=int, default=100)
    parser.add_argument('--num_timesteps', help='total number of training timesteps', type=int, default=5e6)
    # Behavior Cloning
    boolean_flag(parser, 'pretrained', default=False, help='Use BC to pretrain')
    parser.add_argument('--BC_max_iter', help='Max iteration for training BC', type=int, default=1e4)
    return parser.parse_args()


def get_task_name(args):
    task_name = args.algo + "_gail."
    if args.pretrained:
        task_name += "with_pretrained."
    if args.traj_limitation != np.inf:
        task_name += "transition_limitation_%d." % args.traj_limitation
    task_name += args.env_id.split("-")[0]
    task_name = task_name + ".g_step_" + str(args.g_step) + ".d_step_" + str(args.d_step) + \
        ".policy_entcoeff_" + str(args.policy_entcoeff) + ".adversary_entcoeff_" + str(args.adversary_entcoeff)
    task_name += ".seed_" + str(args.seed)
    return task_name


def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2)
    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    if args.task == 'train':
        dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)
        reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff)
        train(env,
              args.seed,
              policy_fn,
              reward_giver,
              dataset,
              args.algo,
              args.g_step,
              args.d_step,
              args.policy_entcoeff,
              args.num_timesteps,
              args.save_per_iter,
              args.checkpoint_dir,
              args.log_dir,
              args.pretrained,
              args.BC_max_iter,
              task_name
              )
    elif args.task == 'evaluate':
        runner(env,
               policy_fn,
               args.load_model_path,
               timesteps_per_batch=1024,
               number_trajs=10,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample
               )
    else:
        raise NotImplementedError
    env.close()


def train(env, seed, policy_fn, reward_giver, dataset, algo,
          g_step, d_step, policy_entcoeff, num_timesteps, save_per_iter,
          checkpoint_dir, log_dir, pretrained, BC_max_iter, task_name=None):

    pretrained_weight = None
    if pretrained and (BC_max_iter > 0):
        # Pretrain with behavior cloning
        from baselines.gail import behavior_clone
        pretrained_weight = behavior_clone.learn(env, policy_fn, dataset,
                                                 max_iters=BC_max_iter)

    if algo == 'trpo':
        from baselines.gail import trpo_mpi
        # Set up for MPI seed
        rank = MPI.COMM_WORLD.Get_rank()
        if rank != 0:
            logger.set_level(logger.DISABLED)
        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
        set_global_seeds(workerseed)
        env.seed(workerseed)
        trpo_mpi.learn(env, policy_fn, reward_giver, dataset, rank,
                       pretrained=pretrained, pretrained_weight=pretrained_weight,
                       g_step=g_step, d_step=d_step,
                       entcoeff=policy_entcoeff,
                       max_timesteps=num_timesteps,
                       ckpt_dir=checkpoint_dir, log_dir=log_dir,
                       save_per_iter=save_per_iter,
                       timesteps_per_batch=1024,
                       max_kl=0.01, cg_iters=10, cg_damping=0.1,
                       gamma=0.995, lam=0.97,
                       vf_iters=5, vf_stepsize=1e-3,
                       task_name=task_name)
    else:
        raise NotImplementedError


def runner(env, policy_func, load_model_path, timesteps_per_batch, number_trajs,
           stochastic_policy, save=False, reuse=False):

    # Setup network
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space, reuse=reuse)
    U.initialize()
    # Prepare for rollouts
    # ----------------------------------------
    U.load_state(load_model_path)

    obs_list = []
    acs_list = []
    len_list = []
    ret_list = []
    for _ in tqdm(range(number_trajs)):
        traj = traj_1_generator(pi, env, timesteps_per_batch, stochastic=stochastic_policy)
        obs, acs, ep_len, ep_ret = traj['ob'], traj['ac'], traj['ep_len'], traj['ep_ret']
        obs_list.append(obs)
        acs_list.append(acs)
        len_list.append(ep_len)
        ret_list.append(ep_ret)
    if stochastic_policy:
        print('stochastic policy:')
    else:
        print('deterministic policy:')
    if save:
        filename = load_model_path.split('/')[-1] + '.' + env.spec.id
        np.savez(filename, obs=np.array(obs_list), acs=np.array(acs_list),
                 lens=np.array(len_list), rets=np.array(ret_list))
    avg_len = sum(len_list)/len(len_list)
    avg_ret = sum(ret_list)/len(ret_list)
    print("Average length:", avg_len)
    print("Average return:", avg_ret)
    return avg_len, avg_ret


# Sample one trajectory (until trajectory end)
def traj_1_generator(pi, env, horizon, stochastic):

    t = 0
    ac = env.action_space.sample()  # not used, just so we have the datatype
    new = True  # marks if we're on first timestep of an episode

    ob = env.reset()
    cur_ep_ret = 0  # return in current episode
    cur_ep_len = 0  # len of current episode

    # Initialize history arrays
    obs = []
    rews = []
    news = []
    acs = []

    while True:
        ac, vpred = pi.act(stochastic, ob)
        obs.append(ob)
        news.append(new)
        acs.append(ac)

        ob, rew, new, _ = env.step(ac)
        rews.append(rew)

        cur_ep_ret += rew
        cur_ep_len += 1
        if new or t >= horizon:
            break
        t += 1

    obs = np.array(obs)
    rews = np.array(rews)
    news = np.array(news)
    acs = np.array(acs)
    traj = {"ob": obs, "rew": rews, "new": news, "ac": acs,
            "ep_ret": cur_ep_ret, "ep_len": cur_ep_len}
    return traj


if __name__ == '__main__':
    args = argsparser()
    main(args)
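For reference, a minimal sketch of the checkpoint/log naming convention that `get_task_name` above produces for the parser defaults. The snippet mirrors the string operations in the function rather than importing it, and the values are simply the defaults shown above:

```python
# Illustration only: reproduces get_task_name()'s output for the default args.
algo, env_id, seed = 'trpo', 'Hopper-v2', 0
g_step, d_step = 3, 1
policy_entcoeff, adversary_entcoeff = 0, 1e-3
traj_limitation = -1  # not np.inf, so the limitation suffix is included

task_name = algo + "_gail."
task_name += "transition_limitation_%d." % traj_limitation
task_name += env_id.split("-")[0]
task_name += ".g_step_" + str(g_step) + ".d_step_" + str(d_step)
task_name += ".policy_entcoeff_" + str(policy_entcoeff)
task_name += ".adversary_entcoeff_" + str(adversary_entcoeff)
task_name += ".seed_" + str(seed)
print(task_name)
# trpo_gail.transition_limitation_-1.Hopper.g_step_3.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0
```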
@@ -1,45 +0,0 @@
'''
This code is based heavily on https://github.com/carpedm20/deep-rl-tensorflow/blob/master/agents/statistic.py
'''

import tensorflow as tf
import numpy as np

import baselines.common.tf_util as U


class stats():

    def __init__(self, scalar_keys=[], histogram_keys=[]):
        self.scalar_keys = scalar_keys
        self.histogram_keys = histogram_keys
        self.scalar_summaries = []
        self.scalar_summaries_ph = []
        self.histogram_summaries_ph = []
        self.histogram_summaries = []
        with tf.variable_scope('summary'):
            for k in scalar_keys:
                ph = tf.placeholder('float32', None, name=k+'.scalar.summary')
                sm = tf.summary.scalar(k+'.scalar.summary', ph)
                self.scalar_summaries_ph.append(ph)
                self.scalar_summaries.append(sm)
            for k in histogram_keys:
                ph = tf.placeholder('float32', None, name=k+'.histogram.summary')
                sm = tf.summary.scalar(k+'.histogram.summary', ph)
                self.histogram_summaries_ph.append(ph)
                self.histogram_summaries.append(sm)

        self.summaries = tf.summary.merge(self.scalar_summaries+self.histogram_summaries)

    def add_all_summary(self, writer, values, iter):
        # Note that the order of the incoming ```values``` should be the same as that of the
        # ```scalar_keys``` given in ```__init__```
        if np.sum(np.isnan(values)+0) != 0:
            return
        sess = U.get_session()
        keys = self.scalar_summaries_ph + self.histogram_summaries_ph
        feed_dict = {}
        for k, v in zip(keys, values):
            feed_dict.update({k: v})
        summaries_str = sess.run(self.summaries, feed_dict)
        writer.add_summary(summaries_str, iter)
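A minimal usage sketch of this helper, assuming a TensorBoard `FileWriter` (the log directory name and the logged values are illustrative; the key list matches the one used by the TRPO/GAIL training loop below):

```python
# Illustration only: log a few scalar values to TensorBoard via `stats`.
import tensorflow as tf
import baselines.common.tf_util as U
from baselines.gail.statistics import stats

with U.make_session(num_cpu=1):
    ep_stats = stats(["True_rewards", "Rewards", "Episode_length"])
    writer = tf.summary.FileWriter("/tmp/gail_summaries",
                                   U.get_session().graph)
    # The order of `values` must match the keys passed to the constructor.
    ep_stats.add_all_summary(writer, values=[123.4, 56.7, 1000], iter=0)
```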
@@ -1,354 +0,0 @@
|
||||
'''
|
||||
Disclaimer: The TRPO part relies heavily on trpo_mpi at @openai/baselines
|
||||
'''
|
||||
|
||||
import time
|
||||
import os
|
||||
from contextlib import contextmanager
|
||||
from mpi4py import MPI
|
||||
from collections import deque
|
||||
|
||||
import tensorflow as tf
|
||||
import numpy as np
|
||||
|
||||
import baselines.common.tf_util as U
|
||||
from baselines.common import explained_variance, zipsame, dataset, fmt_row
|
||||
from baselines import logger
|
||||
from baselines.common import colorize
|
||||
from baselines.common.mpi_adam import MpiAdam
|
||||
from baselines.common.cg import cg
|
||||
from baselines.gail.statistics import stats
|
||||
|
||||
|
||||
def traj_segment_generator(pi, env, reward_giver, horizon, stochastic):
|
||||
|
||||
# Initialize state variables
|
||||
t = 0
|
||||
ac = env.action_space.sample()
|
||||
new = True
|
||||
rew = 0.0
|
||||
true_rew = 0.0
|
||||
ob = env.reset()
|
||||
|
||||
cur_ep_ret = 0
|
||||
cur_ep_len = 0
|
||||
cur_ep_true_ret = 0
|
||||
ep_true_rets = []
|
||||
ep_rets = []
|
||||
ep_lens = []
|
||||
|
||||
# Initialize history arrays
|
||||
obs = np.array([ob for _ in range(horizon)])
|
||||
true_rews = np.zeros(horizon, 'float32')
|
||||
rews = np.zeros(horizon, 'float32')
|
||||
vpreds = np.zeros(horizon, 'float32')
|
||||
news = np.zeros(horizon, 'int32')
|
||||
acs = np.array([ac for _ in range(horizon)])
|
||||
prevacs = acs.copy()
|
||||
|
||||
while True:
|
||||
prevac = ac
|
||||
ac, vpred = pi.act(stochastic, ob)
|
||||
# Slight weirdness here because we need value function at time T
|
||||
# before returning segment [0, T-1] so we get the correct
|
||||
# terminal value
|
||||
if t > 0 and t % horizon == 0:
|
||||
yield {"ob": obs, "rew": rews, "vpred": vpreds, "new": news,
|
||||
"ac": acs, "prevac": prevacs, "nextvpred": vpred * (1 - new),
|
||||
"ep_rets": ep_rets, "ep_lens": ep_lens, "ep_true_rets": ep_true_rets}
|
||||
_, vpred = pi.act(stochastic, ob)
|
||||
# Be careful!!! if you change the downstream algorithm to aggregate
|
||||
# several of these batches, then be sure to do a deepcopy
|
||||
ep_rets = []
|
||||
ep_true_rets = []
|
||||
ep_lens = []
|
||||
i = t % horizon
|
||||
obs[i] = ob
|
||||
vpreds[i] = vpred
|
||||
news[i] = new
|
||||
acs[i] = ac
|
||||
prevacs[i] = prevac
|
||||
|
||||
rew = reward_giver.get_reward(ob, ac)
|
||||
ob, true_rew, new, _ = env.step(ac)
|
||||
rews[i] = rew
|
||||
true_rews[i] = true_rew
|
||||
|
||||
cur_ep_ret += rew
|
||||
cur_ep_true_ret += true_rew
|
||||
cur_ep_len += 1
|
||||
if new:
|
||||
ep_rets.append(cur_ep_ret)
|
||||
ep_true_rets.append(cur_ep_true_ret)
|
||||
ep_lens.append(cur_ep_len)
|
||||
cur_ep_ret = 0
|
||||
cur_ep_true_ret = 0
|
||||
cur_ep_len = 0
|
||||
ob = env.reset()
|
||||
t += 1
|
||||
|
||||
|
||||
def add_vtarg_and_adv(seg, gamma, lam):
    new = np.append(seg["new"], 0)  # last element is only used for last vtarg, but we already zeroed it if last new = 1
    vpred = np.append(seg["vpred"], seg["nextvpred"])
    T = len(seg["rew"])
    seg["adv"] = gaelam = np.empty(T, 'float32')
    rew = seg["rew"]
    lastgaelam = 0
    for t in reversed(range(T)):
        nonterminal = 1-new[t+1]
        delta = rew[t] + gamma * vpred[t+1] * nonterminal - vpred[t]
        gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["tdlamret"] = seg["adv"] + seg["vpred"]
|
||||
|
||||
|
||||
def learn(env, policy_func, reward_giver, expert_dataset, rank,
|
||||
pretrained, pretrained_weight, *,
|
||||
g_step, d_step, entcoeff, save_per_iter,
|
||||
ckpt_dir, log_dir, timesteps_per_batch, task_name,
|
||||
gamma, lam,
|
||||
max_kl, cg_iters, cg_damping=1e-2,
|
||||
vf_stepsize=3e-4, d_stepsize=3e-4, vf_iters=3,
|
||||
max_timesteps=0, max_episodes=0, max_iters=0,
|
||||
callback=None
|
||||
):
|
||||
|
||||
nworkers = MPI.COMM_WORLD.Get_size()
|
||||
rank = MPI.COMM_WORLD.Get_rank()
|
||||
np.set_printoptions(precision=3)
|
||||
# Setup losses and stuff
|
||||
# ----------------------------------------
|
||||
ob_space = env.observation_space
|
||||
ac_space = env.action_space
|
||||
pi = policy_func("pi", ob_space, ac_space, reuse=(pretrained_weight is not None))
|
||||
oldpi = policy_func("oldpi", ob_space, ac_space)
|
||||
atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable)
|
||||
ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return
|
||||
|
||||
ob = U.get_placeholder_cached(name="ob")
|
||||
ac = pi.pdtype.sample_placeholder([None])
|
||||
|
||||
kloldnew = oldpi.pd.kl(pi.pd)
|
||||
ent = pi.pd.entropy()
|
||||
meankl = tf.reduce_mean(kloldnew)
|
||||
meanent = tf.reduce_mean(ent)
|
||||
entbonus = entcoeff * meanent
|
||||
|
||||
vferr = tf.reduce_mean(tf.square(pi.vpred - ret))
|
||||
|
||||
ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold
|
||||
surrgain = tf.reduce_mean(ratio * atarg)
|
||||
|
||||
optimgain = surrgain + entbonus
|
||||
losses = [optimgain, meankl, entbonus, surrgain, meanent]
|
||||
loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]
|
||||
|
||||
dist = meankl
|
||||
|
||||
all_var_list = pi.get_trainable_variables()
|
||||
var_list = [v for v in all_var_list if v.name.startswith("pi/pol") or v.name.startswith("pi/logstd")]
|
||||
vf_var_list = [v for v in all_var_list if v.name.startswith("pi/vff")]
|
||||
assert len(var_list) == len(vf_var_list) + 1
|
||||
d_adam = MpiAdam(reward_giver.get_trainable_variables())
|
||||
vfadam = MpiAdam(vf_var_list)
|
||||
|
||||
get_flat = U.GetFlat(var_list)
|
||||
set_from_flat = U.SetFromFlat(var_list)
|
||||
klgrads = tf.gradients(dist, var_list)
|
||||
flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
|
||||
shapes = [var.get_shape().as_list() for var in var_list]
|
||||
start = 0
|
||||
tangents = []
|
||||
for shape in shapes:
|
||||
sz = U.intprod(shape)
|
||||
tangents.append(tf.reshape(flat_tangent[start:start+sz], shape))
|
||||
start += sz
|
||||
gvp = tf.add_n([tf.reduce_sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) # pylint: disable=E1111
|
||||
fvp = U.flatgrad(gvp, var_list)
|
||||
|
||||
assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv)
|
||||
for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
|
||||
compute_losses = U.function([ob, ac, atarg], losses)
|
||||
compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)])
|
||||
compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
|
||||
compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list))
|
||||
|
||||
@contextmanager
|
||||
def timed(msg):
|
||||
if rank == 0:
|
||||
print(colorize(msg, color='magenta'))
|
||||
tstart = time.time()
|
||||
yield
|
||||
print(colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta'))
|
||||
else:
|
||||
yield
|
||||
|
||||
def allmean(x):
|
||||
assert isinstance(x, np.ndarray)
|
||||
out = np.empty_like(x)
|
||||
MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
|
||||
out /= nworkers
|
||||
return out
|
||||
|
||||
U.initialize()
|
||||
th_init = get_flat()
|
||||
MPI.COMM_WORLD.Bcast(th_init, root=0)
|
||||
set_from_flat(th_init)
|
||||
d_adam.sync()
|
||||
vfadam.sync()
|
||||
if rank == 0:
|
||||
print("Init param sum", th_init.sum(), flush=True)
|
||||
|
||||
# Prepare for rollouts
|
||||
# ----------------------------------------
|
||||
seg_gen = traj_segment_generator(pi, env, reward_giver, timesteps_per_batch, stochastic=True)
|
||||
|
||||
episodes_so_far = 0
|
||||
timesteps_so_far = 0
|
||||
iters_so_far = 0
|
||||
tstart = time.time()
|
||||
lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths
|
||||
rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards
|
||||
true_rewbuffer = deque(maxlen=40)
|
||||
|
||||
assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1
|
||||
|
||||
g_loss_stats = stats(loss_names)
|
||||
d_loss_stats = stats(reward_giver.loss_name)
|
||||
ep_stats = stats(["True_rewards", "Rewards", "Episode_length"])
|
||||
# if provide pretrained weight
|
||||
if pretrained_weight is not None:
|
||||
U.load_state(pretrained_weight, var_list=pi.get_variables())
|
||||
|
||||
while True:
|
||||
if callback: callback(locals(), globals())
|
||||
if max_timesteps and timesteps_so_far >= max_timesteps:
|
||||
break
|
||||
elif max_episodes and episodes_so_far >= max_episodes:
|
||||
break
|
||||
elif max_iters and iters_so_far >= max_iters:
|
||||
break
|
||||
|
||||
# Save model
|
||||
if rank == 0 and iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
|
||||
fname = os.path.join(ckpt_dir, task_name)
|
||||
os.makedirs(os.path.dirname(fname), exist_ok=True)
|
||||
saver = tf.train.Saver()
|
||||
saver.save(tf.get_default_session(), fname)
|
||||
|
||||
logger.log("********** Iteration %i ************" % iters_so_far)
|
||||
|
||||
def fisher_vector_product(p):
|
||||
return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p
|
||||
# ------------------ Update G ------------------
|
||||
logger.log("Optimizing Policy...")
|
||||
for _ in range(g_step):
|
||||
with timed("sampling"):
|
||||
seg = seg_gen.__next__()
|
||||
add_vtarg_and_adv(seg, gamma, lam)
|
||||
# ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
|
||||
ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
|
||||
vpredbefore = seg["vpred"]  # predicted value function before update
|
||||
atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate
|
||||
|
||||
if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy
|
||||
|
||||
args = seg["ob"], seg["ac"], atarg
|
||||
fvpargs = [arr[::5] for arr in args]
|
||||
|
||||
assign_old_eq_new() # set old parameter values to new parameter values
|
||||
with timed("computegrad"):
|
||||
*lossbefore, g = compute_lossandgrad(*args)
|
||||
lossbefore = allmean(np.array(lossbefore))
|
||||
g = allmean(g)
|
||||
if np.allclose(g, 0):
|
||||
logger.log("Got zero gradient. not updating")
|
||||
else:
|
||||
with timed("cg"):
|
||||
stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0)
|
||||
assert np.isfinite(stepdir).all()
|
||||
shs = .5*stepdir.dot(fisher_vector_product(stepdir))
|
||||
lm = np.sqrt(shs / max_kl)
|
||||
# logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
|
||||
fullstep = stepdir / lm
|
||||
expectedimprove = g.dot(fullstep)
|
||||
surrbefore = lossbefore[0]
|
||||
stepsize = 1.0
|
||||
thbefore = get_flat()
|
||||
for _ in range(10):
|
||||
thnew = thbefore + fullstep * stepsize
|
||||
set_from_flat(thnew)
|
||||
meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args)))
|
||||
improve = surr - surrbefore
|
||||
logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve))
|
||||
if not np.isfinite(meanlosses).all():
|
||||
logger.log("Got non-finite value of losses -- bad!")
|
||||
elif kl > max_kl * 1.5:
|
||||
logger.log("violated KL constraint. shrinking step.")
|
||||
elif improve < 0:
|
||||
logger.log("surrogate didn't improve. shrinking step.")
|
||||
else:
|
||||
logger.log("Stepsize OK!")
|
||||
break
|
||||
stepsize *= .5
|
||||
else:
|
||||
logger.log("couldn't compute a good step")
|
||||
set_from_flat(thbefore)
|
||||
if nworkers > 1 and iters_so_far % 20 == 0:
|
||||
paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum())) # list of tuples
|
||||
assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:])
|
||||
with timed("vf"):
|
||||
for _ in range(vf_iters):
|
||||
for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]),
|
||||
include_final_partial_batch=False, batch_size=128):
|
||||
if hasattr(pi, "ob_rms"):
|
||||
pi.ob_rms.update(mbob) # update running mean/std for policy
|
||||
g = allmean(compute_vflossandgrad(mbob, mbret))
|
||||
vfadam.update(g, vf_stepsize)
|
||||
|
||||
g_losses = meanlosses
|
||||
for (lossname, lossval) in zip(loss_names, meanlosses):
|
||||
logger.record_tabular(lossname, lossval)
|
||||
logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))
|
||||
# ------------------ Update D ------------------
|
||||
logger.log("Optimizing Discriminator...")
|
||||
logger.log(fmt_row(13, reward_giver.loss_name))
|
||||
ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob))
|
||||
batch_size = len(ob) // d_step
|
||||
d_losses = [] # list of tuples, each of which gives the loss for a minibatch
|
||||
for ob_batch, ac_batch in dataset.iterbatches((ob, ac),
|
||||
include_final_partial_batch=False,
|
||||
batch_size=batch_size):
|
||||
ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch))
|
||||
# update running mean/std for reward_giver
|
||||
if hasattr(reward_giver, "obs_rms"): reward_giver.obs_rms.update(np.concatenate((ob_batch, ob_expert), 0))
|
||||
*newlosses, g = reward_giver.lossandgrad(ob_batch, ac_batch, ob_expert, ac_expert)
|
||||
d_adam.update(allmean(g), d_stepsize)
|
||||
d_losses.append(newlosses)
|
||||
logger.log(fmt_row(13, np.mean(d_losses, axis=0)))
|
||||
|
||||
lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"]) # local values
|
||||
listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples
|
||||
lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs))
|
||||
true_rewbuffer.extend(true_rets)
|
||||
lenbuffer.extend(lens)
|
||||
rewbuffer.extend(rews)
|
||||
|
||||
logger.record_tabular("EpLenMean", np.mean(lenbuffer))
|
||||
logger.record_tabular("EpRewMean", np.mean(rewbuffer))
|
||||
logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer))
|
||||
logger.record_tabular("EpThisIter", len(lens))
|
||||
episodes_so_far += len(lens)
|
||||
timesteps_so_far += sum(lens)
|
||||
iters_so_far += 1
|
||||
|
||||
logger.record_tabular("EpisodesSoFar", episodes_so_far)
|
||||
logger.record_tabular("TimestepsSoFar", timesteps_so_far)
|
||||
logger.record_tabular("TimeElapsed", time.time() - tstart)
|
||||
|
||||
if rank == 0:
|
||||
logger.dump_tabular()
|
||||
|
||||
|
||||
def flatten_lists(listoflists):
|
||||
return [el for list_ in listoflists for el in list_]
|
@@ -1,35 +0,0 @@
# Hindsight Experience Replay
For details on Hindsight Experience Replay (HER), please read the [paper](https://arxiv.org/pdf/1707.01495.pdf).

## How to use Hindsight Experience Replay

### Getting started
Training an agent is very simple:
```bash
python -m baselines.her.experiment.train
```
This will train a DDPG+HER agent on the `FetchReach` environment.
You should see the success rate go up quickly to `1.0`, which means that the agent achieves the
desired goal in 100% of the cases.
The training script logs other diagnostics as well and pickles the best policy so far (w.r.t. its test success rate),
the latest policy, and, if enabled, a history of policies every K epochs.

To inspect what the agent has learned, use the play script:
```bash
python -m baselines.her.experiment.play /path/to/an/experiment/policy_best.pkl
```
You can try it right now with the results of the training step (the script prints out the path for you).
This should visualize the current policy for 10 episodes and will also print statistics.


### Advanced usage
The train script comes with advanced features like MPI support, which allows training to scale across all cores of a single machine.
To see all available options, simply run this command:
```bash
python -m baselines.her.experiment.train --help
```
To run on, say, 20 CPU cores, you can use the following command:
```bash
python -m baselines.her.experiment.train --num_cpu 20
```
That's it: you are now running rollouts using 20 MPI workers and averaging gradients for network updates across all 20 cores.
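The pickled policies produced by the training script above can also be loaded directly in Python. A minimal sketch, assuming a goal-based Fetch environment and the `DDPG.get_actions` interface shown further below; the environment id is illustrative and the pickle path is the placeholder used above:

```python
# Illustration only: load a pickled DDPG+HER policy and roll it out once.
import pickle
import gym

with open('/path/to/an/experiment/policy_best.pkl', 'rb') as f:
    policy = pickle.load(f)

env = gym.make('FetchReach-v1')  # example environment id
obs = env.reset()
for _ in range(50):
    # Deterministic action from the trained actor (see DDPG.get_actions below).
    action = policy.get_actions(obs['observation'], obs['achieved_goal'],
                                obs['desired_goal'])
    obs, reward, done, info = env.step(action)
    env.render()
```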
@@ -1,44 +0,0 @@
import tensorflow as tf
from baselines.her.util import store_args, nn


class ActorCritic:
    @store_args
    def __init__(self, inputs_tf, dimo, dimg, dimu, max_u, o_stats, g_stats, hidden, layers,
                 **kwargs):
        """The actor-critic network and related training code.

        Args:
            inputs_tf (dict of tensors): all necessary inputs for the network: the
                observation (o), the goal (g), and the action (u)
            dimo (int): the dimension of the observations
            dimg (int): the dimension of the goals
            dimu (int): the dimension of the actions
            max_u (float): the maximum magnitude of actions; action outputs will be scaled
                accordingly
            o_stats (baselines.her.Normalizer): normalizer for observations
            g_stats (baselines.her.Normalizer): normalizer for goals
            hidden (int): number of hidden units that should be used in hidden layers
            layers (int): number of hidden layers
        """
        self.o_tf = inputs_tf['o']
        self.g_tf = inputs_tf['g']
        self.u_tf = inputs_tf['u']

        # Prepare inputs for actor and critic.
        o = self.o_stats.normalize(self.o_tf)
        g = self.g_stats.normalize(self.g_tf)
        input_pi = tf.concat(axis=1, values=[o, g])  # for actor

        # Networks.
        with tf.variable_scope('pi'):
            self.pi_tf = self.max_u * tf.tanh(nn(
                input_pi, [self.hidden] * self.layers + [self.dimu]))
        with tf.variable_scope('Q'):
            # for policy training
            input_Q = tf.concat(axis=1, values=[o, g, self.pi_tf / self.max_u])
            self.Q_pi_tf = nn(input_Q, [self.hidden] * self.layers + [1])
            # for critic training
            input_Q = tf.concat(axis=1, values=[o, g, self.u_tf / self.max_u])
            self._input_Q = input_Q  # exposed for tests
            self.Q_tf = nn(input_Q, [self.hidden] * self.layers + [1], reuse=True)
@@ -1,340 +0,0 @@
|
||||
from collections import OrderedDict
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from tensorflow.contrib.staging import StagingArea
|
||||
|
||||
from baselines import logger
|
||||
from baselines.her.util import (
|
||||
import_function, store_args, flatten_grads, transitions_in_episode_batch)
|
||||
from baselines.her.normalizer import Normalizer
|
||||
from baselines.her.replay_buffer import ReplayBuffer
|
||||
from baselines.common.mpi_adam import MpiAdam
|
||||
|
||||
|
||||
def dims_to_shapes(input_dims):
|
||||
return {key: tuple([val]) if val > 0 else tuple() for key, val in input_dims.items()}
|
||||
|
||||
|
||||
class DDPG(object):
|
||||
@store_args
|
||||
def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polyak, batch_size,
|
||||
Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, T,
|
||||
rollout_batch_size, subtract_goals, relative_goals, clip_pos_returns, clip_return,
|
||||
sample_transitions, gamma, reuse=False, **kwargs):
|
||||
"""Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).
|
||||
|
||||
Args:
|
||||
input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the
|
||||
actions (u)
|
||||
buffer_size (int): number of transitions that are stored in the replay buffer
|
||||
hidden (int): number of units in the hidden layers
|
||||
layers (int): number of hidden layers
|
||||
network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic')
|
||||
polyak (float): coefficient for Polyak-averaging of the target network
|
||||
batch_size (int): batch size for training
|
||||
Q_lr (float): learning rate for the Q (critic) network
|
||||
pi_lr (float): learning rate for the pi (actor) network
|
||||
norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
|
||||
norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
|
||||
max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
|
||||
action_l2 (float): coefficient for L2 penalty on the actions
|
||||
clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
|
||||
scope (str): the scope used for the TensorFlow graph
|
||||
T (int): the time horizon for rollouts
|
||||
rollout_batch_size (int): number of parallel rollouts per DDPG agent
|
||||
subtract_goals (function): function that subtracts goals from each other
|
||||
relative_goals (boolean): whether or not relative goals should be fed into the network
|
||||
clip_pos_returns (boolean): whether or not positive returns should be clipped
|
||||
clip_return (float): clip returns to be in [-clip_return, clip_return]
|
||||
sample_transitions (function): function that samples from the replay buffer
|
||||
gamma (float): gamma used for Q learning updates
|
||||
reuse (boolean): whether or not the networks should be reused
|
||||
"""
|
||||
if self.clip_return is None:
|
||||
self.clip_return = np.inf
|
||||
|
||||
self.create_actor_critic = import_function(self.network_class)
|
||||
|
||||
input_shapes = dims_to_shapes(self.input_dims)
|
||||
self.dimo = self.input_dims['o']
|
||||
self.dimg = self.input_dims['g']
|
||||
self.dimu = self.input_dims['u']
|
||||
|
||||
# Prepare staging area for feeding data to the model.
|
||||
stage_shapes = OrderedDict()
|
||||
for key in sorted(self.input_dims.keys()):
|
||||
if key.startswith('info_'):
|
||||
continue
|
||||
stage_shapes[key] = (None, *input_shapes[key])
|
||||
for key in ['o', 'g']:
|
||||
stage_shapes[key + '_2'] = stage_shapes[key]
|
||||
stage_shapes['r'] = (None,)
|
||||
self.stage_shapes = stage_shapes
|
||||
|
||||
# Create network.
|
||||
with tf.variable_scope(self.scope):
|
||||
self.staging_tf = StagingArea(
|
||||
dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
|
||||
shapes=list(self.stage_shapes.values()))
|
||||
self.buffer_ph_tf = [
|
||||
tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values()]
|
||||
self.stage_op = self.staging_tf.put(self.buffer_ph_tf)
|
||||
|
||||
self._create_network(reuse=reuse)
|
||||
|
||||
# Configure the replay buffer.
|
||||
buffer_shapes = {key: (self.T if key != 'o' else self.T+1, *input_shapes[key])
|
||||
for key, val in input_shapes.items()}
|
||||
buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
|
||||
buffer_shapes['ag'] = (self.T+1, self.dimg)
|
||||
|
||||
buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size
|
||||
self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions)
|
||||
|
||||
def _random_action(self, n):
|
||||
return np.random.uniform(low=-self.max_u, high=self.max_u, size=(n, self.dimu))
|
||||
|
||||
def _preprocess_og(self, o, ag, g):
|
||||
if self.relative_goals:
|
||||
g_shape = g.shape
|
||||
g = g.reshape(-1, self.dimg)
|
||||
ag = ag.reshape(-1, self.dimg)
|
||||
g = self.subtract_goals(g, ag)
|
||||
g = g.reshape(*g_shape)
|
||||
o = np.clip(o, -self.clip_obs, self.clip_obs)
|
||||
g = np.clip(g, -self.clip_obs, self.clip_obs)
|
||||
return o, g
|
||||
|
||||
def get_actions(self, o, ag, g, noise_eps=0., random_eps=0., use_target_net=False,
|
||||
compute_Q=False):
|
||||
o, g = self._preprocess_og(o, ag, g)
|
||||
policy = self.target if use_target_net else self.main
|
||||
# values to compute
|
||||
vals = [policy.pi_tf]
|
||||
if compute_Q:
|
||||
vals += [policy.Q_pi_tf]
|
||||
# feed
|
||||
feed = {
|
||||
policy.o_tf: o.reshape(-1, self.dimo),
|
||||
policy.g_tf: g.reshape(-1, self.dimg),
|
||||
policy.u_tf: np.zeros((o.size // self.dimo, self.dimu), dtype=np.float32)
|
||||
}
|
||||
|
||||
ret = self.sess.run(vals, feed_dict=feed)
|
||||
# action postprocessing
|
||||
u = ret[0]
|
||||
noise = noise_eps * self.max_u * np.random.randn(*u.shape) # gaussian noise
|
||||
u += noise
|
||||
u = np.clip(u, -self.max_u, self.max_u)
|
||||
u += np.random.binomial(1, random_eps, u.shape[0]).reshape(-1, 1) * (self._random_action(u.shape[0]) - u) # eps-greedy
|
||||
if u.shape[0] == 1:
|
||||
u = u[0]
|
||||
u = u.copy()
|
||||
ret[0] = u
|
||||
|
||||
if len(ret) == 1:
|
||||
return ret[0]
|
||||
else:
|
||||
return ret
|
||||
|
||||
def store_episode(self, episode_batch, update_stats=True):
|
||||
"""
|
||||
episode_batch: array of batch_size x (T or T+1) x dim_key
|
||||
'o' is of size T+1, others are of size T
|
||||
"""
|
||||
|
||||
self.buffer.store_episode(episode_batch)
|
||||
|
||||
if update_stats:
|
||||
# add transitions to normalizer
|
||||
episode_batch['o_2'] = episode_batch['o'][:, 1:, :]
|
||||
episode_batch['ag_2'] = episode_batch['ag'][:, 1:, :]
|
||||
num_normalizing_transitions = transitions_in_episode_batch(episode_batch)
|
||||
transitions = self.sample_transitions(episode_batch, num_normalizing_transitions)
|
||||
|
||||
o, o_2, g, ag = transitions['o'], transitions['o_2'], transitions['g'], transitions['ag']
|
||||
transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g)
|
||||
# No need to preprocess the o_2 and g_2 since this is only used for stats
|
||||
|
||||
self.o_stats.update(transitions['o'])
|
||||
self.g_stats.update(transitions['g'])
|
||||
|
||||
self.o_stats.recompute_stats()
|
||||
self.g_stats.recompute_stats()
|
||||
|
||||
def get_current_buffer_size(self):
|
||||
return self.buffer.get_current_size()
|
||||
|
||||
def _sync_optimizers(self):
|
||||
self.Q_adam.sync()
|
||||
self.pi_adam.sync()
|
||||
|
||||
def _grads(self):
|
||||
# Avoid feed_dict here for performance!
|
||||
critic_loss, actor_loss, Q_grad, pi_grad = self.sess.run([
|
||||
self.Q_loss_tf,
|
||||
self.main.Q_pi_tf,
|
||||
self.Q_grad_tf,
|
||||
self.pi_grad_tf
|
||||
])
|
||||
return critic_loss, actor_loss, Q_grad, pi_grad
|
||||
|
||||
def _update(self, Q_grad, pi_grad):
|
||||
self.Q_adam.update(Q_grad, self.Q_lr)
|
||||
self.pi_adam.update(pi_grad, self.pi_lr)
|
||||
|
||||
def sample_batch(self):
|
||||
transitions = self.buffer.sample(self.batch_size)
|
||||
o, o_2, g = transitions['o'], transitions['o_2'], transitions['g']
|
||||
ag, ag_2 = transitions['ag'], transitions['ag_2']
|
||||
transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g)
|
||||
transitions['o_2'], transitions['g_2'] = self._preprocess_og(o_2, ag_2, g)
|
||||
|
||||
transitions_batch = [transitions[key] for key in self.stage_shapes.keys()]
|
||||
return transitions_batch
|
||||
|
||||
def stage_batch(self, batch=None):
|
||||
if batch is None:
|
||||
batch = self.sample_batch()
|
||||
assert len(self.buffer_ph_tf) == len(batch)
|
||||
self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch)))
|
||||
|
||||
def train(self, stage=True):
|
||||
if stage:
|
||||
self.stage_batch()
|
||||
critic_loss, actor_loss, Q_grad, pi_grad = self._grads()
|
||||
self._update(Q_grad, pi_grad)
|
||||
return critic_loss, actor_loss
|
||||
|
||||
def _init_target_net(self):
|
||||
self.sess.run(self.init_target_net_op)
|
||||
|
||||
def update_target_net(self):
|
||||
self.sess.run(self.update_target_net_op)
|
||||
|
||||
def clear_buffer(self):
|
||||
self.buffer.clear_buffer()
|
||||
|
||||
def _vars(self, scope):
|
||||
res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope + '/' + scope)
|
||||
assert len(res) > 0
|
||||
return res
|
||||
|
||||
def _global_vars(self, scope):
|
||||
res = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope + '/' + scope)
|
||||
return res
|
||||
|
||||
def _create_network(self, reuse=False):
|
||||
logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u))
|
||||
|
||||
self.sess = tf.get_default_session()
|
||||
if self.sess is None:
|
||||
self.sess = tf.InteractiveSession()
|
||||
|
||||
# running averages
|
||||
with tf.variable_scope('o_stats') as vs:
|
||||
if reuse:
|
||||
vs.reuse_variables()
|
||||
self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess)
|
||||
with tf.variable_scope('g_stats') as vs:
|
||||
if reuse:
|
||||
vs.reuse_variables()
|
||||
self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess)
|
||||
|
||||
# mini-batch sampling.
|
||||
batch = self.staging_tf.get()
|
||||
batch_tf = OrderedDict([(key, batch[i])
|
||||
for i, key in enumerate(self.stage_shapes.keys())])
|
||||
batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])
|
||||
|
||||
# networks
|
||||
with tf.variable_scope('main') as vs:
|
||||
if reuse:
|
||||
vs.reuse_variables()
|
||||
self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__)
|
||||
vs.reuse_variables()
|
||||
with tf.variable_scope('target') as vs:
|
||||
if reuse:
|
||||
vs.reuse_variables()
|
||||
target_batch_tf = batch_tf.copy()
|
||||
target_batch_tf['o'] = batch_tf['o_2']
|
||||
target_batch_tf['g'] = batch_tf['g_2']
|
||||
self.target = self.create_actor_critic(
|
||||
target_batch_tf, net_type='target', **self.__dict__)
|
||||
vs.reuse_variables()
|
||||
assert len(self._vars("main")) == len(self._vars("target"))
|
||||
|
||||
# loss functions
|
||||
target_Q_pi_tf = self.target.Q_pi_tf
|
||||
clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf)
|
||||
target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range)
|
||||
self.Q_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf))
|
||||
self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf)
|
||||
self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u))
|
||||
Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q'))
|
||||
pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi'))
|
||||
assert len(self._vars('main/Q')) == len(Q_grads_tf)
|
||||
assert len(self._vars('main/pi')) == len(pi_grads_tf)
|
||||
self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q'))
|
||||
self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi'))
|
||||
self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q'))
|
||||
self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi'))
|
||||
|
||||
# optimizers
|
||||
self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False)
|
||||
self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False)
|
||||
|
||||
# polyak averaging
|
||||
self.main_vars = self._vars('main/Q') + self._vars('main/pi')
|
||||
self.target_vars = self._vars('target/Q') + self._vars('target/pi')
|
||||
self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats')
|
||||
self.init_target_net_op = list(
|
||||
map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars)))
|
||||
self.update_target_net_op = list(
|
||||
map(lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars)))
|
||||
|
||||
# initialize all variables
|
||||
tf.variables_initializer(self._global_vars('')).run()
|
||||
self._sync_optimizers()
|
||||
self._init_target_net()
|
||||
|
||||
def logs(self, prefix=''):
|
||||
logs = []
|
||||
logs += [('stats_o/mean', np.mean(self.sess.run([self.o_stats.mean])))]
|
||||
logs += [('stats_o/std', np.mean(self.sess.run([self.o_stats.std])))]
|
||||
logs += [('stats_g/mean', np.mean(self.sess.run([self.g_stats.mean])))]
|
||||
logs += [('stats_g/std', np.mean(self.sess.run([self.g_stats.std])))]
|
||||
|
||||
if prefix != '' and not prefix.endswith('/'):
|
||||
return [(prefix + '/' + key, val) for key, val in logs]
|
||||
else:
|
||||
return logs
|
||||
|
||||
def __getstate__(self):
|
||||
"""Our policies can be loaded from pkl, but after unpickling you cannot continue training.
|
||||
"""
|
||||
excluded_subnames = ['_tf', '_op', '_vars', '_adam', 'buffer', 'sess', '_stats',
|
||||
'main', 'target', 'lock', 'env', 'sample_transitions',
|
||||
'stage_shapes', 'create_actor_critic']
|
||||
|
||||
state = {k: v for k, v in self.__dict__.items() if all(subname not in k for subname in excluded_subnames)}
|
||||
state['buffer_size'] = self.buffer_size
|
||||
state['tf'] = self.sess.run([x for x in self._global_vars('') if 'buffer' not in x.name])
|
||||
return state
|
||||
|
||||
def __setstate__(self, state):
|
||||
if 'sample_transitions' not in state:
|
||||
# We don't need this for playing the policy.
|
||||
state['sample_transitions'] = None
|
||||
|
||||
self.__init__(**state)
|
||||
# set up stats (they are overwritten in __init__)
|
||||
for k, v in state.items():
|
||||
if k[-6:] == '_stats':
|
||||
self.__dict__[k] = v
|
||||
# load TF variables
|
||||
vars = [x for x in self._global_vars('') if 'buffer' not in x.name]
|
||||
assert(len(vars) == len(state["tf"]))
|
||||
node = [tf.assign(var, val) for var, val in zip(vars, state["tf"])]
|
||||
self.sess.run(node)
|