Add ACER, PPO2, and results_plotter.py

2017-11-16 10:02:32 -08:00
parent 6a3cbb4bc5
commit 2dd7d307d7
19 changed files with 1210 additions and 19 deletions
--- a/README.md
+++ b/README.md
@@ -15,16 +15,18 @@ pip install -e .
 ```

 - [A2C](baselines/a2c)
+- [ACER](baselines/acer)
 - [ACKTR](baselines/acktr)
 - [DDPG](baselines/ddpg)
 - [DQN](baselines/deepq)
- [PPO](baselines/ppo1)
+- [PPO1](baselines/ppo1) (Multi-CPU using MPI)
+- [PPO2](baselines/ppo2) (Optimized for GPU)
 - [TRPO](baselines/trpo_mpi)

 To cite this repository in publications:

    @misc{baselines,
-      author = {Hesse, Christopher and Plappert, Matthias and Radford, Alec and Schulman, John and Sidor, Szymon and Wu, Yuhuai},
+      author = {Dhariwal, Prafulla and Hesse, Christopher and Plappert, Matthias and Radford, Alec and Schulman, John and Sidor, Szymon and Wu, Yuhuai},
      title = {OpenAI Baselines},
      year = {2017},
      publisher = {GitHub},
--- a/baselines/a2c/utils.py
+++ b/baselines/a2c/utils.py
@@ -238,7 +238,7 @@ def check_shape(ts,shapes):
 def avg_norm(t):
    return tf.reduce_mean(tf.sqrt(tf.reduce_sum(tf.square(t), axis=-1)))

-def myadd(g1, g2, param):
+def gradient_add(g1, g2, param):
    print([g1, g2, param.name])
    assert (not (g1 is None and g2 is None)), param.name
    if g1 is None:
@@ -248,7 +248,7 @@ def myadd(g1, g2, param):
    else:
        return g1 + g2

-def my_explained_variance(qpred, q):
+def q_explained_variance(qpred, q):
    _, vary = tf.nn.moments(q, axes=[0, 1])
    _, varpred = tf.nn.moments(q - qpred, axes=[0, 1])
    check_shape([vary, varpred], [[]] * 2)
--- a/baselines/acer/README.md
+++ b/baselines/acer/README.md
@@ -0,0 +1,4 @@
+# ACER
+
+- Original paper: https://arxiv.org/abs/1611.01224
+- `python -m baselines.acer.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options.
--- a/baselines/acer/init.py
+++ b/baselines/acer/init.py
--- a/baselines/acer/acer_simple.py
+++ b/baselines/acer/acer_simple.py
@@ -0,0 +1,349 @@
+import time
+import joblib
+import numpy as np
+import tensorflow as tf
+from baselines import logger
+
+from baselines.common import set_global_seeds
+
+from baselines.a2c.utils import batch_to_seq, seq_to_batch
+from baselines.a2c.utils import Scheduler, make_path, find_trainable_variables
+from baselines.a2c.utils import cat_entropy_softmax
+from baselines.a2c.utils import EpisodeStats
+from baselines.a2c.utils import get_by_index, check_shape, avg_norm, gradient_add, q_explained_variance
+from baselines.acer.buffer import Buffer
+
+# remove last step
+def strip(var, nenvs, nsteps, flat = False):
+    vars = batch_to_seq(var, nenvs, nsteps + 1, flat)
+    return seq_to_batch(vars[:-1], flat)
+
+def q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma):
+    """
+    Calculates q_retrace targets
+
+    :param R: Rewards
+    :param D: Dones
+    :param q_i: Q values for actions taken
+    :param v: V values
+    :param rho_i: Importance weight for each action
+    :return: Q_retrace values
+    """
+    rho_bar = batch_to_seq(tf.minimum(1.0, rho_i), nenvs, nsteps, True)  # list of len steps, shape [nenvs]
+    rs = batch_to_seq(R, nenvs, nsteps, True)  # list of len steps, shape [nenvs]
+    ds = batch_to_seq(D, nenvs, nsteps, True)  # list of len steps, shape [nenvs]
+    q_is = batch_to_seq(q_i, nenvs, nsteps, True)
+    vs = batch_to_seq(v, nenvs, nsteps + 1, True)
+    v_final = vs[-1]
+    qret = v_final
+    qrets = []
+    for i in range(nsteps - 1, -1, -1):
+        check_shape([qret, ds[i], rs[i], rho_bar[i], q_is[i], vs[i]], [[nenvs]] * 6)
+        qret = rs[i] + gamma * qret * (1.0 - ds[i])
+        qrets.append(qret)
+        qret = (rho_bar[i] * (qret - q_is[i])) + vs[i]
+    qrets = qrets[::-1]
+    qret = seq_to_batch(qrets, flat=True)
+    return qret
+
+# For ACER with PPO clipping instead of trust region
+# def clip(ratio, eps_clip):
+#     # assume 0 <= eps_clip <= 1
+#     return tf.minimum(1 + eps_clip, tf.maximum(1 - eps_clip, ratio))
+
+class Model(object):
+    def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs,
+                 ent_coef, q_coef, gamma, max_grad_norm, lr,
+                 rprop_alpha, rprop_epsilon, total_timesteps, lrschedule,
+                 c, trust_region, alpha, delta):
+        config = tf.ConfigProto(allow_soft_placement=True,
+                                intra_op_parallelism_threads=num_procs,
+                                inter_op_parallelism_threads=num_procs)
+        sess = tf.Session(config=config)
+        nact = ac_space.n
+        nbatch = nenvs * nsteps
+
+        A = tf.placeholder(tf.int32, [nbatch]) # actions
+        D = tf.placeholder(tf.float32, [nbatch]) # dones
+        R = tf.placeholder(tf.float32, [nbatch]) # rewards, not returns
+        MU = tf.placeholder(tf.float32, [nbatch, nact]) # mu's
+        LR = tf.placeholder(tf.float32, [])
+        eps = 1e-6
+
+        step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
+        train_model = policy(sess, ob_space, ac_space, nenvs, nsteps + 1, nstack, reuse=True)
+
+        params = find_trainable_variables("model")
+        print("Params {}".format(len(params)))
+        for var in params:
+            print(var)
+
+        # create polyak averaged model
+        ema = tf.train.ExponentialMovingAverage(alpha)
+        ema_apply_op = ema.apply(params)
+
+        def custom_getter(getter, *args, **kwargs):
+            v = ema.average(getter(*args, **kwargs))
+            print(v.name)
+            return v
+
+        with tf.variable_scope("", custom_getter=custom_getter, reuse=True):
+            polyak_model = policy(sess, ob_space, ac_space, nenvs, nsteps + 1, nstack, reuse=True)
+
+        # Notation: (var) = batch variable, (var)s = seqeuence variable, (var)_i = variable index by action at step i
+        v = tf.reduce_sum(train_model.pi * train_model.q, axis = -1) # shape is [nenvs * (nsteps + 1)]
+
+        # strip off last step
+        f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [train_model.pi, polyak_model.pi, train_model.q])
+        # Get pi and q values for actions taken
+        f_i = get_by_index(f, A)
+        q_i = get_by_index(q, A)
+
+        # Compute ratios for importance truncation
+        rho = f / (MU + eps)
+        rho_i = get_by_index(rho, A)
+
+        # Calculate Q_retrace targets
+        qret = q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma)
+
+        # Calculate losses
+        # Entropy
+        entropy = tf.reduce_mean(cat_entropy_softmax(f))
+
+        # Policy Graident loss, with truncated importance sampling & bias correction
+        v = strip(v, nenvs, nsteps, True)
+        check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4)
+        check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 2)
+
+        # Truncated importance sampling
+        adv = qret - v
+        logf = tf.log(f_i + eps)
+        gain_f = logf * tf.stop_gradient(adv * tf.minimum(c, rho_i))  # [nenvs * nsteps]
+        loss_f = -tf.reduce_mean(gain_f)
+
+        # Bias correction for the truncation
+        adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1]))  # [nenvs * nsteps, nact]
+        logf_bc = tf.log(f + eps) # / (f_old + eps)
+        check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]]*2)
+        gain_bc = tf.reduce_sum(logf_bc * tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f), axis = 1) #IMP: This is sum, as expectation wrt f
+        loss_bc= -tf.reduce_mean(gain_bc)
+
+        loss_policy = loss_f + loss_bc
+
+        # Value/Q function loss, and explained variance
+        check_shape([qret, q_i], [[nenvs * nsteps]]*2)
+        ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]), tf.reshape(qret, [nenvs, nsteps]))
+        loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i)*0.5)
+
+        # Net loss
+        check_shape([loss_policy, loss_q, entropy], [[]] * 3)
+        loss = loss_policy + q_coef * loss_q - ent_coef * entropy
+
+        if trust_region:
+            g = tf.gradients(- (loss_policy - ent_coef * entropy) * nsteps * nenvs, f) #[nenvs * nsteps, nact]
+            # k = tf.gradients(KL(f_pol || f), f)
+            k = - f_pol / (f + eps) #[nenvs * nsteps, nact] # Directly computed gradient of KL divergence wrt f
+            k_dot_g = tf.reduce_sum(k * g, axis=-1)
+            adj = tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) / (tf.reduce_sum(tf.square(k), axis=-1) + eps)) #[nenvs * nsteps]
+
+            # Calculate stats (before doing adjustment) for logging.
+            avg_norm_k = avg_norm(k)
+            avg_norm_g = avg_norm(g)
+            avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g))
+            avg_norm_adj = tf.reduce_mean(tf.abs(adj))
+
+            g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k
+            grads_f = -g/(nenvs*nsteps) # These are turst region adjusted gradients wrt f ie statistics of policy pi
+            grads_policy = tf.gradients(f, params, grads_f)
+            grads_q = tf.gradients(loss_q * q_coef, params)
+            grads = [gradient_add(g1, g2, param) for (g1, g2, param) in zip(grads_policy, grads_q, params)]
+
+            avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs)
+            norm_grads_q = tf.global_norm(grads_q)
+            norm_grads_policy = tf.global_norm(grads_policy)
+        else:
+            grads = tf.gradients(loss, params)
+
+        if max_grad_norm is not None:
+            grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm)
+        grads = list(zip(grads, params))
+        trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=rprop_alpha, epsilon=rprop_epsilon)
+        _opt_op = trainer.apply_gradients(grads)
+
+        # so when you call _train, you first do the gradient step, then you apply ema
+        with tf.control_dependencies([_opt_op]):
+            _train = tf.group(ema_apply_op)
+
+        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)
+
+        # Ops/Summaries to run, and their names for logging
+        run_ops = [_train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev, norm_grads]
+        names_ops = ['loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance',
+                     'norm_grads']
+        if trust_region:
+            run_ops = run_ops + [norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, avg_norm_k_dot_g,
+                                 avg_norm_adj]
+            names_ops = names_ops + ['norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g',
+                                     'avg_norm_k_dot_g', 'avg_norm_adj']
+
+        def train(obs, actions, rewards, dones, mus, states, masks, steps):
+            cur_lr = lr.value_steps(steps)
+            td_map = {train_model.X: obs, polyak_model.X: obs, A: actions, R: rewards, D: dones, MU: mus, LR: cur_lr}
+            if states != []:
+                td_map[train_model.S] = states
+                td_map[train_model.M] = masks
+                td_map[polyak_model.S] = states
+                td_map[polyak_model.M] = masks
+            return names_ops, sess.run(run_ops, td_map)[1:]  # strip off _train
+
+        def save(save_path):
+            ps = sess.run(params)
+            make_path(save_path)
+            joblib.dump(ps, save_path)
+
+        self.train = train
+        self.save = save
+        self.train_model = train_model
+        self.step_model = step_model
+        self.step = step_model.step
+        self.initial_state = step_model.initial_state
+        tf.global_variables_initializer().run(session=sess)
+
+class Runner(object):
+    def __init__(self, env, model, nsteps, nstack):
+        self.env = env
+        self.nstack = nstack
+        self.model = model
+        nh, nw, nc = env.observation_space.shape
+        self.nc = nc  # nc = 1 for atari, but just in case
+        self.nenv = nenv = env.num_envs
+        self.nact = env.action_space.n
+        self.nbatch = nenv * nsteps
+        self.batch_ob_shape = (nenv*(nsteps+1), nh, nw, nc*nstack)
+        self.obs = np.zeros((nenv, nh, nw, nc * nstack), dtype=np.uint8)
+        obs = env.reset()
+        self.update_obs(obs)
+        self.nsteps = nsteps
+        self.states = model.initial_state
+        self.dones = [False for _ in range(nenv)]
+
+    def update_obs(self, obs, dones=None):
+        if dones is not None:
+            self.obs *= (1 - dones.astype(np.uint8))[:, None, None, None]
+        self.obs = np.roll(self.obs, shift=-self.nc, axis=3)
+        self.obs[:, :, :, -self.nc:] = obs[:, :, :, :]
+
+    def run(self):
+        enc_obs = np.split(self.obs, self.nstack, axis=3)  # so now list of obs steps
+        mb_obs, mb_actions, mb_mus, mb_dones, mb_rewards = [], [], [], [], []
+        for _ in range(self.nsteps):
+            actions, mus, states = self.model.step(self.obs, state=self.states, mask=self.dones)
+            mb_obs.append(np.copy(self.obs))
+            mb_actions.append(actions)
+            mb_mus.append(mus)
+            mb_dones.append(self.dones)
+            obs, rewards, dones, _ = self.env.step(actions)
+            # states information for statefull models like LSTM
+            self.states = states
+            self.dones = dones
+            self.update_obs(obs, dones)
+            mb_rewards.append(rewards)
+            enc_obs.append(obs)
+        mb_obs.append(np.copy(self.obs))
+        mb_dones.append(self.dones)
+
+        enc_obs = np.asarray(enc_obs, dtype=np.uint8).swapaxes(1, 0)
+        mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0)
+        mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
+        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
+        mb_mus = np.asarray(mb_mus, dtype=np.float32).swapaxes(1, 0)
+
+        mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
+
+        mb_masks = mb_dones # Used for statefull models like LSTM's to mask state when done
+        mb_dones = mb_dones[:, 1:] # Used for calculating returns. The dones array is now aligned with rewards
+
+        # shapes are now [nenv, nsteps, []]
+        # When pulling from buffer, arrays will now be reshaped in place, preventing a deep copy.
+
+        return enc_obs, mb_obs, mb_actions, mb_rewards, mb_mus, mb_dones, mb_masks
+
+class Acer():
+    def __init__(self, runner, model, buffer, log_interval):
+        self.runner = runner
+        self.model = model
+        self.buffer = buffer
+        self.log_interval = log_interval
+        self.tstart = None
+        self.episode_stats = EpisodeStats(runner.nsteps, runner.nenv)
+        self.steps = None
+
+    def call(self, on_policy):
+        runner, model, buffer, steps = self.runner, self.model, self.buffer, self.steps
+        if on_policy:
+            enc_obs, obs, actions, rewards, mus, dones, masks = runner.run()
+            self.episode_stats.feed(rewards, dones)
+            if buffer is not None:
+                buffer.put(enc_obs, actions, rewards, mus, dones, masks)
+        else:
+            # get obs, actions, rewards, mus, dones from buffer.
+            obs, actions, rewards, mus, dones, masks = buffer.get()
+
+        # reshape stuff correctly
+        obs = obs.reshape(runner.batch_ob_shape)
+        actions = actions.reshape([runner.nbatch])
+        rewards = rewards.reshape([runner.nbatch])
+        mus = mus.reshape([runner.nbatch, runner.nact])
+        dones = dones.reshape([runner.nbatch])
+        masks = masks.reshape([runner.batch_ob_shape[0]])
+
+        names_ops, values_ops = model.train(obs, actions, rewards, dones, mus, model.initial_state, masks, steps)
+
+        if on_policy and (int(steps/runner.nbatch) % self.log_interval == 0):
+            logger.record_tabular("total_timesteps", steps)
+            logger.record_tabular("fps", int(steps/(time.time() - self.tstart)))
+            # IMP: In EpisodicLife env, during training, we get done=True at each loss of life, not just at the terminal state.
+            # Thus, this is mean until end of life, not end of episode.
+            # For true episode rewards, see the monitor files in the log folder.
+            logger.record_tabular("mean_episode_length", self.episode_stats.mean_length())
+            logger.record_tabular("mean_episode_reward", self.episode_stats.mean_reward())
+            for name, val in zip(names_ops, values_ops):
+                logger.record_tabular(name, float(val))
+            logger.dump_tabular()
+
+
+def learn(policy, env, seed, nsteps=20, nstack=4, total_timesteps=int(80e6), q_coef=0.5, ent_coef=0.01,
+          max_grad_norm=10, lr=7e-4, lrschedule='linear', rprop_epsilon=1e-5, rprop_alpha=0.99, gamma=0.99,
+          log_interval=100, buffer_size=50000, replay_ratio=4, replay_start=10000, c=10.0,
+          trust_region=True, alpha=0.99, delta=1):
+    print("Running Acer Simple")
+    print(locals())
+    tf.reset_default_graph()
+    set_global_seeds(seed)
+
+    nenvs = env.num_envs
+    ob_space = env.observation_space
+    ac_space = env.action_space
+    num_procs = len(env.remotes) # HACK
+    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, nstack=nstack,
+                  num_procs=num_procs, ent_coef=ent_coef, q_coef=q_coef, gamma=gamma,
+                  max_grad_norm=max_grad_norm, lr=lr, rprop_alpha=rprop_alpha, rprop_epsilon=rprop_epsilon,
+                  total_timesteps=total_timesteps, lrschedule=lrschedule, c=c,
+                  trust_region=trust_region, alpha=alpha, delta=delta)
+
+    runner = Runner(env=env, model=model, nsteps=nsteps, nstack=nstack)
+    if replay_ratio > 0:
+        buffer = Buffer(env=env, nsteps=nsteps, nstack=nstack, size=buffer_size)
+    else:
+        buffer = None
+    nbatch = nenvs*nsteps
+    acer = Acer(runner, model, buffer, log_interval)
+    acer.tstart = time.time()
+    for acer.steps in range(0, total_timesteps, nbatch): #nbatch samples, 1 on_policy call and multiple off-policy calls
+        acer.call(on_policy=True)
+        if replay_ratio > 0 and buffer.has_atleast(replay_start):
+            n = np.random.poisson(replay_ratio)
+            for _ in range(n):
+                acer.call(on_policy=False)  # no simulation steps in this
+
+    env.close()
--- a/baselines/acer/buffer.py
+++ b/baselines/acer/buffer.py
@@ -0,0 +1,103 @@
+import numpy as np
+
+class Buffer(object):
+    # gets obs, actions, rewards, mu's, (states, masks), dones
+    def __init__(self, env, nsteps, nstack, size=50000):
+        self.nenv = env.num_envs
+        self.nsteps = nsteps
+        self.nh, self.nw, self.nc = env.observation_space.shape
+        self.nstack = nstack
+        self.nbatch = self.nenv * self.nsteps
+        self.size = size // (self.nsteps)  # Each loc contains nenv * nsteps frames, thus total buffer is nenv * size frames
+
+        # Memory
+        self.enc_obs = None
+        self.actions = None
+        self.rewards = None
+        self.mus = None
+        self.dones = None
+        self.masks = None
+
+        # Size indexes
+        self.next_idx = 0
+        self.num_in_buffer = 0
+
+    def has_atleast(self, frames):
+        # Frames per env, so total (nenv * frames) Frames needed
+        # Each buffer loc has nenv * nsteps frames
+        return self.num_in_buffer >= (frames // self.nsteps)
+
+    def can_sample(self):
+        return self.num_in_buffer > 0
+
+    # Generate stacked frames
+    def decode(self, enc_obs, dones):
+        # enc_obs has shape [nenvs, nsteps + nstack, nh, nw, nc]
+        # dones has shape [nenvs, nsteps, nh, nw, nc]
+        # returns stacked obs of shape [nenv, (nsteps + 1), nh, nw, nstack*nc]
+        nstack, nenv, nsteps, nh, nw, nc = self.nstack, self.nenv, self.nsteps, self.nh, self.nw, self.nc
+        y = np.empty([nsteps + nstack - 1, nenv, 1, 1, 1], dtype=np.float32)
+        obs = np.zeros([nstack, nsteps + nstack, nenv, nh, nw, nc], dtype=np.uint8)
+        x = np.reshape(enc_obs, [nenv, nsteps + nstack, nh, nw, nc]).swapaxes(1,
+                                                                              0)  # [nsteps + nstack, nenv, nh, nw, nc]
+        y[3:] = np.reshape(1.0 - dones, [nenv, nsteps, 1, 1, 1]).swapaxes(1, 0)  # keep
+        y[:3] = 1.0
+        # y = np.reshape(1 - dones, [nenvs, nsteps, 1, 1, 1])
+        for i in range(nstack):
+            obs[-(i + 1), i:] = x
+            # obs[:,i:,:,:,-(i+1),:] = x
+            x = x[:-1] * y
+            y = y[1:]
+        return np.reshape(obs[:, 3:].transpose((2, 1, 3, 4, 0, 5)), [nenv, (nsteps + 1), nh, nw, nstack * nc])
+
+    def put(self, enc_obs, actions, rewards, mus, dones, masks):
+        # enc_obs [nenv, (nsteps + nstack), nh, nw, nc]
+        # actions, rewards, dones [nenv, nsteps]
+        # mus [nenv, nsteps, nact]
+
+        if self.enc_obs is None:
+            self.enc_obs = np.empty([self.size] + list(enc_obs.shape), dtype=np.uint8)
+            self.actions = np.empty([self.size] + list(actions.shape), dtype=np.int32)
+            self.rewards = np.empty([self.size] + list(rewards.shape), dtype=np.float32)
+            self.mus = np.empty([self.size] + list(mus.shape), dtype=np.float32)
+            self.dones = np.empty([self.size] + list(dones.shape), dtype=np.bool)
+            self.masks = np.empty([self.size] + list(masks.shape), dtype=np.bool)
+
+        self.enc_obs[self.next_idx] = enc_obs
+        self.actions[self.next_idx] = actions
+        self.rewards[self.next_idx] = rewards
+        self.mus[self.next_idx] = mus
+        self.dones[self.next_idx] = dones
+        self.masks[self.next_idx] = masks
+
+        self.next_idx = (self.next_idx + 1) % self.size
+        self.num_in_buffer = min(self.size, self.num_in_buffer + 1)
+
+    def take(self, x, idx, envx):
+        nenv = self.nenv
+        out = np.empty([nenv] + list(x.shape[2:]), dtype=x.dtype)
+        for i in range(nenv):
+            out[i] = x[idx[i], envx[i]]
+        return out
+
+    def get(self):
+        # returns
+        # obs [nenv, (nsteps + 1), nh, nw, nstack*nc]
+        # actions, rewards, dones [nenv, nsteps]
+        # mus [nenv, nsteps, nact]
+        nenv = self.nenv
+        assert self.can_sample()
+
+        # Sample exactly one id per env. If you sample across envs, then higher correlation in samples from same env.
+        idx = np.random.randint(0, self.num_in_buffer, nenv)
+        envx = np.arange(nenv)
+
+        take = lambda x: self.take(x, idx, envx)  # for i in range(nenv)], axis = 0)
+        dones = take(self.dones)
+        enc_obs = take(self.enc_obs)
+        obs = self.decode(enc_obs, dones)
+        actions = take(self.actions)
+        rewards = take(self.rewards)
+        mus = take(self.mus)
+        masks = take(self.masks)
+        return obs, actions, rewards, mus, dones, masks
--- a/baselines/acer/policies.py
+++ b/baselines/acer/policies.py
@@ -0,0 +1,86 @@
+import numpy as np
+import tensorflow as tf
+from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm, sample, check_shape
+
+
+class AcerCnnPolicy(object):
+
+    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
+        nbatch = nenv * nsteps
+        nh, nw, nc = ob_space.shape
+        ob_shape = (nbatch, nh, nw, nc * nstack)
+        nact = ac_space.n
+        X = tf.placeholder(tf.uint8, ob_shape)  # obs
+        with tf.variable_scope("model", reuse=reuse):
+            h = conv(tf.cast(X, tf.float32) / 255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
+            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
+            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
+            h3 = conv_to_fc(h3)
+            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
+            pi_logits = fc(h4, 'pi', nact, act=lambda x: x, init_scale=0.01)
+            pi = tf.nn.softmax(pi_logits)
+            q = fc(h4, 'q', nact, act=lambda x: x)
+
+        a = sample(pi_logits)  # could change this to use self.pi instead
+        self.initial_state = []  # not stateful
+        self.X = X
+        self.pi = pi  # actual policy params now
+        self.q = q
+
+        def step(ob, *args, **kwargs):
+            # returns actions, mus, states
+            a0, pi0 = sess.run([a, pi], {X: ob})
+            return a0, pi0, []  # dummy state
+
+        def out(ob, *args, **kwargs):
+            pi0, q0 = sess.run([pi, q], {X: ob})
+            return pi0, q0
+
+        def act(ob, *args, **kwargs):
+            return sess.run(a, {X: ob})
+
+        self.step = step
+        self.out = out
+        self.act = act
+
+class AcerLstmPolicy(object):
+
+    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256):
+        nbatch = nenv * nsteps
+        nh, nw, nc = ob_space.shape
+        ob_shape = (nbatch, nh, nw, nc * nstack)
+        nact = ac_space.n
+        X = tf.placeholder(tf.uint8, ob_shape)  # obs
+        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
+        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
+        with tf.variable_scope("model", reuse=reuse):
+            h = conv(tf.cast(X, tf.float32) / 255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
+            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
+            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
+            h3 = conv_to_fc(h3)
+            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
+
+            # lstm
+            xs = batch_to_seq(h4, nenv, nsteps)
+            ms = batch_to_seq(M, nenv, nsteps)
+            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
+            h5 = seq_to_batch(h5)
+
+            pi_logits = fc(h5, 'pi', nact, act=lambda x: x, init_scale=0.01)
+            pi = tf.nn.softmax(pi_logits)
+            q = fc(h5, 'q', nact, act=lambda x: x)
+
+        a = sample(pi_logits)  # could change this to use self.pi instead
+        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
+        self.X = X
+        self.M = M
+        self.S = S
+        self.pi = pi  # actual policy params now
+        self.q = q
+
+        def step(ob, state, mask, *args, **kwargs):
+            # returns actions, mus, states
+            a0, pi0, s = sess.run([a, pi, snew], {X: ob, S: state, M: mask})
+            return a0, pi0, s
+
+        self.step = step
--- a/baselines/acer/run_atari.py
+++ b/baselines/acer/run_atari.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+import os, logging, gym
+from baselines import logger
+from baselines.common import set_global_seeds
+from baselines import bench
+from baselines.acer.acer_simple import learn
+from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
+from baselines.common.atari_wrappers import make_atari, wrap_deepmind
+from baselines.acer.policies import AcerCnnPolicy, AcerLstmPolicy
+
+def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu):
+    def make_env(rank):
+        def _thunk():
+            env = make_atari(env_id)
+            env.seed(seed + rank)
+            env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
+            gym.logger.setLevel(logging.WARN)
+            return wrap_deepmind(env)
+        return _thunk
+    set_global_seeds(seed)
+    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
+    if policy == 'cnn':
+        policy_fn = AcerCnnPolicy
+    elif policy == 'lstm':
+        policy_fn = AcerLstmPolicy
+    else:
+        print("Policy {} not implemented".format(policy))
+        return
+    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule)
+    env.close()
+
+def main():
+    import argparse
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
+    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
+    parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm'], default='cnn')
+    parser.add_argument('--lrschedule', help='Learning rate schedule', choices=['constant', 'linear'], default='constant')
+    parser.add_argument('--logdir', help ='Directory for logging', default='./log')
+    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
+    args = parser.parse_args()
+    logger.configure(os.path.abspath(args.logdir))
+    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed,
+          policy=args.policy, lrschedule=args.lrschedule, num_cpu=16)
+
+if __name__ == '__main__':
+    main()
--- a/baselines/bench/init.py
+++ b/baselines/bench/init.py
@@ -1,3 +1,2 @@
 from baselines.bench.benchmarks import *
 from baselines.bench.monitor import *
-from baselines.bench.simple_bench import simple_bench
--- a/baselines/bench/benchmarks.py
+++ b/baselines/bench/benchmarks.py
@@ -128,5 +128,5 @@ _atari50 = [  # actually 47
 register_benchmark({
    'name': 'Atari50_10M',
    'description': '47 Atari games from Mnih et al. (2013), with pixel observations, 10M timesteps',
-    'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 3, 'num_timesteps': int(10e6)} for _game in _atari50]
+    'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atari50]
 })
--- a/baselines/common/atari_wrappers.py
+++ b/baselines/common/atari_wrappers.py
@@ -12,10 +12,6 @@ class NoopResetEnv(gym.Wrapper):
        gym.Wrapper.__init__(self, env)
        self.noop_max = noop_max
        self.override_num_noops = None
-        if isinstance(env.action_space, gym.spaces.MultiBinary):
-            self.noop_action = np.zeros(self.env.action_space.n, dtype=np.int64)
-        else:
-            # used for atari environments
        self.noop_action = 0
        assert env.unwrapped.get_action_meanings()[0] == 'NOOP'

@@ -175,7 +171,7 @@ class LazyFrames(object):

        This object should only be converted to numpy array before being passed to the model.

-        You'd not belive how complex the previous solution was."""
+        You'd not believe how complex the previous solution was."""
        self._frames = frames

    def __array__(self, dtype=None):
--- a/baselines/deepq/simple.py
+++ b/baselines/deepq/simple.py
@@ -238,10 +238,6 @@ def learn(env,
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
-            if isinstance(env.action_space, gym.spaces.MultiBinary):
-                env_action = np.zeros(env.action_space.n)
-                env_action[action] = 1
-            else:
            env_action = action
            reset = False
            new_obs, rew, done, _ = env.step(env_action)
--- a/baselines/ppo2/README.md
+++ b/baselines/ppo2/README.md
@@ -0,0 +1,6 @@
+# PPO2
+
+- Original paper: https://arxiv.org/abs/1707.06347
+- Baselines blog post: https://blog.openai.com/openai-baselines-ppo/
+- `mpirun -np 8 python -m baselines.ppo1.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options.
+- `python -m baselines.ppo1.run_mujoco` runs the algorithm for 1M frames on a Mujoco environment.
--- a/baselines/ppo2/init.py
+++ b/baselines/ppo2/init.py
--- a/baselines/ppo2/policies.py
+++ b/baselines/ppo2/policies.py
@@ -0,0 +1,167 @@
+import numpy as np
+import tensorflow as tf
+from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm
+from baselines.common.distributions import make_pdtype
+
+class LnLstmPolicy(object):
+    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
+        nenv = nbatch // nsteps
+        nh, nw, nc = ob_space.shape
+        ob_shape = (nbatch, nh, nw, nc)
+        nact = ac_space.n
+        X = tf.placeholder(tf.uint8, ob_shape) #obs
+        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
+        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
+        with tf.variable_scope("model", reuse=reuse):
+            h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
+            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
+            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
+            h3 = conv_to_fc(h3)
+            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
+            xs = batch_to_seq(h4, nenv, nsteps)
+            ms = batch_to_seq(M, nenv, nsteps)
+            h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
+            h5 = seq_to_batch(h5)
+            pi = fc(h5, 'pi', nact, act=lambda x:x)
+            vf = fc(h5, 'v', 1, act=lambda x:x)
+
+        self.pdtype = make_pdtype(ac_space)
+        self.pd = self.pdtype.pdfromflat(pi)
+
+        v0 = vf[:, 0]
+        a0 = self.pd.sample()
+        neglogp0 = self.pd.neglogp(a0)
+        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
+
+        def step(ob, state, mask):
+            return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})
+
+        def value(ob, state, mask):
+            return sess.run(v0, {X:ob, S:state, M:mask})
+
+        self.X = X
+        self.M = M
+        self.S = S
+        self.pi = pi
+        self.vf = vf
+        self.step = step
+        self.value = value
+
+class LstmPolicy(object):
+
+    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
+        nenv = nbatch // nsteps
+
+        nh, nw, nc = ob_space.shape
+        ob_shape = (nbatch, nh, nw, nc)
+        nact = ac_space.n
+        X = tf.placeholder(tf.uint8, ob_shape) #obs
+        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
+        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
+        with tf.variable_scope("model", reuse=reuse):
+            h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
+            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
+            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
+            h3 = conv_to_fc(h3)
+            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
+            xs = batch_to_seq(h4, nenv, nsteps)
+            ms = batch_to_seq(M, nenv, nsteps)
+            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
+            h5 = seq_to_batch(h5)
+            pi = fc(h5, 'pi', nact, act=lambda x:x)
+            vf = fc(h5, 'v', 1, act=lambda x:x)
+
+        self.pdtype = make_pdtype(ac_space)
+        self.pd = self.pdtype.pdfromflat(pi)
+
+        v0 = vf[:, 0]
+        a0 = self.pd.sample()
+        neglogp0 = self.pd.neglogp(a0)
+        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
+
+        def step(ob, state, mask):
+            return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})
+
+        def value(ob, state, mask):
+            return sess.run(v0, {X:ob, S:state, M:mask})
+
+        self.X = X
+        self.M = M
+        self.S = S
+        self.pi = pi
+        self.vf = vf
+        self.step = step
+        self.value = value
+
+class CnnPolicy(object):
+
+    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
+        nh, nw, nc = ob_space.shape
+        ob_shape = (nbatch, nh, nw, nc)
+        nact = ac_space.n
+        X = tf.placeholder(tf.uint8, ob_shape) #obs
+        with tf.variable_scope("model", reuse=reuse):
+            h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
+            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
+            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
+            h3 = conv_to_fc(h3)
+            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
+            pi = fc(h4, 'pi', nact, act=lambda x:x, init_scale=0.01)
+            vf = fc(h4, 'v', 1, act=lambda x:x)[:,0]
+
+        self.pdtype = make_pdtype(ac_space)
+        self.pd = self.pdtype.pdfromflat(pi)
+
+        a0 = self.pd.sample()
+        neglogp0 = self.pd.neglogp(a0)
+        self.initial_state = None
+
+        def step(ob, *_args, **_kwargs):
+            a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
+            return a, v, self.initial_state, neglogp
+
+        def value(ob, *_args, **_kwargs):
+            return sess.run(vf, {X:ob})
+
+        self.X = X
+        self.pi = pi
+        self.vf = vf
+        self.step = step
+        self.value = value
+
+class MlpPolicy(object):
+    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
+        ob_shape = (nbatch,) + ob_space.shape
+        actdim = ac_space.shape[0]
+        X = tf.placeholder(tf.float32, ob_shape, name='Ob') #obs
+        with tf.variable_scope("model", reuse=reuse):
+            h1 = fc(X, 'pi_fc1', nh=64, init_scale=np.sqrt(2), act=tf.tanh)
+            h2 = fc(h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2), act=tf.tanh)
+            pi = fc(h2, 'pi', actdim, act=lambda x:x, init_scale=0.01)
+            h1 = fc(X, 'vf_fc1', nh=64, init_scale=np.sqrt(2), act=tf.tanh)
+            h2 = fc(h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2), act=tf.tanh)
+            vf = fc(h2, 'vf', 1, act=lambda x:x)[:,0]
+            logstd = tf.get_variable(name="logstd", shape=[1, actdim], 
+                initializer=tf.zeros_initializer())
+
+        pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)
+
+        self.pdtype = make_pdtype(ac_space)
+        self.pd = self.pdtype.pdfromflat(pdparam)
+
+        a0 = self.pd.sample()
+        neglogp0 = self.pd.neglogp(a0)
+        self.initial_state = None
+
+        def step(ob, *_args, **_kwargs):
+            a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
+            return a, v, self.initial_state, neglogp
+
+        def value(ob, *_args, **_kwargs):
+            return sess.run(vf, {X:ob})
+
+        self.X = X
+        self.pi = pi
+        self.vf = vf
+        self.step = step
+        self.value = value
--- a/baselines/ppo2/ppo2.py
+++ b/baselines/ppo2/ppo2.py
@@ -0,0 +1,244 @@
+import os
+import time
+import joblib
+import numpy as np
+import os.path as osp
+import tensorflow as tf
+from baselines import logger
+from collections import deque
+from baselines.common import explained_variance
+
+class Model(object):
+    def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
+                nsteps, ent_coef, vf_coef, max_grad_norm):
+        sess = tf.get_default_session()
+
+        act_model = policy(sess, ob_space, ac_space, nbatch_act, 1, reuse=False)
+        train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps, reuse=True)
+
+        A = train_model.pdtype.sample_placeholder([None])
+        ADV = tf.placeholder(tf.float32, [None])
+        R = tf.placeholder(tf.float32, [None])
+        OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
+        OLDVPRED = tf.placeholder(tf.float32, [None])
+        LR = tf.placeholder(tf.float32, [])
+        CLIPRANGE = tf.placeholder(tf.float32, [])
+
+        neglogpac = train_model.pd.neglogp(A)
+        entropy = tf.reduce_mean(train_model.pd.entropy())
+
+        vpred = train_model.vf
+        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE)
+        vf_losses1 = tf.square(vpred - R)
+        vf_losses2 = tf.square(vpredclipped - R)
+        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
+        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
+        pg_losses = -ADV * ratio
+        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)
+        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
+        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
+        clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
+        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
+        with tf.variable_scope('model'):
+            params = tf.trainable_variables()
+        grads = tf.gradients(loss, params)
+        if max_grad_norm is not None:
+            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
+        grads = list(zip(grads, params))
+        trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
+        _train = trainer.apply_gradients(grads)
+
+        def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
+            advs = returns - values
+            advs = (advs - advs.mean()) / (advs.std() + 1e-8)
+            td_map = {train_model.X:obs, A:actions, ADV:advs, R:returns, LR:lr, 
+                    CLIPRANGE:cliprange, OLDNEGLOGPAC:neglogpacs, OLDVPRED:values}
+            if states is not None:
+                td_map[train_model.S] = states
+                td_map[train_model.M] = masks
+            return sess.run(
+                [pg_loss, vf_loss, entropy, approxkl, clipfrac, _train],
+                td_map
+            )[:-1]
+        self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
+
+        def save(save_path):
+            ps = sess.run(params)
+            joblib.dump(ps, save_path)
+
+        def load(load_path):
+            loaded_params = joblib.load(load_path)
+            restores = []
+            for p, loaded_p in zip(params, loaded_params):
+                restores.append(p.assign(loaded_p))
+            sess.run(restores)
+
+        self.train = train
+        self.train_model = train_model
+        self.act_model = act_model
+        self.step = act_model.step
+        self.value = act_model.value
+        self.initial_state = act_model.initial_state
+        self.save = save
+        self.load = load
+        tf.global_variables_initializer().run(session=sess) #pylint: disable=E1101
+
+class Runner(object):
+
+    def __init__(self, *, env, model, nsteps, gamma, lam):
+        self.env = env
+        self.model = model
+        nenv = env.num_envs
+        self.obs = np.zeros((nenv,) + env.observation_space.shape, dtype=model.train_model.X.dtype.name)
+        self.obs[:] = env.reset()
+        self.gamma = gamma
+        self.lam = lam
+        self.nsteps = nsteps
+        self.states = model.initial_state
+        self.dones = [False for _ in range(nenv)]
+
+    def run(self):
+        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [],[],[],[],[],[]
+        mb_states = self.states
+        epinfos = []
+        for _ in range(self.nsteps):
+            actions, values, self.states, neglogpacs = self.model.step(self.obs, self.states, self.dones)
+            mb_obs.append(self.obs.copy())
+            mb_actions.append(actions)
+            mb_values.append(values)
+            mb_neglogpacs.append(neglogpacs)
+            mb_dones.append(self.dones)            
+            self.obs[:], rewards, self.dones, infos = self.env.step(actions)
+            for info in infos:
+                maybeepinfo = info.get('episode')
+                if maybeepinfo: epinfos.append(maybeepinfo)
+            mb_rewards.append(rewards)
+        #batch of steps to batch of rollouts
+        mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
+        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
+        mb_actions = np.asarray(mb_actions)
+        mb_values = np.asarray(mb_values, dtype=np.float32)
+        mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
+        mb_dones = np.asarray(mb_dones, dtype=np.bool)
+        last_values = self.model.value(self.obs, self.states, self.dones)
+        #discount/bootstrap off value fn
+        mb_returns = np.zeros_like(mb_rewards)
+        mb_advs = np.zeros_like(mb_rewards)
+        lastgaelam = 0        
+        for t in reversed(range(self.nsteps)):
+            if t == self.nsteps - 1:
+                nextnonterminal = 1.0 - self.dones
+                nextvalues = last_values
+            else:
+                nextnonterminal = 1.0 - mb_dones[t+1]
+                nextvalues = mb_values[t+1]
+            delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t]
+            mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam
+        mb_returns = mb_advs + mb_values
+        return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs)), 
+            mb_states, epinfos)
+# obs, returns, masks, actions, values, neglogpacs, states = runner.run()
+def sf01(arr):
+    """
+    swap and then flatten axes 0 and 1
+    """
+    s = arr.shape
+    return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])
+
+def constfn(val):
+    def f(_):
+        return val
+    return f
+
+def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr, 
+            vf_coef=0.5,  max_grad_norm=0.5, gamma=0.99, lam=0.95, 
+            log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
+            save_interval=0):
+
+    if isinstance(lr, float): lr = constfn(lr)
+    else: assert callable(lr)
+    if isinstance(cliprange, float): cliprange = constfn(cliprange)
+    else: assert callable(cliprange)
+    total_timesteps = int(total_timesteps)
+
+    nenvs = env.num_envs
+    ob_space = env.observation_space
+    ac_space = env.action_space
+    nbatch = nenvs * nsteps
+    nbatch_train = nbatch // nminibatches
+
+    make_model = lambda : Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, 
+                    nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
+                    max_grad_norm=max_grad_norm)
+    if save_interval and logger.get_dir():
+        import cloudpickle
+        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
+            fh.write(cloudpickle.dumps(make_model))
+    model = make_model()
+    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
+
+    epinfobuf = deque(maxlen=100)
+    tfirststart = time.time()
+
+    nupdates = total_timesteps//nbatch
+    for update in range(1, nupdates+1):
+        assert nbatch % nminibatches == 0
+        nbatch_train = nbatch // nminibatches
+        tstart = time.time()
+        frac = 1.0 - (update - 1.0) / nupdates
+        lrnow = lr(frac)
+        cliprangenow = cliprange(frac)
+        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run() #pylint: disable=E0632
+        epinfobuf.extend(epinfos)
+        mblossvals = []
+        if states is None: # nonrecurrent version
+            inds = np.arange(nbatch)
+            for _ in range(noptepochs):
+                np.random.shuffle(inds)
+                for start in range(0, nbatch, nbatch_train):
+                    end = start + nbatch_train
+                    mbinds = inds[start:end]
+                    slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
+                    mblossvals.append(model.train(lrnow, cliprangenow, *slices))
+        else: # recurrent version
+            assert nenvs % nminibatches == 0
+            envsperbatch = nenvs // nminibatches
+            envinds = np.arange(nenvs)
+            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
+            envsperbatch = nbatch_train // nsteps
+            for _ in range(noptepochs):
+                np.random.shuffle(envinds)
+                for start in range(0, nenvs, envsperbatch):
+                    end = start + envsperbatch
+                    mbenvinds = envinds[start:end]
+                    mbflatinds = flatinds[mbenvinds].ravel()
+                    slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
+                    mbstates = states[mbenvinds]
+                    mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))            
+
+        lossvals = np.mean(mblossvals, axis=0)
+        tnow = time.time()
+        fps = int(nbatch / (tnow - tstart))
+        if update % log_interval == 0 or update == 1:
+            ev = explained_variance(values, returns)
+            logger.logkv("serial_timesteps", update*nsteps)
+            logger.logkv("nupdates", update)
+            logger.logkv("total_timesteps", update*nbatch)
+            logger.logkv("fps", fps)
+            logger.logkv("explained_variance", float(ev))
+            logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
+            logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
+            logger.logkv('time_elapsed', tnow - tfirststart)
+            for (lossval, lossname) in zip(lossvals, model.loss_names):
+                logger.logkv(lossname, lossval)
+            logger.dumpkvs()
+        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
+            checkdir = osp.join(logger.get_dir(), 'checkpoints')
+            os.makedirs(checkdir, exist_ok=True)
+            savepath = osp.join(checkdir, '%.5i'%update)
+            print('Saving to', savepath)
+            model.save(savepath)
+    env.close()
+
+def safemean(xs):
+    return np.nan if len(xs) == 0 else np.mean(xs)
--- a/baselines/ppo2/run_atari.py
+++ b/baselines/ppo2/run_atari.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python
+import sys
+import argparse
+from baselines import bench, logger
+
+def train(env_id, num_timesteps, seed, policy):
+    from baselines.common import set_global_seeds
+    from baselines.common.atari_wrappers import make_atari, wrap_deepmind
+    from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
+    from baselines.common.vec_env.vec_frame_stack import VecFrameStack
+    from baselines.ppo2 import ppo2
+    from baselines.ppo2.policies import CnnPolicy, LstmPolicy, LnLstmPolicy
+    import gym
+    import logging
+    import multiprocessing
+    import os.path as osp
+    import tensorflow as tf
+    ncpu = multiprocessing.cpu_count()
+    if sys.platform == 'darwin': ncpu //= 2
+    config = tf.ConfigProto(allow_soft_placement=True,
+                            intra_op_parallelism_threads=ncpu,
+                            inter_op_parallelism_threads=ncpu)
+    config.gpu_options.allow_growth = True #pylint: disable=E1101
+    gym.logger.setLevel(logging.WARN)
+    tf.Session(config=config).__enter__()
+
+    def make_env(rank):
+        def env_fn():
+            env = make_atari(env_id)
+            env.seed(seed + rank)
+            env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
+            return wrap_deepmind(env)
+        return env_fn
+    nenvs = 8
+    env = SubprocVecEnv([make_env(i) for i in range(nenvs)])
+    set_global_seeds(seed)
+    env = VecFrameStack(env, 4)
+    policy = {'cnn' : CnnPolicy, 'lstm' : LstmPolicy, 'lnlstm' : LnLstmPolicy}[policy]
+    ppo2.learn(policy=policy, env=env, nsteps=128, nminibatches=4,
+        lam=0.95, gamma=0.99, noptepochs=4, log_interval=1,
+        ent_coef=.01,
+        lr=lambda f : f * 2.5e-4,
+        cliprange=lambda f : f * 0.1,
+        total_timesteps=int(num_timesteps * 1.1))
+
+def main():
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
+    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
+    parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm'], default='cnn')
+    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
+    args = parser.parse_args()
+    logger.configure()
+    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed,
+        policy=args.policy)
+
+if __name__ == '__main__':
+    main()
--- a/baselines/ppo2/run_mujoco.py
+++ b/baselines/ppo2/run_mujoco.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+import argparse
+from baselines import bench, logger
+
+def train(env_id, num_timesteps, seed):
+    from baselines.common import set_global_seeds
+    from baselines.common.vec_env.vec_normalize import VecNormalize
+    from baselines.ppo2 import ppo2
+    from baselines.ppo2.policies import MlpPolicy
+    import gym
+    import tensorflow as tf
+    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
+    ncpu = 1
+    config = tf.ConfigProto(allow_soft_placement=True,
+                            intra_op_parallelism_threads=ncpu,
+                            inter_op_parallelism_threads=ncpu)
+    tf.Session(config=config).__enter__()
+    def make_env():
+        env = gym.make(env_id)
+        env = bench.Monitor(env, logger.get_dir())
+        return env
+    env = DummyVecEnv([make_env])
+    env = VecNormalize(env)
+
+    set_global_seeds(seed)
+    policy = MlpPolicy
+    ppo2.learn(policy=policy, env=env, nsteps=2048, nminibatches=32,
+        lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
+        ent_coef=0.0,
+        lr=3e-4,
+        cliprange=0.2,
+        total_timesteps=num_timesteps)
+
+
+def main():
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--env', help='environment ID', default='Hopper-v1')
+    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
+    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
+    args = parser.parse_args()
+    logger.configure()
+    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
+
+
+if __name__ == '__main__':
+    main()
+
--- a/baselines/results_plotter.py
+++ b/baselines/results_plotter.py
@@ -0,0 +1,87 @@
+import numpy as np
+import matplotlib
+matplotlib.use('TkAgg') # Can change to 'Agg' for non-interactive mode
+
+import matplotlib.pyplot as plt
+plt.rcParams['svg.fonttype'] = 'none'
+
+from baselines.bench.monitor import load_results
+
+X_TIMESTEPS = 'timesteps'
+X_EPISODES = 'episodes'
+X_WALLTIME = 'walltime_hrs'
+POSSIBLE_X_AXES = [X_TIMESTEPS, X_EPISODES, X_WALLTIME]
+EPISODES_WINDOW = 100
+COLORS = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink',
+        'brown', 'orange', 'teal', 'coral', 'lightblue', 'lime', 'lavender', 'turquoise',
+        'darkgreen', 'tan', 'salmon', 'gold', 'lightpurple', 'darkred', 'darkblue']
+
+def rolling_window(a, window):
+    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
+    strides = a.strides + (a.strides[-1],)
+    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
+
+def window_func(x, y, window, func):
+    yw = rolling_window(y, window)
+    yw_func = func(yw, axis=-1)
+    return x[window-1:], yw_func
+
+def ts2xy(ts, xaxis):
+    if xaxis == X_TIMESTEPS:
+        x = np.cumsum(ts.l.values)
+        y = ts.r.values
+    elif xaxis == X_EPISODES:
+        x = np.arange(len(ts))
+        y = ts.r.values
+    elif xaxis == X_WALLTIME:
+        x = ts.t.values / 3600.
+        y = ts.r.values
+    else:
+        raise NotImplementedError
+    return x, y
+
+def plot_curves(xy_list, xaxis, title):
+    plt.figure(figsize=(8,2))
+    maxx = max(xy[0][-1] for xy in xy_list)
+    minx = 0
+    for (i, (x, y)) in enumerate(xy_list):
+        color = COLORS[i]
+        plt.scatter(x, y, s=2)
+        x, y_mean = window_func(x, y, EPISODES_WINDOW, np.mean) #So returns average of last EPISODE_WINDOW episodes
+        plt.plot(x, y_mean, color=color)
+    plt.xlim(minx, maxx)
+    plt.title(title)
+    plt.xlabel(xaxis)
+    plt.ylabel("Episode Rewards")
+    plt.tight_layout()
+
+def plot_results(dirs, num_timesteps, xaxis, task_name):
+    tslist = []
+    for dir in dirs:
+        ts = load_results(dir)
+        ts = ts[ts.l.cumsum() <= num_timesteps]
+        tslist.append(ts)
+    xy_list = [ts2xy(ts, xaxis) for ts in tslist]
+    plot_curves(xy_list, xaxis, task_name)
+
+# Example usage in jupyter-notebook
+# from baselines import log_viewer
+# %matplotlib inline
+# log_viewer.plot_results(["./log"], 10e6, log_viewer.X_TIMESTEPS, "Breakout")
+# Here ./log is a directory containing the monitor.csv files
+
+def main():
+    import argparse
+    import os
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--dirs', help='List of log directories', nargs = '*', default=['./log'])
+    parser.add_argument('--num_timesteps', type=int, default=int(10e6))
+    parser.add_argument('--xaxis', help = 'Varible on X-axis', default = X_TIMESTEPS)
+    parser.add_argument('--task_name', help = 'Title of plot', default = 'Breakout')
+    args = parser.parse_args()
+    args.dirs = [os.path.abspath(dir) for dir in args.dirs]
+    plot_results(args.dirs, args.num_timesteps, args.xaxis, args.task_name)
+    plt.show()
+
+if __name__ == '__main__':
+    main()