microbatches in ppo2, custom frame size in WarpFrame, matching fc layer only when needed (#707)
* joshim5 changes (width and height to WarpFrame wrapper)
* match network output with action distribution via a linear layer only if necessary (#167)
* support color vs. grayscale option in WarpFrame wrapper (#166)
* support color vs. grayscale option in WarpFrame wrapper
* Support color in other wrappers
* Updated per Peter's suggestions
* fixing test failures
* ppo2 with microbatches (#168)
* pass microbatch_size to the model during construction
* microbatch fixes and test (#169)
* microbatch fixes and test
* tiny cleanup
* added assertions to the test
* vpg-related fix
* Peterz joshim5 subclass ppo2 model (#170)
* microbatch fixes and test
* tiny cleanup
* added assertions to the test
* vpg-related fix
* subclassing the model to make microbatched version of model WIP
* made microbatched model a subclass of ppo2 Model
* flake8 complaint
* mpi-less ppo2 (resolving merge conflict)
* flake8 and mpi4py imports in ppo2/model.py
* more un-mpying
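The integration point for microbatching is the new model_fn argument on ppo2.learn, which lets callers swap in MicrobatchedModel (see the diffs below). A minimal usage sketch, assuming a vectorized CartPole environment as in the new test; the timestep count is illustrative:

from functools import partial
import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.ppo2.ppo2 import learn
from baselines.ppo2.microbatched_model import MicrobatchedModel

# Compute gradients two samples at a time, then apply their average once per minibatch.
env = DummyVecEnv([lambda: gym.make('CartPole-v0')])
model = learn(network='mlp', env=env, total_timesteps=2048,
              model_fn=partial(MicrobatchedModel, microbatch_size=2))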
@@ -103,9 +103,9 @@ def test_coexistence(learn_fn, network_fn):
    kwargs.update(learn_kwargs[learn_fn])

    learn = partial(learn, env=env, network=network_fn, total_timesteps=0, **kwargs)
    make_session(make_default=True, graph=tf.Graph());
    make_session(make_default=True, graph=tf.Graph())
    model1 = learn(seed=1)
    make_session(make_default=True, graph=tf.Graph());
    make_session(make_default=True, graph=tf.Graph())
    model2 = learn(seed=2)

    model1.step(env.observation_space.sample())
baselines/ppo2/microbatched_model.py (new file, 76 lines)
@@ -0,0 +1,76 @@
import tensorflow as tf
import numpy as np
from baselines.ppo2.model import Model


class MicrobatchedModel(Model):
    """
    Model that trains on one microbatch at a time - useful when gradient computation
    on the entire minibatch causes a memory overflow
    """
    def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                 nsteps, ent_coef, vf_coef, max_grad_norm, microbatch_size):

        self.nmicrobatches = nbatch_train // microbatch_size
        self.microbatch_size = microbatch_size
        assert nbatch_train % microbatch_size == 0, 'microbatch_size ({}) should divide nbatch_train ({}) evenly'.format(microbatch_size, nbatch_train)

        super().__init__(
            policy=policy,
            ob_space=ob_space,
            ac_space=ac_space,
            nbatch_act=nbatch_act,
            nbatch_train=microbatch_size,
            nsteps=nsteps,
            ent_coef=ent_coef,
            vf_coef=vf_coef,
            max_grad_norm=max_grad_norm)

        self.grads_ph = [tf.placeholder(dtype=g.dtype, shape=g.shape) for g in self.grads]
        grads_ph_and_vars = list(zip(self.grads_ph, self.var))
        self._apply_gradients_op = self.trainer.apply_gradients(grads_ph_and_vars)

    def train(self, lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
        assert states is None, "microbatches with recurrent models are not supported yet"

        # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
        # Returns = R + yV(s')
        advs = returns - values

        # Normalize the advantages
        advs = (advs - advs.mean()) / (advs.std() + 1e-8)

        # Initialize empty list for per-microbatch stats like pg_loss, vf_loss, entropy, approxkl (whatever is in self.stats_list)
        stats_vs = []

        for microbatch_idx in range(self.nmicrobatches):
            _sli = range(microbatch_idx * self.microbatch_size, (microbatch_idx + 1) * self.microbatch_size)
            td_map = {
                self.train_model.X: obs[_sli],
                self.A: actions[_sli],
                self.ADV: advs[_sli],
                self.R: returns[_sli],
                self.CLIPRANGE: cliprange,
                self.OLDNEGLOGPAC: neglogpacs[_sli],
                self.OLDVPRED: values[_sli]
            }

            # Compute gradient on a microbatch (note that variables do not change here) ...
            grad_v, stats_v = self.sess.run([self.grads, self.stats_list], td_map)
            if microbatch_idx == 0:
                sum_grad_v = grad_v
            else:
                # ... and add it to the running total of the gradients
                for i, g in enumerate(grad_v):
                    sum_grad_v[i] += g
            stats_vs.append(stats_v)

        feed_dict = {ph: sum_g / self.nmicrobatches for ph, sum_g in zip(self.grads_ph, sum_grad_v)}
        feed_dict[self.LR] = lr
        # Update variables using average of the gradients
        self.sess.run(self._apply_gradients_op, feed_dict)
        # Return average of the stats
        return np.mean(np.array(stats_vs), axis=0).tolist()
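The heart of MicrobatchedModel.train is plain gradient accumulation: run the gradient op on each slice, sum the results, divide by the number of microbatches, and feed that into apply_gradients once. A toy sketch of the accumulation step, where grad_fn and the batch layout are illustrative and not part of the diff:

import numpy as np

def averaged_gradients(grad_fn, batch, microbatch_size):
    # grad_fn(slice_of_batch) returns a list of gradient arrays for that slice
    nmicrobatches = len(batch) // microbatch_size
    total = None
    for i in range(nmicrobatches):
        sli = slice(i * microbatch_size, (i + 1) * microbatch_size)
        grads = grad_fn(batch[sli])
        total = grads if total is None else [t + g for t, g in zip(total, grads)]
    # Averaging per-microbatch gradients approximates a single pass over the whole minibatch
    return [t / nmicrobatches for t in total]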
baselines/ppo2/model.py (new file, 157 lines)
@@ -0,0 +1,157 @@
import tensorflow as tf
import functools

from baselines.common.tf_util import get_session, save_variables, load_variables
from baselines.common.tf_util import initialize

try:
    from baselines.common.mpi_adam_optimizer import MpiAdamOptimizer
    from mpi4py import MPI
    from baselines.common.mpi_util import sync_from_root
except ImportError:
    MPI = None

class Model(object):
    """
    We use this object to:
    __init__:
        - Creates the step_model
        - Creates the train_model

    train():
        - Run the training step (feedforward and backpropagation of gradients)

    save/load():
        - Save/load the model
    """
    def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                 nsteps, ent_coef, vf_coef, max_grad_norm, microbatch_size=None):
        self.sess = sess = get_session()

        with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(nbatch_act, 1, sess)

            # Train model for training
            if microbatch_size is None:
                train_model = policy(nbatch_train, nsteps, sess)
            else:
                train_model = policy(microbatch_size, nsteps, sess)

        # CREATE THE PLACEHOLDERS
        self.A = A = train_model.pdtype.sample_placeholder([None])
        self.ADV = ADV = tf.placeholder(tf.float32, [None])
        self.R = R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of old critic
        self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
        self.LR = LR = tf.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting the premature convergence to a suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value to reduce variability during Critic training
        # Get the predicted value
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE)
        # Unclipped value
        vf_losses1 = tf.square(vpred - R)
        # Clipped value
        vf_losses2 = tf.square(vpredclipped - R)

        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # Calculate ratio (pi current policy / pi old policy)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

        # Defining Loss = - J is equivalent to max J
        pg_losses = -ADV * ratio

        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)

        # Final PG loss
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        params = tf.trainable_variables('ppo2_model')
        # 2. Build our trainer
        if MPI is not None:
            self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5)
        else:
            self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = self.trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
        self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]

        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

        if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
            initialize()
        else:
            global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")
            sync_from_root(sess, global_variables) #pylint: disable=E1101

    def train(self, lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
        # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
        # Returns = R + yV(s')
        advs = returns - values

        # Normalize the advantages
        advs = (advs - advs.mean()) / (advs.std() + 1e-8)

        td_map = {
            self.train_model.X : obs,
            self.A : actions,
            self.ADV : advs,
            self.R : returns,
            self.LR : lr,
            self.CLIPRANGE : cliprange,
            self.OLDNEGLOGPAC : neglogpacs,
            self.OLDVPRED : values
        }
        if states is not None:
            td_map[self.train_model.S] = states
            td_map[self.train_model.M] = masks

        return self.sess.run(
            self.stats_list + [self._train_op],
            td_map
        )[:-1]
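For reference, the clipped surrogate and clipped value losses built by the graph above, written out in plain numpy; a sketch for illustration only, with locally chosen names that are not part of the diff:

import numpy as np

def ppo_losses(adv, ratio, vpred, old_vpred, returns, cliprange):
    # Policy loss: elementwise max of the unclipped and clipped surrogates, then mean
    pg_loss = np.mean(np.maximum(-adv * ratio,
                                 -adv * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)))
    # Value loss: penalize the worse of the clipped and unclipped value errors
    vpredclipped = old_vpred + np.clip(vpred - old_vpred, -cliprange, cliprange)
    vf_loss = 0.5 * np.mean(np.maximum(np.square(vpred - returns),
                                       np.square(vpredclipped - returns)))
    return pg_loss, vf_loss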
baselines/ppo2/ppo2.py
@@ -1,226 +1,17 @@
import os
import time
import functools
import numpy as np
import os.path as osp
import tensorflow as tf
from baselines import logger
from collections import deque
from baselines.common import explained_variance, set_global_seeds
from baselines.common.policies import build_policy
from baselines.common.runners import AbstractEnvRunner
from baselines.common.tf_util import get_session, save_variables, load_variables

try:
    from baselines.common.mpi_adam_optimizer import MpiAdamOptimizer
    from mpi4py import MPI
    from baselines.common.mpi_util import sync_from_root
except ImportError:
    MPI = None
from baselines.ppo2.runner import Runner

from baselines.common.tf_util import initialize

class Model(object):
    """
    We use this object to:
    __init__:
        - Creates the step_model
        - Creates the train_model

    train():
        - Run the training step (feedforward and backpropagation of gradients)

    save/load():
        - Save/load the model
    """
    def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                 nsteps, ent_coef, vf_coef, max_grad_norm):
        sess = get_session()

        with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(nbatch_act, 1, sess)

            # Train model for training
            train_model = policy(nbatch_train, nsteps, sess)

        # CREATE THE PLACEHOLDERS
        A = train_model.pdtype.sample_placeholder([None])
        ADV = tf.placeholder(tf.float32, [None])
        R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor
        OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of old critic
        OLDVPRED = tf.placeholder(tf.float32, [None])
        LR = tf.placeholder(tf.float32, [])
        # Cliprange
        CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting the premature convergence to a suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value to reduce variability during Critic training
        # Get the predicted value
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE)
        # Unclipped value
        vf_losses1 = tf.square(vpred - R)
        # Clipped value
        vf_losses2 = tf.square(vpredclipped - R)

        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # Calculate ratio (pi current policy / pi old policy)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

        # Defining Loss = - J is equivalent to max J
        pg_losses = -ADV * ratio

        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)

        # Final PG loss
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        params = tf.trainable_variables('ppo2_model')
        # 2. Build our trainer
        if MPI is not None:
            trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5)
        else:
            trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        _train = trainer.apply_gradients(grads_and_var)

        def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
            # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
            # Returns = R + yV(s')
            advs = returns - values

            # Normalize the advantages
            advs = (advs - advs.mean()) / (advs.std() + 1e-8)
            td_map = {train_model.X:obs, A:actions, ADV:advs, R:returns, LR:lr,
                      CLIPRANGE:cliprange, OLDNEGLOGPAC:neglogpacs, OLDVPRED:values}
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            return sess.run(
                [pg_loss, vf_loss, entropy, approxkl, clipfrac, _train],
                td_map
            )[:-1]
        self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']

        self.train = train
        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

        if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
            initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")

        if MPI is not None:
            sync_from_root(sess, global_variables) #pylint: disable=E1101

class Runner(AbstractEnvRunner):
    """
    We use this object to make a mini batch of experiences
    __init__:
        - Initialize the runner

    run():
        - Make a mini batch
    """
    def __init__(self, *, env, model, nsteps, gamma, lam):
        super().__init__(env=env, model=model, nsteps=nsteps)
        # Lambda used in GAE (General Advantage Estimation)
        self.lam = lam
        # Discount rate
        self.gamma = gamma

    def run(self):
        # Here, we init the lists that will contain the minibatch of experiences
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [],[],[],[],[],[]
        mb_states = self.states
        epinfos = []
        # For each of the nsteps steps
        for _ in range(self.nsteps):
            # Given observations, get action, value and neglogpacs
            # We already have self.obs because the Runner superclass runs self.obs[:] = env.reset() on init
            actions, values, self.states, neglogpacs = self.model.step(self.obs, S=self.states, M=self.dones)
            mb_obs.append(self.obs.copy())
            mb_actions.append(actions)
            mb_values.append(values)
            mb_neglogpacs.append(neglogpacs)
            mb_dones.append(self.dones)

            # Take actions in env and look at the results
            # infos contains a ton of useful information
            self.obs[:], rewards, self.dones, infos = self.env.step(actions)
            for info in infos:
                maybeepinfo = info.get('episode')
                if maybeepinfo: epinfos.append(maybeepinfo)
            mb_rewards.append(rewards)
        # batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_actions = np.asarray(mb_actions)
        mb_values = np.asarray(mb_values, dtype=np.float32)
        mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
        mb_dones = np.asarray(mb_dones, dtype=np.bool)
        last_values = self.model.value(self.obs, S=self.states, M=self.dones)

        # discount/bootstrap off value fn
        mb_returns = np.zeros_like(mb_rewards)
        mb_advs = np.zeros_like(mb_rewards)
        lastgaelam = 0
        for t in reversed(range(self.nsteps)):
            if t == self.nsteps - 1:
                nextnonterminal = 1.0 - self.dones
                nextvalues = last_values
            else:
                nextnonterminal = 1.0 - mb_dones[t+1]
                nextvalues = mb_values[t+1]
            delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t]
            mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam
        mb_returns = mb_advs + mb_values
        return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs)),
                mb_states, epinfos)
# obs, returns, masks, actions, values, neglogpacs, states = runner.run()
def sf01(arr):
    """
    swap and then flatten axes 0 and 1
    """
    s = arr.shape
    return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])

def constfn(val):
    def f(_):
@@ -230,7 +21,7 @@ def constfn(val):
def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4,
            vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95,
            log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
            save_interval=0, load_path=None, **network_kwargs):
            save_interval=0, load_path=None, model_fn=None, **network_kwargs):
    '''
    Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)

@@ -308,10 +99,14 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2
    nbatch_train = nbatch // nminibatches

    # Instantiate the model object (that creates act_model and train_model)
    make_model = lambda : Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
    if model_fn is None:
        from baselines.ppo2.model import Model
        model_fn = Model

    model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
                     nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm)
    model = make_model()

    if load_path is not None:
        model.load(load_path)
    # Instantiate the runner object
@@ -319,8 +114,6 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2
    if eval_env is not None:
        eval_runner = Runner(env = eval_env, model = model, nsteps = nsteps, gamma = gamma, lam= lam)

    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

baselines/ppo2/runner.py (new file, 76 lines)
@@ -0,0 +1,76 @@
import numpy as np
from baselines.common.runners import AbstractEnvRunner

class Runner(AbstractEnvRunner):
    """
    We use this object to make a mini batch of experiences
    __init__:
        - Initialize the runner

    run():
        - Make a mini batch
    """
    def __init__(self, *, env, model, nsteps, gamma, lam):
        super().__init__(env=env, model=model, nsteps=nsteps)
        # Lambda used in GAE (General Advantage Estimation)
        self.lam = lam
        # Discount rate
        self.gamma = gamma

    def run(self):
        # Here, we init the lists that will contain the minibatch of experiences
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [],[],[],[],[],[]
        mb_states = self.states
        epinfos = []
        # For each of the nsteps steps
        for _ in range(self.nsteps):
            # Given observations, get action, value and neglogpacs
            # We already have self.obs because the Runner superclass runs self.obs[:] = env.reset() on init
            actions, values, self.states, neglogpacs = self.model.step(self.obs, S=self.states, M=self.dones)
            mb_obs.append(self.obs.copy())
            mb_actions.append(actions)
            mb_values.append(values)
            mb_neglogpacs.append(neglogpacs)
            mb_dones.append(self.dones)

            # Take actions in env and look at the results
            # infos contains a ton of useful information
            self.obs[:], rewards, self.dones, infos = self.env.step(actions)
            for info in infos:
                maybeepinfo = info.get('episode')
                if maybeepinfo: epinfos.append(maybeepinfo)
            mb_rewards.append(rewards)
        # batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_actions = np.asarray(mb_actions)
        mb_values = np.asarray(mb_values, dtype=np.float32)
        mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
        mb_dones = np.asarray(mb_dones, dtype=np.bool)
        last_values = self.model.value(self.obs, S=self.states, M=self.dones)

        # discount/bootstrap off value fn
        mb_returns = np.zeros_like(mb_rewards)
        mb_advs = np.zeros_like(mb_rewards)
        lastgaelam = 0
        for t in reversed(range(self.nsteps)):
            if t == self.nsteps - 1:
                nextnonterminal = 1.0 - self.dones
                nextvalues = last_values
            else:
                nextnonterminal = 1.0 - mb_dones[t+1]
                nextvalues = mb_values[t+1]
            delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t]
            mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam
        mb_returns = mb_advs + mb_values
        return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs)),
                mb_states, epinfos)
# obs, returns, masks, actions, values, neglogpacs, states = runner.run()
def sf01(arr):
    """
    swap and then flatten axes 0 and 1
    """
    s = arr.shape
    return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])

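Runner.run collects arrays shaped (nsteps, nenvs, ...) and sf01 flattens them into a training batch of shape (nsteps * nenvs, ...). A quick sketch of that behaviour, with illustrative shapes:

import numpy as np
from baselines.ppo2.runner import sf01

x = np.zeros((32, 4, 84, 84))           # (nsteps, nenvs, *obs_shape)
assert sf01(x).shape == (128, 84, 84)   # steps and envs merged into one batch axis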
baselines/ppo2/test_microbatches.py (new file, 34 lines)
@@ -0,0 +1,34 @@
import gym
import tensorflow as tf
import numpy as np
from functools import partial

from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.tf_util import make_session
from baselines.ppo2.ppo2 import learn

from baselines.ppo2.microbatched_model import MicrobatchedModel

def test_microbatches():
    def env_fn():
        env = gym.make('CartPole-v0')
        env.seed(0)
        return env

    learn_fn = partial(learn, network='mlp', nsteps=32, total_timesteps=32, seed=0)

    env_ref = DummyVecEnv([env_fn])
    sess_ref = make_session(make_default=True, graph=tf.Graph())
    learn_fn(env=env_ref)
    vars_ref = {v.name: sess_ref.run(v) for v in tf.trainable_variables()}

    env_test = DummyVecEnv([env_fn])
    sess_test = make_session(make_default=True, graph=tf.Graph())
    learn_fn(env=env_test, model_fn=partial(MicrobatchedModel, microbatch_size=2))
    vars_test = {v.name: sess_test.run(v) for v in tf.trainable_variables()}

    for v in vars_ref:
        np.testing.assert_allclose(vars_ref[v], vars_test[v], atol=1e-3)

if __name__ == '__main__':
    test_microbatches()