From 217b111c887a80e1350a19d0daad91a2782b303f Mon Sep 17 00:00:00 2001 From: Peter Zhokhov Date: Fri, 10 Aug 2018 14:14:46 -0700 Subject: [PATCH] merged refactor --- .benchmark_pattern | 1 + .gitignore | 2 - .travis.yml | 4 +- Dockerfile | 16 +- README.md | 54 ++++ baselines/a2c/a2c.py | 174 ++++++----- baselines/a2c/policies.py | 146 --------- baselines/a2c/run_atari.py | 30 -- baselines/a2c/runner.py | 60 ++++ baselines/a2c/utils.py | 16 +- baselines/acer/{acer_simple.py => acer.py} | 190 +++++++----- baselines/acer/defaults.py | 4 + baselines/acer/policies.py | 6 +- baselines/acer/run_atari.py | 30 -- baselines/acer/runner.py | 60 ++++ baselines/acktr/acktr.py | 1 + baselines/acktr/acktr_disc.py | 62 ++-- baselines/acktr/run_atari.py | 4 +- baselines/bench/benchmarks.py | 5 +- baselines/bench/monitor.py | 4 +- baselines/common/atari_wrappers.py | 2 + baselines/common/cmd_util.py | 64 +++- baselines/common/distributions.py | 18 +- baselines/common/input.py | 62 ++-- baselines/common/misc_util.py | 13 +- baselines/common/models.py | 177 +++++++++++ baselines/common/mpi_adam_optimizer.py | 31 ++ baselines/common/mpi_util.py | 101 ++++++ baselines/common/policies.py | 179 +++++++++++ baselines/common/retro_wrappers.py | 293 ++++++++++++++++++ baselines/common/runners.py | 3 +- baselines/common/running_mean_std.py | 165 +++++++++- baselines/common/test_identity.py | 44 --- baselines/common/tests/__init__.py | 0 baselines/common/tests/envs/__init__.py | 0 .../common/tests/envs/fixed_sequence_env.py | 44 +++ baselines/common/tests/envs/identity_env.py | 70 +++++ baselines/common/tests/envs/mnist_env.py | 70 +++++ baselines/common/tests/test_cartpole.py | 40 +++ baselines/common/tests/test_fixed_sequence.py | 51 +++ baselines/common/tests/test_identity.py | 55 ++++ baselines/common/tests/test_mnist.py | 50 +++ baselines/common/tests/test_serialization.py | 97 ++++++ baselines/common/tests/util.py | 91 ++++++ baselines/common/tf_util.py | 147 +++++++-- baselines/common/vec_env/dummy_vec_env.py | 21 +- baselines/common/vec_env/subproc_vec_env.py | 42 +-- baselines/common/vec_env/vec_normalize.py | 2 + baselines/ddpg/ddpg.py | 4 +- baselines/deepq/__init__.py | 4 +- baselines/deepq/{simple.py => deepq.py} | 57 +++- baselines/deepq/defaults.py | 21 ++ baselines/deepq/experiments/enjoy_retro.py | 34 ++ baselines/deepq/experiments/run_retro.py | 49 +++ baselines/deepq/models.py | 38 +++ baselines/deepq/test_identity.py | 43 --- baselines/deepq/utils.py | 3 +- baselines/ppo1/run_atari.py | 2 +- baselines/ppo2/defaults.py | 22 ++ baselines/ppo2/policies.py | 146 --------- baselines/ppo2/ppo2.py | 128 ++++++-- baselines/ppo2/run_atari.py | 40 --- baselines/ppo2/run_mujoco.py | 57 ---- baselines/run.py | 230 ++++++++++++++ baselines/trpo_mpi/defaults.py | 30 ++ baselines/trpo_mpi/nosharing_cnn_policy.py | 56 ---- baselines/trpo_mpi/run_atari.py | 43 --- baselines/trpo_mpi/run_mujoco.py | 36 --- baselines/trpo_mpi/trpo_mpi.py | 138 +++++++-- conftest.py | 19 ++ setup.py | 7 +- 71 files changed, 2939 insertions(+), 1069 deletions(-) create mode 100644 .benchmark_pattern delete mode 100644 baselines/a2c/policies.py delete mode 100644 baselines/a2c/run_atari.py create mode 100644 baselines/a2c/runner.py rename baselines/acer/{acer_simple.py => acer.py} (64%) create mode 100644 baselines/acer/defaults.py delete mode 100644 baselines/acer/run_atari.py create mode 100644 baselines/acer/runner.py create mode 100644 baselines/acktr/acktr.py create mode 100644 baselines/common/models.py create mode 100644 
baselines/common/mpi_adam_optimizer.py create mode 100644 baselines/common/mpi_util.py create mode 100644 baselines/common/policies.py create mode 100644 baselines/common/retro_wrappers.py delete mode 100644 baselines/common/test_identity.py create mode 100644 baselines/common/tests/__init__.py create mode 100644 baselines/common/tests/envs/__init__.py create mode 100644 baselines/common/tests/envs/fixed_sequence_env.py create mode 100644 baselines/common/tests/envs/identity_env.py create mode 100644 baselines/common/tests/envs/mnist_env.py create mode 100644 baselines/common/tests/test_cartpole.py create mode 100644 baselines/common/tests/test_fixed_sequence.py create mode 100644 baselines/common/tests/test_identity.py create mode 100644 baselines/common/tests/test_mnist.py create mode 100644 baselines/common/tests/test_serialization.py create mode 100644 baselines/common/tests/util.py rename baselines/deepq/{simple.py => deepq.py} (90%) create mode 100644 baselines/deepq/defaults.py create mode 100644 baselines/deepq/experiments/enjoy_retro.py create mode 100644 baselines/deepq/experiments/run_retro.py delete mode 100644 baselines/deepq/test_identity.py create mode 100644 baselines/ppo2/defaults.py delete mode 100644 baselines/ppo2/policies.py delete mode 100644 baselines/ppo2/run_atari.py delete mode 100644 baselines/ppo2/run_mujoco.py create mode 100644 baselines/run.py create mode 100644 baselines/trpo_mpi/defaults.py delete mode 100644 baselines/trpo_mpi/nosharing_cnn_policy.py delete mode 100644 baselines/trpo_mpi/run_atari.py delete mode 100644 baselines/trpo_mpi/run_mujoco.py create mode 100644 conftest.py diff --git a/.benchmark_pattern b/.benchmark_pattern new file mode 100644 index 0000000..e53df25 --- /dev/null +++ b/.benchmark_pattern @@ -0,0 +1 @@ +ppo2 diff --git a/.gitignore b/.gitignore index 722e942..a41103d 100644 --- a/.gitignore +++ b/.gitignore @@ -34,5 +34,3 @@ src .cache MUJOCO_LOG.TXT - - diff --git a/.travis.yml b/.travis.yml index 5ba3ead..e267785 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,5 +10,5 @@ install: - docker build . -t baselines-test script: - - flake8 --select=F baselines/common - - docker run baselines-test pytest + - flake8 --select=F,E999 baselines/common baselines/trpo_mpi baselines/ppo2 baselines/a2c baselines/deepq baselines/acer + - docker run baselines-test pytest --runslow diff --git a/Dockerfile b/Dockerfile index eeac22a..1d432f3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,20 +1,24 @@ FROM ubuntu:16.04 -RUN apt-get -y update && apt-get -y install git wget python-dev python3-dev libopenmpi-dev python-pip zlib1g-dev cmake +RUN apt-get -y update && apt-get -y install git wget python-dev python3-dev libopenmpi-dev python-pip zlib1g-dev cmake python-opencv ENV CODE_DIR /root/code ENV VENV /root/venv -COPY . $CODE_DIR/baselines RUN \ pip install virtualenv && \ virtualenv $VENV --python=python3 && \ . $VENV/bin/activate && \ - cd $CODE_DIR && \ - pip install --upgrade pip && \ - pip install -e baselines && \ - pip install pytest + pip install --upgrade pip ENV PATH=$VENV/bin:$PATH + +COPY . $CODE_DIR/baselines WORKDIR $CODE_DIR/baselines +# Clean up pycache and pyc files +RUN rm -rf __pycache__ && \ + find . 
-name "*.pyc" -delete && \ + pip install -e .[test] + + CMD /bin/bash diff --git a/README.md b/README.md index 197f01a..e8a4abb 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,60 @@ pip install pytest pytest ``` +## Subpackages + +## Testing the installation +All unit tests in baselines can be run using pytest runner: +``` +pip install pytest +pytest +``` + +## Training models +Most of the algorithms in baselines repo are used as follows: +```bash + python -m baselines.run --alg= --env= [additional arguments] +``` +### Example 1. PPO with MuJoCo Humanoid +For instance, to train a fully-connected network controlling MuJoCo humanoid using a2c for 20M timesteps +```bash + python -m baselines.run --alg=a2c --env=Humanoid-v2 --network=mlp --num_timesteps=2e7 +``` +Note that for mujoco environments fully-connected network is default, so we can omit `--network=mlp` +The hyperparameters for both network and the learning algorithm can be controlled via the command line, for instance: +```bash + python -m baselines.run --alg=a2c --env=Humanoid-v2 --network=mlp --num_timesteps=2e7 --ent_coef=0.1 --num_hidden=32 --num_layers=3 --value_network=copy +``` +will set entropy coeffient to 0.1, and construct fully connected network with 3 layers with 32 hidden units in each, and create a separate network for value function estimation (so that its parameters are not shared with the policy network, but the structure is the same) + +See docstrings in [common/models.py](common/models.py) for description of network parameters for each type of model, and +docstring for [baselines/ppo2/ppo2.py/learn()](ppo2/ppo2.py) fir the description of the ppo2 hyperparamters. + +### Example 2. DQN on Atari +DQN with Atari is at this point a classics of benchmarks. To run the baselines implementation of DQN on Atari Pong: +``` + python -m baselines.run --alg=deepq --env=PongNoFrameskip-v4 --num_timesteps=1e6 +``` + +## Saving, loading and visualizing models +The algorithms serialization API is not properly unified yet; however, there is a simple method to save / restore trained models. +`--save_path` and `--load_path` command-line option loads the tensorflow state from a given path before training, and saves it after the training, respectively. +Let's imagine you'd like to train ppo2 on Atari Pong, save the model and then later visualize what has it learnt. +```bash + python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num-timesteps=2e7 --save_path=~/models/pong_20M_ppo2 +``` +This should get to the mean reward per episode about 5k. To load and visualize the model, we'll do the following - load the model, train it for 0 steps, and then visualize: +```bash + python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num-timesteps=0 --load_path=~/models/pong_20M_ppo2 --play +``` + +*NOTE:* At the moment Mujoco training uses VecNormalize wrapper for the environment which is not being saved correctly; so loading the models trained on Mujoco will not work well if the environment is recreated. If necessary, you can work around that by replacing RunningMeanStd by TfRunningMeanStd in [baselines/common/vec_env/vec_normalize.py](baselines/common/vec_env/vec_normalize.py#L12). 
+ + + + + + ## Subpackages - [A2C](baselines/a2c) diff --git a/baselines/a2c/a2c.py b/baselines/a2c/a2c.py index f1de88a..4c3013d 100644 --- a/baselines/a2c/a2c.py +++ b/baselines/a2c/a2c.py @@ -1,42 +1,48 @@ -import os.path as osp import time -import joblib -import numpy as np import tensorflow as tf + from baselines import logger from baselines.common import set_global_seeds, explained_variance -from baselines.common.runners import AbstractEnvRunner from baselines.common import tf_util +from baselines.common.policies import build_policy -from baselines.a2c.utils import discount_with_dones -from baselines.a2c.utils import Scheduler, make_path, find_trainable_variables -from baselines.a2c.utils import cat_entropy, mse + +from baselines.a2c.utils import Scheduler, find_trainable_variables +from baselines.a2c.runner import Runner + +from tensorflow import losses class Model(object): - def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, + def __init__(self, policy, env, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'): - sess = tf_util.make_session() + sess = tf_util.get_session() + nenvs = env.num_envs nbatch = nenvs*nsteps - A = tf.placeholder(tf.int32, [nbatch]) + + with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE): + step_model = policy(nenvs, 1, sess) + train_model = policy(nbatch, nsteps, sess) + + A = tf.placeholder(train_model.action.dtype, train_model.action.shape) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) - step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) - train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True) + neglogpac = train_model.pd.neglogp(A) + entropy = tf.reduce_mean(train_model.pd.entropy()) - neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A) pg_loss = tf.reduce_mean(ADV * neglogpac) - vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) - entropy = tf.reduce_mean(cat_entropy(train_model.pi)) + vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R) + loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef - params = find_trainable_variables("model") + params = find_trainable_variables("a2c_model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) @@ -50,6 +56,7 @@ class Model(object): advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() + td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr} if states is not None: td_map[train_model.S] = states @@ -60,17 +67,6 @@ class Model(object): ) return policy_loss, value_loss, policy_entropy - def save(save_path): - ps = sess.run(params) - make_path(osp.dirname(save_path)) - joblib.dump(ps, save_path) - - def load(load_path): - loaded_params = joblib.load(load_path) - restores = [] - for p, loaded_p in zip(params, loaded_params): - restores.append(p.assign(loaded_p)) - sess.run(restores) self.train = train self.train_model = train_model @@ -78,66 +74,87 @@ class Model(object): self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state - self.save = save - self.load = load +
self.save = functools.partial(tf_util.save_variables, sess=sess) + self.load = functools.partial(tf_util.load_variables, sess=sess) tf.global_variables_initializer().run(session=sess) -class Runner(AbstractEnvRunner): - def __init__(self, env, model, nsteps=5, gamma=0.99): - super().__init__(env=env, model=model, nsteps=nsteps) - self.gamma = gamma +def learn( + network, + env, + seed=None, + nsteps=5, + total_timesteps=int(80e6), + vf_coef=0.5, + ent_coef=0.01, + max_grad_norm=0.5, + lr=7e-4, + lrschedule='linear', + epsilon=1e-5, + alpha=0.99, + gamma=0.99, + log_interval=100, + load_path=None, + **network_kwargs): + + ''' + Main entrypoint for the A2C algorithm. Train a policy with the given network architecture on a given environment using A2C. + + Parameters: + ----------- + + network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) + specifying the standard network architecture, or a function that takes a tensorflow tensor as input and returns + a tuple (output_tensor, extra_feed) where output_tensor is the last network layer output, extra_feed is None for feed-forward + neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. + See baselines.common/policies.py/lstm for more details on using recurrent nets in policies + + + env: RL environment. Should implement an interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py) + + + seed: seed to make the random number sequence in the algorithm reproducible. Defaults to None, which means the seed is taken from the system noise generator (not reproducible) + + nsteps: int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where + nenv is number of environment copies simulated in parallel) + + total_timesteps: int, total number of timesteps to train on (default: 80M) + + vf_coef: float, coefficient in front of the value function loss in the total loss function (default: 0.5) + + ent_coef: float, coefficient in front of the policy entropy in the total loss function (default: 0.01) + + max_grad_norm: float, gradient is clipped to have global L2 norm no more than this value (default: 0.5) + + lr: float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4) + + lrschedule: schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and + returns fraction of the learning rate (specified as lr) as output + + epsilon: float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5) + + alpha: float, RMSProp decay parameter (default: 0.99) + + gamma: float, reward discounting parameter (default: 0.99) + + log_interval: int, specifies how frequently the logs are printed out (default: 100) + + load_path: str, path to load the model from (default: None) + + **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network + For instance, 'mlp' network architecture has arguments num_hidden and num_layers.
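+    Example (illustrative sketch only; assumes a classic-control gym environment wrapped in
+    DummyVecEnv and a deliberately tiny timestep budget):
+
+        import gym
+        from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
+        from baselines.a2c.a2c import learn
+
+        # train a small fully-connected policy on CartPole
+        env = DummyVecEnv([lambda: gym.make('CartPole-v0')])
+        model = learn(network='mlp', env=env, total_timesteps=10000)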
+ + ''' + - def run(self): - mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[] - mb_states = self.states - for n in range(self.nsteps): - actions, values, states, _ = self.model.step(self.obs, self.states, self.dones) - mb_obs.append(np.copy(self.obs)) - mb_actions.append(actions) - mb_values.append(values) - mb_dones.append(self.dones) - obs, rewards, dones, _ = self.env.step(actions) - self.states = states - self.dones = dones - for n, done in enumerate(dones): - if done: - self.obs[n] = self.obs[n]*0 - self.obs = obs - mb_rewards.append(rewards) - mb_dones.append(self.dones) - #batch of steps to batch of rollouts - mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape) - mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0) - mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0) - mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0) - mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) - mb_masks = mb_dones[:, :-1] - mb_dones = mb_dones[:, 1:] - last_values = self.model.value(self.obs, self.states, self.dones).tolist() - #discount/bootstrap off value fn - for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)): - rewards = rewards.tolist() - dones = dones.tolist() - if dones[-1] == 0: - rewards = discount_with_dones(rewards+[value], dones+[0], self.gamma)[:-1] - else: - rewards = discount_with_dones(rewards, dones, self.gamma) - mb_rewards[n] = rewards - mb_rewards = mb_rewards.flatten() - mb_actions = mb_actions.flatten() - mb_values = mb_values.flatten() - mb_masks = mb_masks.flatten() - return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values -def learn(policy, env, seed, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100): set_global_seeds(seed) nenvs = env.num_envs - ob_space = env.observation_space - ac_space = env.action_space - model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, + policy = build_policy(env, network, **network_kwargs) + + model = Model(policy=policy, env=env, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule) + if load_path is not None: + model.load(load_path) runner = Runner(env, model, nsteps=nsteps, gamma=gamma) nbatch = nenvs*nsteps @@ -158,3 +175,4 @@ def learn(policy, env, seed, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5, e logger.dump_tabular() env.close() return model + diff --git a/baselines/a2c/policies.py b/baselines/a2c/policies.py deleted file mode 100644 index 6fbbb14..0000000 --- a/baselines/a2c/policies.py +++ /dev/null @@ -1,146 +0,0 @@ -import numpy as np -import tensorflow as tf -from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm -from baselines.common.distributions import make_pdtype -from baselines.common.input import observation_input - -def nature_cnn(unscaled_images, **conv_kwargs): - """ - CNN from Nature paper. - """ - scaled_images = tf.cast(unscaled_images, tf.float32) / 255. 
- activ = tf.nn.relu - h = activ(conv(scaled_images, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2), - **conv_kwargs)) - h2 = activ(conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), **conv_kwargs)) - h3 = activ(conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), **conv_kwargs)) - h3 = conv_to_fc(h3) - return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))) - -class LnLstmPolicy(object): - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False): - nenv = nbatch // nsteps - X, processed_x = observation_input(ob_space, nbatch) - M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) - S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states - self.pdtype = make_pdtype(ac_space) - with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(processed_x) - xs = batch_to_seq(h, nenv, nsteps) - ms = batch_to_seq(M, nenv, nsteps) - h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm) - h5 = seq_to_batch(h5) - vf = fc(h5, 'v', 1) - self.pd, self.pi = self.pdtype.pdfromlatent(h5) - - v0 = vf[:, 0] - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32) - - def step(ob, state, mask): - return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask}) - - def value(ob, state, mask): - return sess.run(v0, {X:ob, S:state, M:mask}) - - self.X = X - self.M = M - self.S = S - self.vf = vf - self.step = step - self.value = value - -class LstmPolicy(object): - - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False): - nenv = nbatch // nsteps - self.pdtype = make_pdtype(ac_space) - X, processed_x = observation_input(ob_space, nbatch) - - M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) - S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states - with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(X) - xs = batch_to_seq(h, nenv, nsteps) - ms = batch_to_seq(M, nenv, nsteps) - h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm) - h5 = seq_to_batch(h5) - vf = fc(h5, 'v', 1) - self.pd, self.pi = self.pdtype.pdfromlatent(h5) - - v0 = vf[:, 0] - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32) - - def step(ob, state, mask): - return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask}) - - def value(ob, state, mask): - return sess.run(v0, {X:ob, S:state, M:mask}) - - self.X = X - self.M = M - self.S = S - self.vf = vf - self.step = step - self.value = value - -class CnnPolicy(object): - - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613 - self.pdtype = make_pdtype(ac_space) - X, processed_x = observation_input(ob_space, nbatch) - with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(processed_x, **conv_kwargs) - vf = fc(h, 'v', 1)[:,0] - self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01) - - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = None - - def step(ob, *_args, **_kwargs): - a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob}) - return a, v, self.initial_state, neglogp - - def value(ob, *_args, **_kwargs): - return sess.run(vf, {X:ob}) - - self.X = X - self.vf = vf - self.step = step - self.value = value - -class MlpPolicy(object): - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613 - self.pdtype = make_pdtype(ac_space) - with tf.variable_scope("model", reuse=reuse): - X, processed_x = 
observation_input(ob_space, nbatch) - activ = tf.tanh - processed_x = tf.layers.flatten(processed_x) - pi_h1 = activ(fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2))) - pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2))) - vf_h1 = activ(fc(processed_x, 'vf_fc1', nh=64, init_scale=np.sqrt(2))) - vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2))) - vf = fc(vf_h2, 'vf', 1)[:,0] - - self.pd, self.pi = self.pdtype.pdfromlatent(pi_h2, init_scale=0.01) - - - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = None - - def step(ob, *_args, **_kwargs): - a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob}) - return a, v, self.initial_state, neglogp - - def value(ob, *_args, **_kwargs): - return sess.run(vf, {X:ob}) - - self.X = X - self.vf = vf - self.step = step - self.value = value diff --git a/baselines/a2c/run_atari.py b/baselines/a2c/run_atari.py deleted file mode 100644 index b09d9bb..0000000 --- a/baselines/a2c/run_atari.py +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env python3 - -from baselines import logger -from baselines.common.cmd_util import make_atari_env, atari_arg_parser -from baselines.common.vec_env.vec_frame_stack import VecFrameStack -from baselines.a2c.a2c import learn -from baselines.ppo2.policies import CnnPolicy, LstmPolicy, LnLstmPolicy - -def train(env_id, num_timesteps, seed, policy, lrschedule, num_env): - if policy == 'cnn': - policy_fn = CnnPolicy - elif policy == 'lstm': - policy_fn = LstmPolicy - elif policy == 'lnlstm': - policy_fn = LnLstmPolicy - env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4) - learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule) - env.close() - -def main(): - parser = atari_arg_parser() - parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm'], default='cnn') - parser.add_argument('--lrschedule', help='Learning rate schedule', choices=['constant', 'linear'], default='constant') - args = parser.parse_args() - logger.configure() - train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, - policy=args.policy, lrschedule=args.lrschedule, num_env=16) - -if __name__ == '__main__': - main() diff --git a/baselines/a2c/runner.py b/baselines/a2c/runner.py new file mode 100644 index 0000000..60b5e1d --- /dev/null +++ b/baselines/a2c/runner.py @@ -0,0 +1,60 @@ +import numpy as np +from baselines.a2c.utils import discount_with_dones +from baselines.common.runners import AbstractEnvRunner + +class Runner(AbstractEnvRunner): + + def __init__(self, env, model, nsteps=5, gamma=0.99): + super().__init__(env=env, model=model, nsteps=nsteps) + self.gamma = gamma + self.batch_action_shape = [x if x is not None else -1 for x in model.train_model.action.shape.as_list()] + self.ob_dtype = model.train_model.X.dtype.as_numpy_dtype + + def run(self): + mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[] + mb_states = self.states + for n in range(self.nsteps): + actions, values, states, _ = self.model.step(self.obs, S=self.states, M=self.dones) + mb_obs.append(np.copy(self.obs)) + mb_actions.append(actions) + mb_values.append(values) + mb_dones.append(self.dones) + obs, rewards, dones, _ = self.env.step(actions) + self.states = states + self.dones = dones + for n, done in enumerate(dones): + if done: + self.obs[n] = self.obs[n]*0 + self.obs = obs + mb_rewards.append(rewards) + mb_dones.append(self.dones) + #batch of steps to batch of rollouts + + mb_obs = np.asarray(mb_obs, 
dtype=self.ob_dtype).swapaxes(1, 0).reshape(self.batch_ob_shape) + mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0) + mb_actions = np.asarray(mb_actions, dtype=self.model.train_model.action.dtype.name).swapaxes(1, 0) + mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0) + mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) + mb_masks = mb_dones[:, :-1] + mb_dones = mb_dones[:, 1:] + + + if self.gamma > 0.0: + #discount/bootstrap off value fn + last_values = self.model.value(self.obs, S=self.states, M=self.dones).tolist() + for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)): + rewards = rewards.tolist() + dones = dones.tolist() + if dones[-1] == 0: + rewards = discount_with_dones(rewards+[value], dones+[0], self.gamma)[:-1] + else: + rewards = discount_with_dones(rewards, dones, self.gamma) + + mb_rewards[n] = rewards + + mb_actions = mb_actions.reshape(self.batch_action_shape) + + mb_rewards = mb_rewards.flatten() + mb_values = mb_values.flatten() + mb_masks = mb_masks.flatten() + return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values diff --git a/baselines/a2c/utils.py b/baselines/a2c/utils.py index a7610eb..f38085b 100644 --- a/baselines/a2c/utils.py +++ b/baselines/a2c/utils.py @@ -1,8 +1,6 @@ import os -import gym import numpy as np import tensorflow as tf -from gym import spaces from collections import deque def sample(logits): @@ -10,18 +8,15 @@ def sample(logits): return tf.argmax(logits - tf.log(-tf.log(noise)), 1) def cat_entropy(logits): - a0 = logits - tf.reduce_max(logits, 1, keep_dims=True) + a0 = logits - tf.reduce_max(logits, 1, keepdims=True) ea0 = tf.exp(a0) - z0 = tf.reduce_sum(ea0, 1, keep_dims=True) + z0 = tf.reduce_sum(ea0, 1, keepdims=True) p0 = ea0 / z0 return tf.reduce_sum(p0 * (tf.log(z0) - a0), 1) def cat_entropy_softmax(p0): return - tf.reduce_sum(p0 * tf.log(p0 + 1e-6), axis = 1) -def mse(pred, target): - return tf.square(pred-target)/2. 
- def ortho_init(scale=1.0): def _ortho_init(shape, dtype, partition_info=None): #lasagne ortho init for tf @@ -58,7 +53,7 @@ def conv(x, scope, *, nf, rf, stride, pad='VALID', init_scale=1.0, data_format=' b = tf.get_variable("b", bias_var_shape, initializer=tf.constant_initializer(0.0)) if not one_dim_bias and data_format == 'NHWC': b = tf.reshape(b, bshape) - return b + tf.nn.conv2d(x, w, strides=strides, padding=pad, data_format=data_format) + return tf.nn.conv2d(x, w, strides=strides, padding=pad, data_format=data_format) + b def fc(x, scope, nh, *, init_scale=1.0, init_bias=0.0): with tf.variable_scope(scope): @@ -85,7 +80,6 @@ def seq_to_batch(h, flat = False): def lstm(xs, ms, s, scope, nh, init_scale=1.0): nbatch, nin = [v.value for v in xs[0].get_shape()] - nsteps = len(xs) with tf.variable_scope(scope): wx = tf.get_variable("wx", [nin, nh*4], initializer=ortho_init(init_scale)) wh = tf.get_variable("wh", [nh, nh*4], initializer=ortho_init(init_scale)) @@ -115,7 +109,6 @@ def _ln(x, g, b, e=1e-5, axes=[1]): def lnlstm(xs, ms, s, scope, nh, init_scale=1.0): nbatch, nin = [v.value for v in xs[0].get_shape()] - nsteps = len(xs) with tf.variable_scope(scope): wx = tf.get_variable("wx", [nin, nh*4], initializer=ortho_init(init_scale)) gx = tf.get_variable("gx", [nh*4], initializer=tf.constant_initializer(1.0)) @@ -160,8 +153,7 @@ def discount_with_dones(rewards, dones, gamma): return discounted[::-1] def find_trainable_variables(key): - with tf.variable_scope(key): - return tf.trainable_variables() + return tf.trainable_variables(key) def make_path(f): return os.makedirs(f, exist_ok=True) diff --git a/baselines/acer/acer_simple.py b/baselines/acer/acer.py similarity index 64% rename from baselines/acer/acer_simple.py rename to baselines/acer/acer.py index bed486a..1bb8129 100644 --- a/baselines/acer/acer_simple.py +++ b/baselines/acer/acer.py @@ -1,20 +1,20 @@ import time -import joblib +import functools import numpy as np import tensorflow as tf from baselines import logger from baselines.common import set_global_seeds -from baselines.common.runners import AbstractEnvRunner +from baselines.common.policies import build_policy +from baselines.common.tf_util import get_session, save_variables from baselines.a2c.utils import batch_to_seq, seq_to_batch -from baselines.a2c.utils import Scheduler, make_path, find_trainable_variables from baselines.a2c.utils import cat_entropy_softmax +from baselines.a2c.utils import Scheduler, find_trainable_variables from baselines.a2c.utils import EpisodeStats from baselines.a2c.utils import get_by_index, check_shape, avg_norm, gradient_add, q_explained_variance from baselines.acer.buffer import Buffer - -import os.path as osp +from baselines.acer.runner import Runner # remove last step def strip(var, nenvs, nsteps, flat = False): @@ -59,10 +59,8 @@ class Model(object): ent_coef, q_coef, gamma, max_grad_norm, lr, rprop_alpha, rprop_epsilon, total_timesteps, lrschedule, c, trust_region, alpha, delta): - config = tf.ConfigProto(allow_soft_placement=True, - intra_op_parallelism_threads=num_procs, - inter_op_parallelism_threads=num_procs) - sess = tf.Session(config=config) + + sess = get_session() nact = ac_space.n nbatch = nenvs * nsteps @@ -72,11 +70,16 @@ class Model(object): MU = tf.placeholder(tf.float32, [nbatch, nact]) # mu's LR = tf.placeholder(tf.float32, []) eps = 1e-6 + + step_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs,) + ob_space.shape[:-1] + (ob_space.shape[-1] * nstack,)) + train_ob_placeholder = 
tf.placeholder(dtype=ob_space.dtype, shape=(nenvs*(nsteps+1),) + ob_space.shape[:-1] + (ob_space.shape[-1] * nstack,)) + with tf.variable_scope('acer_model', reuse=tf.AUTO_REUSE): - step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) - train_model = policy(sess, ob_space, ac_space, nenvs, nsteps + 1, nstack, reuse=True) + step_model = policy(observ_placeholder=step_ob_placeholder, sess=sess) + train_model = policy(observ_placeholder=train_ob_placeholder, sess=sess) - params = find_trainable_variables("model") + + params = find_trainable_variables("acer_model") print("Params {}".format(len(params))) for var in params: print(var) @@ -90,14 +93,20 @@ class Model(object): print(v.name) return v - with tf.variable_scope("", custom_getter=custom_getter, reuse=True): - polyak_model = policy(sess, ob_space, ac_space, nenvs, nsteps + 1, nstack, reuse=True) + with tf.variable_scope("acer_model", custom_getter=custom_getter, reuse=True): + polyak_model = policy(observ_placeholder=train_ob_placeholder, sess=sess) # Notation: (var) = batch variable, (var)s = sequence variable, (var)_i = variable indexed by action at step i - v = tf.reduce_sum(train_model.pi * train_model.q, axis = -1) # shape is [nenvs * (nsteps + 1)] + + # action probability distributions according to train_model, polyak_model and step_model + # policy.pi is probability distribution parameters; to obtain a distribution that sums to 1, take the softmax + train_model_p = tf.nn.softmax(train_model.pi) + polyak_model_p = tf.nn.softmax(polyak_model.pi) + step_model_p = tf.nn.softmax(step_model.pi) + v = tf.reduce_sum(train_model_p * train_model.q, axis = -1) # shape is [nenvs * (nsteps + 1)] # strip off last step - f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [train_model.pi, polyak_model.pi, train_model.q]) + f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [train_model_p, polyak_model_p, train_model.q]) # Get pi and q values for actions taken f_i = get_by_index(f, A) q_i = get_by_index(q, A) @@ -110,7 +119,8 @@ class Model(object): qret = q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma) # Calculate losses - # Entropy + # Entropy + # entropy = tf.reduce_mean(strip(train_model.pd.entropy(), nenvs, nsteps)) entropy = tf.reduce_mean(cat_entropy_softmax(f)) # Policy gradient loss, with truncated importance sampling & bias correction @@ -192,80 +202,29 @@ class Model(object): def train(obs, actions, rewards, dones, mus, states, masks, steps): cur_lr = lr.value_steps(steps) td_map = {train_model.X: obs, polyak_model.X: obs, A: actions, R: rewards, D: dones, MU: mus, LR: cur_lr} - if states != []: + if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks td_map[polyak_model.S] = states td_map[polyak_model.M] = masks + return names_ops, sess.run(run_ops, td_map)[1:] # strip off _train - def save(save_path): - ps = sess.run(params) - make_path(osp.dirname(save_path)) - joblib.dump(ps, save_path) + def _step(observation, **kwargs): + return step_model._evaluate([step_model.action, step_model_p, step_model.state], observation, **kwargs) + + self.train = train - self.save = save + self.save = functools.partial(save_variables, sess=sess, variables=params) self.train_model = train_model self.step_model = step_model - self.step = step_model.step + self._step = _step + self.step = self.step_model.step + self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=sess) -class Runner(AbstractEnvRunner): - def __init__(self, env, model, nsteps,
nstack): - super().__init__(env=env, model=model, nsteps=nsteps) - self.nstack = nstack - nh, nw, nc = env.observation_space.shape - self.nc = nc # nc = 1 for atari, but just in case - self.nenv = nenv = env.num_envs - self.nact = env.action_space.n - self.nbatch = nenv * nsteps - self.batch_ob_shape = (nenv*(nsteps+1), nh, nw, nc*nstack) - self.obs = np.zeros((nenv, nh, nw, nc * nstack), dtype=np.uint8) - obs = env.reset() - self.update_obs(obs) - - def update_obs(self, obs, dones=None): - if dones is not None: - self.obs *= (1 - dones.astype(np.uint8))[:, None, None, None] - self.obs = np.roll(self.obs, shift=-self.nc, axis=3) - self.obs[:, :, :, -self.nc:] = obs[:, :, :, :] - - def run(self): - enc_obs = np.split(self.obs, self.nstack, axis=3) # so now list of obs steps - mb_obs, mb_actions, mb_mus, mb_dones, mb_rewards = [], [], [], [], [] - for _ in range(self.nsteps): - actions, mus, states = self.model.step(self.obs, state=self.states, mask=self.dones) - mb_obs.append(np.copy(self.obs)) - mb_actions.append(actions) - mb_mus.append(mus) - mb_dones.append(self.dones) - obs, rewards, dones, _ = self.env.step(actions) - # states information for statefull models like LSTM - self.states = states - self.dones = dones - self.update_obs(obs, dones) - mb_rewards.append(rewards) - enc_obs.append(obs) - mb_obs.append(np.copy(self.obs)) - mb_dones.append(self.dones) - - enc_obs = np.asarray(enc_obs, dtype=np.uint8).swapaxes(1, 0) - mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0) - mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0) - mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0) - mb_mus = np.asarray(mb_mus, dtype=np.float32).swapaxes(1, 0) - - mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) - - mb_masks = mb_dones # Used for statefull models like LSTM's to mask state when done - mb_dones = mb_dones[:, 1:] # Used for calculating returns. The dones array is now aligned with rewards - - # shapes are now [nenv, nsteps, []] - # When pulling from buffer, arrays will now be reshaped in place, preventing a deep copy. - - return enc_obs, mb_obs, mb_actions, mb_rewards, mb_mus, mb_dones, mb_masks class Acer(): def __init__(self, runner, model, buffer, log_interval): @@ -311,19 +270,84 @@ class Acer(): logger.dump_tabular() -def learn(policy, env, seed, nsteps=20, nstack=4, total_timesteps=int(80e6), q_coef=0.5, ent_coef=0.01, +def learn(network, env, seed=None, nsteps=20, nstack=4, total_timesteps=int(80e6), q_coef=0.5, ent_coef=0.01, max_grad_norm=10, lr=7e-4, lrschedule='linear', rprop_epsilon=1e-5, rprop_alpha=0.99, gamma=0.99, log_interval=100, buffer_size=50000, replay_ratio=4, replay_start=10000, c=10.0, - trust_region=True, alpha=0.99, delta=1): + trust_region=True, alpha=0.99, delta=1, load_path=None, **network_kwargs): + + ''' + Main entrypoint for ACER (Actor-Critic with Experience Replay) algorithm (https://arxiv.org/pdf/1611.01224.pdf) + Train an agent with given network architecture on a given environment using ACER. + + Parameters: + ---------- + + network: policy network architecture. 
Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) + specifying the standard network architecture, or a function that takes a tensorflow tensor as input and returns + a tuple (output_tensor, extra_feed) where output_tensor is the last network layer output, extra_feed is None for feed-forward + neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. + See baselines.common/policies.py/lstm for more details on using recurrent nets in policies + + env: environment. Needs to be vectorized for parallel environment simulation. + The environments produced by gym.make can be wrapped using the baselines.common.vec_env.DummyVecEnv class. + + nsteps: int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where + nenv is number of environment copies simulated in parallel) (default: 20) + + nstack: int, size of the frame stack, i.e. number of the frames passed to the step model. Frames are stacked along the channel dimension + (last image dimension) (default: 4) + + total_timesteps: int, number of timesteps (i.e. number of actions taken in the environment) (default: 80M) + + q_coef: float, value function loss coefficient in the optimization objective (analog of vf_coef for other actor-critic methods) (default: 0.5) + + ent_coef: float, policy entropy coefficient in the optimization objective (default: 0.01) + + max_grad_norm: float, gradient norm clipping coefficient. If set to None, no clipping (default: 10) + + lr: float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4) + + lrschedule: schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and + returns fraction of the learning rate (specified as lr) as output + + rprop_epsilon: float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5) + + rprop_alpha: float, RMSProp decay parameter (default: 0.99) + + gamma: float, reward discounting factor (default: 0.99) + + log_interval: int, number of updates between logging events (default: 100) + + buffer_size: int, size of the replay buffer (default: 50k) + + replay_ratio: int, how many (on average) batches of data to sample from the replay buffer per batch taken from the environment (default: 4) + + replay_start: int, sampling from the replay buffer does not start until the replay buffer has at least that many samples (default: 10k) + + c: float, importance weight clipping factor (default: 10) + + trust_region: bool, whether or not the algorithm estimates the KL divergence between the old and updated policy and uses it to determine the step size (default: True) + + delta: float, max KL divergence between the old policy and updated policy (default: 1) + + alpha: float, momentum factor in the Polyak (exponential moving average) averaging of the model parameters (default: 0.99) + + load_path: str, path to load the model from (default: None) + + **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network + For instance, 'mlp' network architecture has arguments num_hidden and num_layers.
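+    Example (illustrative sketch only; assumes Atari-style image observations from
+    make_atari_env and a short run - ACER requires a discrete action space):
+
+        from baselines.common.cmd_util import make_atari_env
+        from baselines.acer.acer import learn
+
+        # 4 parallel copies of Pong; ACER stacks nstack frames internally
+        env = make_atari_env('PongNoFrameskip-v4', num_env=4, seed=0)
+        model = learn(network='cnn', env=env, total_timesteps=100000)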
+ + ''' + print("Running Acer Simple") print(locals()) - tf.reset_default_graph() set_global_seeds(seed) + policy = build_policy(env, network, estimate_q=True, **network_kwargs) nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space - num_procs = len(env.remotes) # HACK + num_procs = len(env.remotes) if hasattr(env, 'remotes') else 1# HACK model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, nstack=nstack, num_procs=num_procs, ent_coef=ent_coef, q_coef=q_coef, gamma=gamma, max_grad_norm=max_grad_norm, lr=lr, rprop_alpha=rprop_alpha, rprop_epsilon=rprop_epsilon, @@ -338,6 +362,7 @@ def learn(policy, env, seed, nsteps=20, nstack=4, total_timesteps=int(80e6), q_c nbatch = nenvs*nsteps acer = Acer(runner, model, buffer, log_interval) acer.tstart = time.time() + for acer.steps in range(0, total_timesteps, nbatch): #nbatch samples, 1 on_policy call and multiple off-policy calls acer.call(on_policy=True) if replay_ratio > 0 and buffer.has_atleast(replay_start): @@ -346,3 +371,4 @@ def learn(policy, env, seed, nsteps=20, nstack=4, total_timesteps=int(80e6), q_c acer.call(on_policy=False) # no simulation steps in this env.close() + return model diff --git a/baselines/acer/defaults.py b/baselines/acer/defaults.py new file mode 100644 index 0000000..0334bae --- /dev/null +++ b/baselines/acer/defaults.py @@ -0,0 +1,4 @@ +def atari(): + return dict( + lrschedule='constant' + ) diff --git a/baselines/acer/policies.py b/baselines/acer/policies.py index 627c400..6dad6f3 100644 --- a/baselines/acer/policies.py +++ b/baselines/acer/policies.py @@ -1,6 +1,6 @@ import numpy as np import tensorflow as tf -from baselines.ppo2.policies import nature_cnn +from baselines.common.policies import nature_cnn from baselines.a2c.utils import fc, batch_to_seq, seq_to_batch, lstm, sample @@ -18,11 +18,13 @@ class AcerCnnPolicy(object): pi = tf.nn.softmax(pi_logits) q = fc(h, 'q', nact) - a = sample(pi_logits) # could change this to use self.pi instead + a = sample(tf.nn.softmax(pi_logits)) # could change this to use self.pi instead self.initial_state = [] # not stateful self.X = X self.pi = pi # actual policy params now + self.pi_logits = pi_logits self.q = q + self.vf = q def step(ob, *args, **kwargs): # returns actions, mus, states diff --git a/baselines/acer/run_atari.py b/baselines/acer/run_atari.py deleted file mode 100644 index cce979e..0000000 --- a/baselines/acer/run_atari.py +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env python3 -from baselines import logger -from baselines.acer.acer_simple import learn -from baselines.acer.policies import AcerCnnPolicy, AcerLstmPolicy -from baselines.common.cmd_util import make_atari_env, atari_arg_parser - -def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu): - env = make_atari_env(env_id, num_cpu, seed) - if policy == 'cnn': - policy_fn = AcerCnnPolicy - elif policy == 'lstm': - policy_fn = AcerLstmPolicy - else: - print("Policy {} not implemented".format(policy)) - return - learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule) - env.close() - -def main(): - parser = atari_arg_parser() - parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm'], default='cnn') - parser.add_argument('--lrschedule', help='Learning rate schedule', choices=['constant', 'linear'], default='constant') - parser.add_argument('--logdir', help ='Directory for logging') - args = parser.parse_args() - logger.configure(args.logdir) - train(args.env, 
num_timesteps=args.num_timesteps, seed=args.seed, - policy=args.policy, lrschedule=args.lrschedule, num_cpu=16) - -if __name__ == '__main__': - main() diff --git a/baselines/acer/runner.py b/baselines/acer/runner.py new file mode 100644 index 0000000..6bc1b4c --- /dev/null +++ b/baselines/acer/runner.py @@ -0,0 +1,60 @@ +import numpy as np +from baselines.common.runners import AbstractEnvRunner + +class Runner(AbstractEnvRunner): + + def __init__(self, env, model, nsteps, nstack): + super().__init__(env=env, model=model, nsteps=nsteps) + self.nstack = nstack + nh, nw, nc = env.observation_space.shape + self.nc = nc # nc = 1 for atari, but just in case + self.nact = env.action_space.n + nenv = self.nenv + self.nbatch = nenv * nsteps + self.batch_ob_shape = (nenv*(nsteps+1), nh, nw, nc*nstack) + self.obs = np.zeros((nenv, nh, nw, nc * nstack), dtype=np.uint8) + obs = env.reset() + self.update_obs(obs) + + def update_obs(self, obs, dones=None): + #self.obs = obs + if dones is not None: + self.obs *= (1 - dones.astype(np.uint8))[:, None, None, None] + self.obs = np.roll(self.obs, shift=-self.nc, axis=3) + self.obs[:, :, :, -self.nc:] = obs[:, :, :, :] + + def run(self): + enc_obs = np.split(self.obs, self.nstack, axis=3) # so now list of obs steps + mb_obs, mb_actions, mb_mus, mb_dones, mb_rewards = [], [], [], [], [] + for _ in range(self.nsteps): + actions, mus, states = self.model._step(self.obs, S=self.states, M=self.dones) + mb_obs.append(np.copy(self.obs)) + mb_actions.append(actions) + mb_mus.append(mus) + mb_dones.append(self.dones) + obs, rewards, dones, _ = self.env.step(actions) + # state information for stateful models like LSTM + self.states = states + self.dones = dones + self.update_obs(obs, dones) + mb_rewards.append(rewards) + enc_obs.append(obs) + mb_obs.append(np.copy(self.obs)) + mb_dones.append(self.dones) + + enc_obs = np.asarray(enc_obs, dtype=np.uint8).swapaxes(1, 0) + mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0) + mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0) + mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0) + mb_mus = np.asarray(mb_mus, dtype=np.float32).swapaxes(1, 0) + + mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) + + mb_masks = mb_dones # Used for stateful models like LSTMs to mask state when done + mb_dones = mb_dones[:, 1:] # Used for calculating returns. The dones array is now aligned with rewards + + # shapes are now [nenv, nsteps, []] + # When pulling from the buffer, arrays will now be reshaped in place, preventing a deep copy.
+ + return enc_obs, mb_obs, mb_actions, mb_rewards, mb_mus, mb_dones, mb_masks + diff --git a/baselines/acktr/acktr.py b/baselines/acktr/acktr.py new file mode 100644 index 0000000..97090b4 --- /dev/null +++ b/baselines/acktr/acktr.py @@ -0,0 +1 @@ +from baselines.acktr.acktr_disc import * diff --git a/baselines/acktr/acktr_disc.py b/baselines/acktr/acktr_disc.py index a8b77b6..cfa028d 100644 --- a/baselines/acktr/acktr_disc.py +++ b/baselines/acktr/acktr_disc.py @@ -1,16 +1,17 @@ import os.path as osp import time -import joblib +import functools import numpy as np import tensorflow as tf from baselines import logger from baselines.common import set_global_seeds, explained_variance +from baselines.common.policies import build_policy +from baselines.common.tf_util import get_session, save_variables, load_variables -from baselines.a2c.a2c import Runner +from baselines.a2c.runner import Runner from baselines.a2c.utils import discount_with_dones from baselines.a2c.utils import Scheduler, find_trainable_variables -from baselines.a2c.utils import cat_entropy, mse from baselines.acktr import kfac @@ -19,11 +20,8 @@ class Model(object): def __init__(self, policy, ob_space, ac_space, nenvs,total_timesteps, nprocs=32, nsteps=20, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, kfac_clip=0.001, lrschedule='linear'): - config = tf.ConfigProto(allow_soft_placement=True, - intra_op_parallelism_threads=nprocs, - inter_op_parallelism_threads=nprocs) - config.gpu_options.allow_growth = True - self.sess = sess = tf.Session(config=config) + + self.sess = sess = get_session() nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) @@ -32,27 +30,28 @@ class Model(object): PG_LR = tf.placeholder(tf.float32, []) VF_LR = tf.placeholder(tf.float32, []) - self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) - self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True) + with tf.variable_scope('acktr_model', reuse=tf.AUTO_REUSE): + self.model = step_model = policy(nenvs, 1, sess=sess) + self.model2 = train_model = policy(nenvs*nsteps, nsteps, sess=sess) - logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A) + neglogpac = train_model.pd.neglogp(A) self.logits = logits = train_model.pi ##training loss - pg_loss = tf.reduce_mean(ADV*logpac) - entropy = tf.reduce_mean(cat_entropy(train_model.pi)) + pg_loss = tf.reduce_mean(ADV*neglogpac) + entropy = tf.reduce_mean(train_model.pd.entropy()) pg_loss = pg_loss - ent_coef * entropy - vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) + vf_loss = tf.losses.mean_squared_error(tf.squeeze(train_model.vf), R) train_loss = pg_loss + vf_coef * vf_loss ##Fisher loss construction - self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac) + self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(neglogpac) sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf)) self.vf_fisher = vf_fisher_loss = - vf_fisher_coef*tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2)) self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss - self.params=params = find_trainable_variables("model") + self.params=params = find_trainable_variables("acktr_model") self.grads_check = grads = tf.gradients(train_loss,params) @@ -82,22 +81,10 @@ class Model(object): ) return policy_loss, value_loss, policy_entropy - def save(save_path): - ps = sess.run(params) - joblib.dump(ps, save_path) - - def 
load(load_path): - loaded_params = joblib.load(load_path) - restores = [] - for p, loaded_p in zip(params, loaded_params): - restores.append(p.assign(loaded_p)) - sess.run(restores) - - self.train = train - self.save = save - self.load = load + self.save = functools.partial(save_variables, sess=sess) + self.load = functools.partial(load_variables, sess=sess) self.train_model = train_model self.step_model = step_model self.step = step_model.step @@ -105,12 +92,17 @@ class Model(object): self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=sess) -def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20, +def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, - kfac_clip=0.001, save_interval=None, lrschedule='linear'): - tf.reset_default_graph() + kfac_clip=0.001, save_interval=None, lrschedule='linear', load_path=None, **network_kwargs): set_global_seeds(seed) + + if network == 'cnn': + network_kwargs['one_dim_bias'] = True + + policy = build_policy(env, network, **network_kwargs) + nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space @@ -123,6 +115,9 @@ def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh: fh.write(cloudpickle.dumps(make_model)) model = make_model() + + if load_path is not None: + model.load(load_path) runner = Runner(env, model, nsteps=nsteps, gamma=gamma) nbatch = nenvs*nsteps @@ -153,3 +148,4 @@ def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval coord.request_stop() coord.join(enqueue_threads) env.close() + return model diff --git a/baselines/acktr/run_atari.py b/baselines/acktr/run_atari.py index 6e398ce..50e1580 100644 --- a/baselines/acktr/run_atari.py +++ b/baselines/acktr/run_atari.py @@ -6,11 +6,11 @@ from baselines import logger from baselines.acktr.acktr_disc import learn from baselines.common.cmd_util import make_atari_env, atari_arg_parser from baselines.common.vec_env.vec_frame_stack import VecFrameStack -from baselines.ppo2.policies import CnnPolicy +from baselines.common.policies import cnn def train(env_id, num_timesteps, seed, num_cpu): env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4) - policy_fn = partial(CnnPolicy, one_dim_bias=True) + policy_fn = cnn(env=env, one_dim_bias=True) learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu) env.close() diff --git a/baselines/bench/benchmarks.py b/baselines/bench/benchmarks.py index a5a35f8..e9328b2 100644 --- a/baselines/bench/benchmarks.py +++ b/baselines/bench/benchmarks.py @@ -59,7 +59,7 @@ register_benchmark({ register_benchmark({ 'name': 'Atari10M', 'description': '7 Atari games from Mnih et al. 
(2013), with pixel observations, 10M timesteps', - 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atari7] + 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 6, 'num_timesteps': int(10e6)} for _game in _atari7] }) register_benchmark({ @@ -84,8 +84,9 @@ _mujocosmall = [ register_benchmark({ 'name': 'Mujoco1M', 'description': 'Some small 2D MuJoCo tasks, run for 1M timesteps', - 'tasks': [{'env_id': _envid, 'trials': 3, 'num_timesteps': int(1e6)} for _envid in _mujocosmall] + 'tasks': [{'env_id': _envid, 'trials': 6, 'num_timesteps': int(1e6)} for _envid in _mujocosmall] }) + register_benchmark({ 'name': 'MujocoWalkers', 'description': 'MuJoCo forward walkers, run for 8M, humanoid 100M', diff --git a/baselines/bench/monitor.py b/baselines/bench/monitor.py index 0da1b4f..bb0c282 100644 --- a/baselines/bench/monitor.py +++ b/baselines/bench/monitor.py @@ -112,6 +112,8 @@ def load_results(dir): with open(fname, 'rt') as fh: if fname.endswith('csv'): firstline = fh.readline() + if not firstline: + continue assert firstline[0] == '#' header = json.loads(firstline[1:]) df = pandas.read_csv(fh, index_col=None) @@ -158,4 +160,4 @@ def test_monitor(): last_logline = pandas.read_csv(f, index_col=None) assert set(last_logline.keys()) == {'l', 't', 'r'}, "Incorrect keys in monitor logline" f.close() - os.remove(mon_file) \ No newline at end of file + os.remove(mon_file) diff --git a/baselines/common/atari_wrappers.py b/baselines/common/atari_wrappers.py index 2aefad7..4598e23 100644 --- a/baselines/common/atari_wrappers.py +++ b/baselines/common/atari_wrappers.py @@ -1,4 +1,6 @@ import numpy as np +import os +os.environ.setdefault('PATH', '') from collections import deque import gym from gym import spaces diff --git a/baselines/common/cmd_util.py b/baselines/common/cmd_util.py index 5707695..681a80c 100644 --- a/baselines/common/cmd_util.py +++ b/baselines/common/cmd_util.py @@ -3,7 +3,11 @@ Helpers for scripts like run_atari.py. """ import os -from mpi4py import MPI +try: + from mpi4py import MPI +except ImportError: + MPI = None + import gym from gym.wrappers import FlattenDictWrapper from baselines import logger @@ -17,25 +21,32 @@ def make_atari_env(env_id, num_env, seed, wrapper_kwargs=None, start_index=0): Create a wrapped, monitored SubprocVecEnv for Atari. """ if wrapper_kwargs is None: wrapper_kwargs = {} + mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0 def make_env(rank): # pylint: disable=C0111 def _thunk(): env = make_atari(env_id) - env.seed(seed + rank) - env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) + env.seed(seed + 10000*mpi_rank + rank if seed is not None else None) + env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(mpi_rank) + '.' + str(rank))) return wrap_deepmind(env, **wrapper_kwargs) return _thunk set_global_seeds(seed) return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)]) -def make_mujoco_env(env_id, seed): +def make_mujoco_env(env_id, seed, reward_scale=1.0): """ Create a wrapped, monitored gym.Env for MuJoCo. 
""" rank = MPI.COMM_WORLD.Get_rank() - set_global_seeds(seed + 10000 * rank) + myseed = seed + 1000 * rank if seed is not None else None + set_global_seeds(myseed) env = gym.make(env_id) - env = Monitor(env, os.path.join(logger.get_dir(), str(rank))) + env = Monitor(env, os.path.join(logger.get_dir(), str(rank)), allow_early_resets=True) env.seed(seed) + + if reward_scale != 1.0: + from baselines.common.retro_wrappers import RewardScaler + env = RewardScaler(env, reward_scale) + return env def make_robotics_env(env_id, seed, rank=0): @@ -62,20 +73,27 @@ def atari_arg_parser(): """ Create an argparse.ArgumentParser for run_atari.py. """ - parser = arg_parser() - parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4') - parser.add_argument('--seed', help='RNG seed', type=int, default=0) - parser.add_argument('--num-timesteps', type=int, default=int(10e6)) - return parser + print('Obsolete - use common_arg_parser instead') + return common_arg_parser() def mujoco_arg_parser(): + print('Obsolete - use common_arg_parser instead') + return common_arg_parser() + +def common_arg_parser(): """ Create an argparse.ArgumentParser for run_mujoco.py. """ parser = arg_parser() parser.add_argument('--env', help='environment ID', type=str, default='Reacher-v2') - parser.add_argument('--seed', help='RNG seed', type=int, default=0) - parser.add_argument('--num-timesteps', type=int, default=int(1e6)) + parser.add_argument('--seed', help='RNG seed', type=int, default=None) + parser.add_argument('--alg', help='Algorithm', type=str, default='ppo2') + parser.add_argument('--num_timesteps', type=float, default=1e6), + parser.add_argument('--network', help='network type (mlp, cnn, lstm, cnn_lstm, conv_only)', default=None) + parser.add_argument('--gamestate', help='game state to load (so far only used in retro games)', default=None) + parser.add_argument('--num_env', help='Number of environment copies being run in parallel. When not specified, set to number of cpus for Atari, and to 1 for Mujoco', default=None, type=int) + parser.add_argument('--reward_scale', help='Reward scale factor. 
Default: 1.0', default=1.0, type=float) + parser.add_argument('--save_path', help='Path to save trained model to', default=None, type=str) parser.add_argument('--play', default=False, action='store_true') return parser @@ -85,6 +103,24 @@ def robotics_arg_parser(): """ parser = arg_parser() parser.add_argument('--env', help='environment ID', type=str, default='FetchReach-v0') - parser.add_argument('--seed', help='RNG seed', type=int, default=0) + parser.add_argument('--seed', help='RNG seed', type=int, default=None) parser.add_argument('--num-timesteps', type=int, default=int(1e6)) return parser + + +def parse_unknown_args(args): + """ + Parse arguments not consumed by arg parser into a dicitonary + """ + retval = {} + for arg in args: + assert arg.startswith('--') + assert '=' in arg, 'cannot parse arg {}'.format(arg) + key = arg.split('=')[0][2:] + value = arg.split('=')[1] + retval[key] = value + + return retval + + + diff --git a/baselines/common/distributions.py b/baselines/common/distributions.py index 8a57c37..29f3632 100644 --- a/baselines/common/distributions.py +++ b/baselines/common/distributions.py @@ -85,7 +85,7 @@ class DiagGaussianPdType(PdType): def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0): mean = fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias) - logstd = tf.get_variable(name='logstd', shape=[1, self.size], initializer=tf.zeros_initializer()) + logstd = tf.get_variable(name='pi/logstd', shape=[1, self.size], initializer=tf.zeros_initializer()) pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) return self.pdfromflat(pdparam), mean @@ -143,26 +143,26 @@ class CategoricalPd(Pd): # Note: we can't use sparse_softmax_cross_entropy_with_logits because # the implementation does not allow second-order derivatives... 
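Editor's note: the comment above is the reason for switching to dense cross-entropy with one-hot labels; the sparse op does not support the second-order gradients that TRPO/ACKTR need. A minimal equivalence sketch, assuming TF 1.x; the logits and action indices are arbitrary example values:

    import numpy as np
    import tensorflow as tf

    logits = tf.constant(np.random.randn(4, 6).astype(np.float32))
    actions = tf.constant([1, 3, 0, 5])
    one_hot = tf.one_hot(actions, 6)
    dense = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=one_hot)
    sparse = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=actions)
    with tf.Session() as sess:
        d, s = sess.run([dense, sparse])
        np.testing.assert_allclose(d, s, rtol=1e-5, atol=1e-6)   # same negative log-likelihood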
one_hot_actions = tf.one_hot(x, self.logits.get_shape().as_list()[-1]) - return tf.nn.softmax_cross_entropy_with_logits( + return tf.nn.softmax_cross_entropy_with_logits_v2( logits=self.logits, labels=one_hot_actions) def kl(self, other): - a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keep_dims=True) - a1 = other.logits - tf.reduce_max(other.logits, axis=-1, keep_dims=True) + a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keepdims=True) + a1 = other.logits - tf.reduce_max(other.logits, axis=-1, keepdims=True) ea0 = tf.exp(a0) ea1 = tf.exp(a1) - z0 = tf.reduce_sum(ea0, axis=-1, keep_dims=True) - z1 = tf.reduce_sum(ea1, axis=-1, keep_dims=True) + z0 = tf.reduce_sum(ea0, axis=-1, keepdims=True) + z1 = tf.reduce_sum(ea1, axis=-1, keepdims=True) p0 = ea0 / z0 return tf.reduce_sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=-1) def entropy(self): - a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keep_dims=True) + a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keepdims=True) ea0 = tf.exp(a0) - z0 = tf.reduce_sum(ea0, axis=-1, keep_dims=True) + z0 = tf.reduce_sum(ea0, axis=-1, keepdims=True) p0 = ea0 / z0 return tf.reduce_sum(p0 * (tf.log(z0) - a0), axis=-1) def sample(self): - u = tf.random_uniform(tf.shape(self.logits)) + u = tf.random_uniform(tf.shape(self.logits), dtype=self.logits.dtype) return tf.argmax(self.logits - tf.log(-tf.log(u)), axis=-1) @classmethod def fromflat(cls, flat): diff --git a/baselines/common/input.py b/baselines/common/input.py index 7fbf9fc..dff9480 100644 --- a/baselines/common/input.py +++ b/baselines/common/input.py @@ -1,30 +1,56 @@ import tensorflow as tf from gym.spaces import Discrete, Box -def observation_input(ob_space, batch_size=None, name='Ob'): - ''' - Build observation input with encoding depending on the - observation space type - Params: +def observation_placeholder(ob_space, batch_size=None, name='Ob'): + ''' + Create placeholder to feed observations into of the size appropriate to the observation space - ob_space: observation space (should be one of gym.spaces) - batch_size: batch size for input (default is None, so that resulting input placeholder can take tensors with any batch size) - name: tensorflow variable name for input placeholder + Parameters: + ---------- - returns: tuple (input_placeholder, processed_input_tensor) + ob_space: gym.Space observation space + + batch_size: int size of the batch to be fed into input. Can be left None in most cases. + + name: str name of the placeholder + + Returns: + ------- + + tensorflow placeholder tensor + ''' + + assert isinstance(ob_space, Discrete) or isinstance(ob_space, Box), \ + 'Can only deal with Discrete and Box observation spaces for now' + + return tf.placeholder(shape=(batch_size,) + ob_space.shape, dtype=ob_space.dtype, name=name) + + +def observation_input(ob_space, batch_size=None, name='Ob'): + ''' + Create placeholder to feed observations into of the size appropriate to the observation space, and add input + encoder of the appropriate type. 
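Editor's note: the rewritten input helpers split the old observation_input into two composable pieces. A short sketch of how they fit together, assuming TF 1.x; the Box space below is chosen purely as an example:

    import tensorflow as tf
    from gym.spaces import Box
    from baselines.common.input import observation_placeholder, encode_observation, observation_input

    ob_space = Box(low=-1.0, high=1.0, shape=(4,), dtype='float32')   # example space
    X = observation_placeholder(ob_space)        # tf.placeholder of shape (None, 4)
    encoded = encode_observation(ob_space, X)    # cast to float32 (one-hot for Discrete spaces)
    X2, encoded2 = observation_input(ob_space)   # the old one-call form, now built from the two above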
+ ''' + + placeholder = observation_placeholder(ob_space, batch_size, name) + return placeholder, encode_observation(ob_space, placeholder) + +def encode_observation(ob_space, placeholder): + ''' + Encode input in the way that is appropriate to the observation space + + Parameters: + ---------- + + ob_space: gym.Space observation space + + placeholder: tf.placeholder observation input placeholder ''' if isinstance(ob_space, Discrete): - input_x = tf.placeholder(shape=(batch_size,), dtype=tf.int32, name=name) - processed_x = tf.to_float(tf.one_hot(input_x, ob_space.n)) - return input_x, processed_x + return tf.to_float(tf.one_hot(placeholder, ob_space.n)) elif isinstance(ob_space, Box): - input_shape = (batch_size,) + ob_space.shape - input_x = tf.placeholder(shape=input_shape, dtype=ob_space.dtype, name=name) - processed_x = tf.to_float(input_x) - return input_x, processed_x - + return tf.to_float(placeholder) else: raise NotImplementedError - diff --git a/baselines/common/misc_util.py b/baselines/common/misc_util.py index 9985dea..451de1c 100644 --- a/baselines/common/misc_util.py +++ b/baselines/common/misc_util.py @@ -67,14 +67,21 @@ class EzPickle(object): def set_global_seeds(i): + try: + import MPI + rank = MPI.COMM_WORLD.Get_rank() + except ImportError: + rank = 0 + + myseed = i + 1000 * rank if i is not None else None try: import tensorflow as tf except ImportError: pass else: - tf.set_random_seed(i) - np.random.seed(i) - random.seed(i) + tf.set_random_seed(myseed) + np.random.seed(myseed) + random.seed(myseed) def pretty_eta(seconds_left): diff --git a/baselines/common/models.py b/baselines/common/models.py new file mode 100644 index 0000000..0763095 --- /dev/null +++ b/baselines/common/models.py @@ -0,0 +1,177 @@ +import numpy as np +import tensorflow as tf +from baselines.a2c import utils +from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch +from baselines.common.mpi_running_mean_std import RunningMeanStd +import tensorflow.contrib.layers as layers + + +def nature_cnn(unscaled_images, **conv_kwargs): + """ + CNN from Nature paper. + """ + scaled_images = tf.cast(unscaled_images, tf.float32) / 255. + activ = tf.nn.relu + h = activ(conv(scaled_images, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2), + **conv_kwargs)) + h2 = activ(conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), **conv_kwargs)) + h3 = activ(conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), **conv_kwargs)) + h3 = conv_to_fc(h3) + return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))) + + +def mlp(num_layers=2, num_hidden=64, activation=tf.tanh): + """ + Simple fully connected layer policy. Separate stacks of fully-connected layers are used for policy and value function estimation. + More customized fully-connected policies can be obtained by using PolicyWithV class directly. 
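Editor's note: the nature_cnn feature extractor defined above is reused by several of these builders; for orientation, this sketch traces its output shape for a standard 84x84x4 Atari observation (TF 1.x assumed, batch size symbolic):

    import tensorflow as tf
    from baselines.common.models import nature_cnn

    X = tf.placeholder(tf.uint8, (None, 84, 84, 4))   # stacked, warped Atari frames
    h = nature_cnn(X)   # 8x8/4 -> 4x4/2 -> 3x3/1 convolutions, then a 512-unit fully connected layer
    print(h.shape)      # (?, 512)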
+ + Parameters: + ---------- + + num_layers: int number of fully-connected layers (default: 2) + + num_hidden: int size of fully-connected layers (default: 64) + + activation: activation function (default: tf.tanh) + + Returns: + ------- + + function that builds fully connected network with a given input placeholder + """ + def network_fn(X): + h = tf.layers.flatten(X) + for i in range(num_layers): + h = activation(fc(h, 'mlp_fc{}'.format(i), nh=num_hidden, init_scale=np.sqrt(2))) + return h, None + + return network_fn + + +def cnn(**conv_kwargs): + def network_fn(X): + return nature_cnn(X, **conv_kwargs), None + return network_fn + +def cnn_small(**conv_kwargs): + def network_fn(X): + h = tf.cast(X, tf.float32) / 255. + + activ = tf.nn.relu + h = activ(conv(h, 'c1', nf=8, rf=8, stride=4, init_scale=np.sqrt(2), **conv_kwargs)) + h = activ(conv(h, 'c2', nf=16, rf=4, stride=2, init_scale=np.sqrt(2), **conv_kwargs)) + h = conv_to_fc(h) + h = activ(fc(h, 'fc1', nh=128, init_scale=np.sqrt(2))) + return h, None + return network_fn + + + +def lstm(nlstm=128, layer_norm=False): + def network_fn(X, nenv=1): + nbatch = X.shape[0] + nsteps = nbatch // nenv + + h = tf.layers.flatten(X) + + M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) + S = tf.placeholder(tf.float32, [nenv, 2*nlstm]) #states + + xs = batch_to_seq(h, nenv, nsteps) + ms = batch_to_seq(M, nenv, nsteps) + + if layer_norm: + h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm) + else: + h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm) + + h = seq_to_batch(h5) + initial_state = np.zeros(S.shape.as_list(), dtype=float) + + return h, {'S':S, 'M':M, 'state':snew, 'initial_state':initial_state} + + return network_fn + + +def cnn_lstm(nlstm=128, layer_norm=False, **conv_kwargs): + def network_fn(X, nenv=1): + nbatch = X.shape[0] + nsteps = nbatch // nenv + + h = nature_cnn(X, **conv_kwargs) + + M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) + S = tf.placeholder(tf.float32, [nenv, 2*nlstm]) #states + + xs = batch_to_seq(h, nenv, nsteps) + ms = batch_to_seq(M, nenv, nsteps) + + if layer_norm: + h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm) + else: + h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm) + + h = seq_to_batch(h5) + initial_state = np.zeros(S.shape.as_list(), dtype=float) + + return h, {'S':S, 'M':M, 'state':snew, 'initial_state':initial_state} + + return network_fn + +def cnn_lnlstm(nlstm=128, **conv_kwargs): + return cnn_lstm(nlstm, layer_norm=True, **conv_kwargs) + + +def conv_only(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], **conv_kwargs): + ''' + convolutions-only net + + Parameters: + ---------- + + conv: list of triples (filter_number, filter_size, stride) specifying parameters for each layer. + + Returns: + + function that takes tensorflow tensor as input and returns the output of the last convolutional layer + + ''' + + def network_fn(X): + out = X + with tf.variable_scope("convnet"): + for num_outputs, kernel_size, stride in convs: + out = layers.convolution2d(out, + num_outputs=num_outputs, + kernel_size=kernel_size, + stride=stride, + activation_fn=tf.nn.relu, + **conv_kwargs) + + return out, None + return network_fn + +def _normalize_clip_observation(x, clip_range=[-5.0, 5.0]): + rms = RunningMeanStd(shape=x.shape[1:]) + norm_x = tf.clip_by_value((x - rms.mean) / rms.std, min(clip_range), max(clip_range)) + return norm_x, rms + + +def get_network_builder(name): + # TODO: replace with reflection? 
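Editor's note: get_network_builder here is the registry that build_policy and the new learn() entry points consult when given a network name. The contract, sketched with illustrative sizes (TF 1.x assumed): the registered factory returns a network_fn, and network_fn maps an observation tensor to a (latent, recurrent_tensors) pair, with None in the second slot for feed-forward networks:

    import tensorflow as tf
    from baselines.common.models import get_network_builder

    network_fn = get_network_builder('mlp')(num_layers=3, num_hidden=32)
    X = tf.placeholder(tf.float32, (None, 8))   # example flat observation batch
    latent, recurrent = network_fn(X)
    print(latent.shape, recurrent)              # (?, 32) None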
+ if name == 'cnn': + return cnn + elif name == 'cnn_small': + return cnn_small + elif name == 'conv_only': + return conv_only + elif name == 'mlp': + return mlp + elif name == 'lstm': + return lstm + elif name == 'cnn_lstm': + return cnn_lstm + elif name == 'cnn_lnlstm': + return cnn_lnlstm + else: + raise ValueError('Unknown network type: {}'.format(name)) diff --git a/baselines/common/mpi_adam_optimizer.py b/baselines/common/mpi_adam_optimizer.py new file mode 100644 index 0000000..8cf09c4 --- /dev/null +++ b/baselines/common/mpi_adam_optimizer.py @@ -0,0 +1,31 @@ +import numpy as np +import tensorflow as tf +from mpi4py import MPI + +class MpiAdamOptimizer(tf.train.AdamOptimizer): + """Adam optimizer that averages gradients across mpi processes.""" + def __init__(self, comm, **kwargs): + self.comm = comm + tf.train.AdamOptimizer.__init__(self, **kwargs) + def compute_gradients(self, loss, var_list, **kwargs): + grads_and_vars = tf.train.AdamOptimizer.compute_gradients(self, loss, var_list, **kwargs) + grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None] + flat_grad = tf.concat([tf.reshape(g, (-1,)) for g, v in grads_and_vars], axis=0) + shapes = [v.shape.as_list() for g, v in grads_and_vars] + sizes = [int(np.prod(s)) for s in shapes] + + num_tasks = self.comm.Get_size() + buf = np.zeros(sum(sizes), np.float32) + + def _collect_grads(flat_grad): + self.comm.Allreduce(flat_grad, buf, op=MPI.SUM) + np.divide(buf, float(num_tasks), out=buf) + return buf + + avg_flat_grad = tf.py_func(_collect_grads, [flat_grad], tf.float32) + avg_flat_grad.set_shape(flat_grad.shape) + avg_grads = tf.split(avg_flat_grad, sizes, axis=0) + avg_grads_and_vars = [(tf.reshape(g, v.shape), v) + for g, (_, v) in zip(avg_grads, grads_and_vars)] + + return avg_grads_and_vars diff --git a/baselines/common/mpi_util.py b/baselines/common/mpi_util.py new file mode 100644 index 0000000..f04187b --- /dev/null +++ b/baselines/common/mpi_util.py @@ -0,0 +1,101 @@ +from collections import defaultdict +from mpi4py import MPI +import os, numpy as np +import platform +import shutil +import subprocess + +def sync_from_root(sess, variables, comm=None): + """ + Send the root node's parameters to every worker. + Arguments: + sess: the TensorFlow session. + variables: all parameter variables including optimizer's + """ + if comm is None: comm = MPI.COMM_WORLD + rank = comm.Get_rank() + for var in variables: + if rank == 0: + comm.Bcast(sess.run(var)) + else: + import tensorflow as tf + returned_var = np.empty(var.shape, dtype='float32') + comm.Bcast(returned_var) + sess.run(tf.assign(var, returned_var)) + +def gpu_count(): + """ + Count the GPUs on this machine. + """ + if shutil.which('nvidia-smi') is None: + return 0 + output = subprocess.check_output(['nvidia-smi', '--query-gpu=gpu_name', '--format=csv']) + return max(0, len(output.split(b'\n')) - 2) + +def setup_mpi_gpus(): + """ + Set CUDA_VISIBLE_DEVICES using MPI. + """ + num_gpus = gpu_count() + if num_gpus == 0: + return + local_rank, _ = get_local_rank_size(MPI.COMM_WORLD) + os.environ['CUDA_VISIBLE_DEVICES'] = str(local_rank % num_gpus) + +def get_local_rank_size(comm): + """ + Returns the rank of each process on its machine + The processes on a given machine will be assigned ranks + 0, 1, 2, ..., N-1, + where N is the number of processes on this machine. 
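Editor's note: a construction sketch for the MpiAdamOptimizer introduced above. It assumes mpi4py is available and the script is launched under mpirun; the toy loss is illustrative only. Each rank computes its own gradient, compute_gradients() flattens it, Allreduce-sums it, and divides by the worker count, so every rank applies the same averaged update:

    import tensorflow as tf
    from mpi4py import MPI
    from baselines.common.mpi_adam_optimizer import MpiAdamOptimizer

    w = tf.get_variable('w', initializer=tf.zeros([10]))
    loss = tf.reduce_sum(tf.square(w - 1.0))    # toy objective
    train_op = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=1e-3).minimize(loss)

    sess = tf.InteractiveSession()
    sess.run(tf.global_variables_initializer())
    sess.run(train_op)   # gradient is averaged across all MPI ranks inside compute_gradients()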
+ + Useful if you want to assign one gpu per machine + """ + this_node = platform.node() + ranks_nodes = comm.allgather((comm.Get_rank(), this_node)) + node2rankssofar = defaultdict(int) + local_rank = None + for (rank, node) in ranks_nodes: + if rank == comm.Get_rank(): + local_rank = node2rankssofar[node] + node2rankssofar[node] += 1 + assert local_rank is not None + return local_rank, node2rankssofar[this_node] + +def share_file(comm, path): + """ + Copies the file from rank 0 to all other ranks + Puts it in the same place on all machines + """ + localrank, _ = get_local_rank_size(comm) + if comm.Get_rank() == 0: + with open(path, 'rb') as fh: + data = fh.read() + comm.bcast(data) + else: + data = comm.bcast(None) + if localrank == 0: + os.makedirs(os.path.dirname(path), exist_ok=True) + with open(path, 'wb') as fh: + fh.write(data) + comm.Barrier() + +def dict_gather(comm, d, op='mean', assert_all_have_data=True): + if comm is None: return d + alldicts = comm.allgather(d) + size = comm.size + k2li = defaultdict(list) + for d in alldicts: + for (k,v) in d.items(): + k2li[k].append(v) + result = {} + for (k,li) in k2li.items(): + if assert_all_have_data: + assert len(li)==size, "only %i out of %i MPI workers have sent '%s'" % (len(li), size, k) + if op=='mean': + result[k] = np.mean(li, axis=0) + elif op=='sum': + result[k] = np.sum(li, axis=0) + else: + assert 0, op + return result diff --git a/baselines/common/policies.py b/baselines/common/policies.py new file mode 100644 index 0000000..4ad41cb --- /dev/null +++ b/baselines/common/policies.py @@ -0,0 +1,179 @@ +import tensorflow as tf +from baselines.common import tf_util +from baselines.a2c.utils import fc +from baselines.common.distributions import make_pdtype +from baselines.common.input import observation_placeholder, encode_observation +from baselines.common.tf_util import adjust_shape +from baselines.common.mpi_running_mean_std import RunningMeanStd +from baselines.common.models import get_network_builder + +import gym + + +class PolicyWithValue(object): + """ + Encapsulates fields and methods for RL policy and value function estimation with shared parameters + """ + + def __init__(self, env, observations, latent, estimate_q=False, vf_latent=None, sess=None, **tensors): + """ + Parameters: + ---------- + env RL environment + + observations tensorflow placeholder in which the observations will be fed + + latent latent state from which policy distribution parameters should be inferred + + vf_latent latent state from which value function should be inferred (if None, then latent is used) + + sess tensorflow session to run calculations in (if None, default session is used) + + **tensors tensorflow tensors for additional attributes such as state or mask + + """ + + self.X = observations + self.state = tf.constant([]) + self.initial_state = None + self.__dict__.update(tensors) + + vf_latent = vf_latent if vf_latent is not None else latent + + vf_latent = tf.layers.flatten(vf_latent) + latent = tf.layers.flatten(latent) + + self.pdtype = make_pdtype(env.action_space) + + self.pd, self.pi = self.pdtype.pdfromlatent(latent, init_scale=0.01) + + self.action = self.pd.sample() + self.neglogp = self.pd.neglogp(self.action) + self.sess = sess + + if estimate_q: + assert isinstance(env.action_space, gym.spaces.Discrete) + self.q = fc(vf_latent, 'q', env.action_space.n) + self.vf = self.q + else: + self.vf = fc(vf_latent, 'vf', 1) + self.vf = self.vf[:,0] + + def _evaluate(self, variables, observation, **extra_feed): + sess = self.sess or 
tf.get_default_session() + feed_dict = {self.X: adjust_shape(self.X, observation)} + for inpt_name, data in extra_feed.items(): + if inpt_name in self.__dict__.keys(): + inpt = self.__dict__[inpt_name] + if isinstance(inpt, tf.Tensor) and inpt._op.type == 'Placeholder': + feed_dict[inpt] = adjust_shape(inpt, data) + + return sess.run(variables, feed_dict) + + def step(self, observation, **extra_feed): + """ + Compute next action(s) given the observaion(s) + + Parameters: + ---------- + + observation observation data (either single or a batch) + + **extra_feed additional data such as state or mask (names of the arguments should match the ones in constructor, see __init__) + + Returns: + ------- + (action, value estimate, next state, negative log likelihood of the action under current policy parameters) tuple + """ + + a, v, state, neglogp = self._evaluate([self.action, self.vf, self.state, self.neglogp], observation, **extra_feed) + if state.size == 0: + state = None + return a, v, state, neglogp + + def value(self, ob, *args, **kwargs): + """ + Compute value estimate(s) given the observaion(s) + + Parameters: + ---------- + + observation observation data (either single or a batch) + + **extra_feed additional data such as state or mask (names of the arguments should match the ones in constructor, see __init__) + + Returns: + ------- + value estimate + """ + return self._evaluate(self.vf, ob, *args, **kwargs) + + def save(self, save_path): + tf_util.save_state(save_path, sess=self.sess) + + def load(self, load_path): + tf_util.load_state(load_path, sess=self.sess) + +def build_policy(env, policy_network, value_network=None, normalize_observations=False, estimate_q=False, **policy_kwargs): + if isinstance(policy_network, str): + network_type = policy_network + policy_network = get_network_builder(network_type)(**policy_kwargs) + + def policy_fn(nbatch=None, nsteps=None, sess=None, observ_placeholder=None): + ob_space = env.observation_space + + X = observ_placeholder if observ_placeholder is not None else observation_placeholder(ob_space, batch_size=nbatch) + + extra_tensors = {} + + if normalize_observations and X.dtype == tf.float32: + encoded_x, rms = _normalize_clip_observation(X) + extra_tensors['rms'] = rms + else: + encoded_x = X + + encoded_x = encode_observation(ob_space, encoded_x) + + with tf.variable_scope('pi', reuse=tf.AUTO_REUSE): + policy_latent, recurrent_tensors = policy_network(encoded_x) + + if recurrent_tensors is not None: + # recurrent architecture, need a few more steps + nenv = nbatch // nsteps + assert nenv > 0, 'Bad input for recurrent policy: batch size {} smaller than nsteps {}'.format(nbatch, nsteps) + policy_latent, recurrent_tensors = policy_network(encoded_x, nenv) + extra_tensors.update(recurrent_tensors) + + + _v_net = value_network + + if _v_net is None or _v_net == 'shared': + vf_latent = policy_latent + else: + if _v_net == 'copy': + _v_net = policy_network + else: + assert callable(_v_net) + + with tf.variable_scope('vf', reuse=tf.AUTO_REUSE): + vf_latent, _ = _v_net(encoded_x) + + policy = PolicyWithValue( + env=env, + observations=X, + latent=policy_latent, + vf_latent=vf_latent, + sess=sess, + estimate_q=estimate_q, + **extra_tensors + ) + return policy + + return policy_fn + + +def _normalize_clip_observation(x, clip_range=[-5.0, 5.0]): + rms = RunningMeanStd(shape=x.shape[1:]) + norm_x = tf.clip_by_value((x - rms.mean) / rms.std, min(clip_range), max(clip_range)) + return norm_x, rms + diff --git a/baselines/common/retro_wrappers.py 
b/baselines/common/retro_wrappers.py new file mode 100644 index 0000000..3eb2eb3 --- /dev/null +++ b/baselines/common/retro_wrappers.py @@ -0,0 +1,293 @@ + # flake8: noqa F403, F405 +from .atari_wrappers import * +import numpy as np +import gym + +class TimeLimit(gym.Wrapper): + def __init__(self, env, max_episode_steps=None): + super(TimeLimit, self).__init__(env) + self._max_episode_steps = max_episode_steps + self._elapsed_steps = 0 + + def step(self, ac): + observation, reward, done, info = self.env.step(ac) + self._elapsed_steps += 1 + if self._elapsed_steps >= self._max_episode_steps: + done = True + info['TimeLimit.truncated'] = True + return observation, reward, done, info + + def reset(self, **kwargs): + self._elapsed_steps = 0 + return self.env.reset(**kwargs) + +class StochasticFrameSkip(gym.Wrapper): + def __init__(self, env, n, stickprob): + gym.Wrapper.__init__(self, env) + self.n = n + self.stickprob = stickprob + self.curac = None + self.rng = np.random.RandomState() + self.supports_want_render = hasattr(env, "supports_want_render") + + def reset(self, **kwargs): + self.curac = None + return self.env.reset(**kwargs) + + def step(self, ac): + done = False + totrew = 0 + for i in range(self.n): + # First step after reset, use action + if self.curac is None: + self.curac = ac + # First substep, delay with probability=stickprob + elif i==0: + if self.rng.rand() > self.stickprob: + self.curac = ac + # Second substep, new action definitely kicks in + elif i==1: + self.curac = ac + if self.supports_want_render and i self.channel + for _ in range(self.k): + self.frames.append(ob) + return self._get_ob() + + def step(self, ac): + ob, reward, done, info = self.env.step(ac) + self.frames.append(ob) + return self._get_ob(), reward, done, info + + def _get_ob(self): + assert len(self.frames) == self.k + return np.concatenate([frame if i==self.k-1 else frame[:,:,self.channel:self.channel+1] + for (i, frame) in enumerate(self.frames)], axis=2) + +class Downsample(gym.ObservationWrapper): + def __init__(self, env, ratio): + """ + Downsample images by a factor of ratio + """ + gym.ObservationWrapper.__init__(self, env) + (oldh, oldw, oldc) = env.observation_space.shape + newshape = (oldh//ratio, oldw//ratio, oldc) + self.observation_space = spaces.Box(low=0, high=255, + shape=newshape, dtype=np.uint8) + + def observation(self, frame): + height, width, _ = self.observation_space.shape + frame = cv2.resize(frame, (width, height), interpolation=cv2.INTER_AREA) + if frame.ndim == 2: + frame = frame[:,:,None] + return frame + +class Rgb2gray(gym.ObservationWrapper): + def __init__(self, env): + """ + Downsample images by a factor of ratio + """ + gym.ObservationWrapper.__init__(self, env) + (oldh, oldw, _oldc) = env.observation_space.shape + self.observation_space = spaces.Box(low=0, high=255, + shape=(oldh, oldw, 1), dtype=np.uint8) + + def observation(self, frame): + frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) + return frame[:,:,None] + + +class MovieRecord(gym.Wrapper): + def __init__(self, env, savedir, k): + gym.Wrapper.__init__(self, env) + self.savedir = savedir + self.k = k + self.epcount = 0 + def reset(self): + if self.epcount % self.k == 0: + print('saving movie this episode', self.savedir) + self.env.unwrapped.movie_path = self.savedir + else: + print('not saving this episode') + self.env.unwrapped.movie_path = None + self.env.unwrapped.movie = None + self.epcount += 1 + return self.env.reset() + +class AppendTimeout(gym.Wrapper): + def __init__(self, env): + 
gym.Wrapper.__init__(self, env) + self.action_space = env.action_space + self.timeout_space = gym.spaces.Box(low=np.array([0.0]), high=np.array([1.0]), dtype=np.float32) + self.original_os = env.observation_space + if isinstance(self.original_os, gym.spaces.Dict): + import copy + ordered_dict = copy.deepcopy(self.original_os.spaces) + ordered_dict['value_estimation_timeout'] = self.timeout_space + self.observation_space = gym.spaces.Dict(ordered_dict) + self.dict_mode = True + else: + self.observation_space = gym.spaces.Dict({ + 'original': self.original_os, + 'value_estimation_timeout': self.timeout_space + }) + self.dict_mode = False + self.ac_count = None + while 1: + if not hasattr(env, "_max_episode_steps"): # Looking for TimeLimit wrapper that has this field + env = env.env + continue + break + self.timeout = env._max_episode_steps + + def step(self, ac): + self.ac_count += 1 + ob, rew, done, info = self.env.step(ac) + return self._process(ob), rew, done, info + + def reset(self): + self.ac_count = 0 + return self._process(self.env.reset()) + + def _process(self, ob): + fracmissing = 1 - self.ac_count / self.timeout + if self.dict_mode: + ob['value_estimation_timeout'] = fracmissing + else: + return { 'original': ob, 'value_estimation_timeout': fracmissing } + +class StartDoingRandomActionsWrapper(gym.Wrapper): + """ + Warning: can eat info dicts, not good if you depend on them + """ + def __init__(self, env, max_random_steps, on_startup=True, every_episode=False): + gym.Wrapper.__init__(self, env) + self.on_startup = on_startup + self.every_episode = every_episode + self.random_steps = max_random_steps + self.last_obs = None + if on_startup: + self.some_random_steps() + + def some_random_steps(self): + self.last_obs = self.env.reset() + n = np.random.randint(self.random_steps) + #print("running for random %i frames" % n) + for _ in range(n): + self.last_obs, _, done, _ = self.env.step(self.env.action_space.sample()) + if done: self.last_obs = self.env.reset() + + def reset(self): + return self.last_obs + + def step(self, a): + self.last_obs, rew, done, info = self.env.step(a) + if done: + self.last_obs = self.env.reset() + if self.every_episode: + self.some_random_steps() + return self.last_obs, rew, done, info + +def make_retro(*, game, state, max_episode_steps, **kwargs): + import retro + env = retro.make(game, state, **kwargs) + env = StochasticFrameSkip(env, n=4, stickprob=0.25) + if max_episode_steps is not None: + env = TimeLimit(env, max_episode_steps=max_episode_steps) + return env + +def wrap_deepmind_retro(env, scale=True, frame_stack=4): + """ + Configure environment for retro games, using config similar to DeepMind-style Atari in wrap_deepmind + """ + env = WarpFrame(env) + env = ClipRewardEnv(env) + env = FrameStack(env, frame_stack) + if scale: + env = ScaledFloatFrame(env) + return env + +class SonicDiscretizer(gym.ActionWrapper): + """ + Wrap a gym-retro environment and make it use discrete + actions for the Sonic game. 
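Editor's note: the helpers above are meant to be chained; a typical composition is sketched below. It assumes gym-retro and a Sonic ROM are installed, and the game/state names are examples rather than anything this patch pins down:

    from baselines.common.retro_wrappers import make_retro, wrap_deepmind_retro, SonicDiscretizer

    env = make_retro(game='SonicTheHedgehog-Genesis', state='GreenHillZone.Act1',
                     max_episode_steps=4500)   # StochasticFrameSkip(4, 0.25) + TimeLimit
    env = SonicDiscretizer(env)                # 12-button pad reduced to 7 discrete actions
    env = wrap_deepmind_retro(env)             # WarpFrame, ClipRewardEnv, FrameStack(4), ScaledFloatFrame
    obs = env.reset()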
+ """ + def __init__(self, env): + super(SonicDiscretizer, self).__init__(env) + buttons = ["B", "A", "MODE", "START", "UP", "DOWN", "LEFT", "RIGHT", "C", "Y", "X", "Z"] + actions = [['LEFT'], ['RIGHT'], ['LEFT', 'DOWN'], ['RIGHT', 'DOWN'], ['DOWN'], + ['DOWN', 'B'], ['B']] + self._actions = [] + for action in actions: + arr = np.array([False] * 12) + for button in action: + arr[buttons.index(button)] = True + self._actions.append(arr) + self.action_space = gym.spaces.Discrete(len(self._actions)) + + def action(self, a): # pylint: disable=W0221 + return self._actions[a].copy() + +class RewardScaler(gym.RewardWrapper): + """ + Bring rewards to a reasonable scale for PPO. + This is incredibly important and effects performance + drastically. + """ + def __init__(self, env, scale=0.01): + super(RewardScaler, self).__init__(env) + self.scale = scale + + def reward(self, reward): + return reward * self.scale + +class AllowBacktracking(gym.Wrapper): + """ + Use deltas in max(X) as the reward, rather than deltas + in X. This way, agents are not discouraged too heavily + from exploring backwards if there is no way to advance + head-on in the level. + """ + def __init__(self, env): + super(AllowBacktracking, self).__init__(env) + self._cur_x = 0 + self._max_x = 0 + + def reset(self, **kwargs): # pylint: disable=E0202 + self._cur_x = 0 + self._max_x = 0 + return self.env.reset(**kwargs) + + def step(self, action): # pylint: disable=E0202 + obs, rew, done, info = self.env.step(action) + self._cur_x += rew + rew = max(0, self._cur_x - self._max_x) + self._max_x = max(self._max_x, self._cur_x) + return obs, rew, done, info diff --git a/baselines/common/runners.py b/baselines/common/runners.py index 0a4b221..c30e322 100644 --- a/baselines/common/runners.py +++ b/baselines/common/runners.py @@ -5,7 +5,7 @@ class AbstractEnvRunner(ABC): def __init__(self, *, env, model, nsteps): self.env = env self.model = model - nenv = env.num_envs + self.nenv = nenv = env.num_envs if hasattr(env, 'num_envs') else 1 self.batch_ob_shape = (nenv*nsteps,) + env.observation_space.shape self.obs = np.zeros((nenv,) + env.observation_space.shape, dtype=env.observation_space.dtype.name) self.obs[:] = env.reset() @@ -16,3 +16,4 @@ class AbstractEnvRunner(ABC): @abstractmethod def run(self): raise NotImplementedError + diff --git a/baselines/common/running_mean_std.py b/baselines/common/running_mean_std.py index 06ba8d8..504c7c9 100644 --- a/baselines/common/running_mean_std.py +++ b/baselines/common/running_mean_std.py @@ -1,4 +1,7 @@ +import tensorflow as tf import numpy as np +from baselines.common.tf_util import get_session + class RunningMeanStd(object): # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm def __init__(self, epsilon=1e-4, shape=()): @@ -13,20 +16,71 @@ class RunningMeanStd(object): self.update_from_moments(batch_mean, batch_var, batch_count) def update_from_moments(self, batch_mean, batch_var, batch_count): - delta = batch_mean - self.mean - tot_count = self.count + batch_count + self.mean, self.var, self.count = update_mean_var_count_from_moments( + self.mean, self.var, self.count, batch_mean, batch_var, batch_count) - new_mean = self.mean + delta * batch_count / tot_count - m_a = self.var * (self.count) - m_b = batch_var * (batch_count) - M2 = m_a + m_b + np.square(delta) * self.count * batch_count / (self.count + batch_count) - new_var = M2 / (self.count + batch_count) +def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var, batch_count): + delta = 
batch_mean - mean + tot_count = count + batch_count - new_count = batch_count + self.count + new_mean = mean + delta * batch_count / tot_count + m_a = var * count + m_b = batch_var * batch_count + M2 = m_a + m_b + np.square(delta) * count * batch_count / (count + batch_count) + new_var = M2 / (count + batch_count) + new_count = batch_count + count + + return new_mean, new_var, new_count + - self.mean = new_mean - self.var = new_var - self.count = new_count +class TfRunningMeanStd(object): + # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm + ''' + TensorFlow variables-based implmentation of computing running mean and std + Benefit of this implementation is that it can be saved / loaded together with the tensorflow model + ''' + def __init__(self, epsilon=1e-4, shape=(), scope=''): + sess = get_session() + + self._new_mean = tf.placeholder(shape=shape, dtype=tf.float64) + self._new_var = tf.placeholder(shape=shape, dtype=tf.float64) + self._new_count = tf.placeholder(shape=(), dtype=tf.float64) + + + with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): + self._mean = tf.get_variable('mean', initializer=np.zeros(shape, 'float64'), dtype=tf.float64) + self._var = tf.get_variable('std', initializer=np.ones(shape, 'float64'), dtype=tf.float64) + self._count = tf.get_variable('count', initializer=np.full((), epsilon, 'float64'), dtype=tf.float64) + + self.update_ops = tf.group([ + self._var.assign(self._new_var), + self._mean.assign(self._new_mean), + self._count.assign(self._new_count) + ]) + + sess.run(tf.variables_initializer([self._mean, self._var, self._count])) + self.sess = sess + self._set_mean_var_count() + + def _set_mean_var_count(self): + self.mean, self.var, self.count = self.sess.run([self._mean, self._var, self._count]) + + def update(self, x): + batch_mean = np.mean(x, axis=0) + batch_var = np.var(x, axis=0) + batch_count = x.shape[0] + + new_mean, new_var, new_count = update_mean_var_count_from_moments(self.mean, self.var, self.count, batch_mean, batch_var, batch_count) + + self.sess.run(self.update_ops, feed_dict={ + self._new_mean: new_mean, + self._new_var: new_var, + self._new_count: new_count + }) + + self._set_mean_var_count() + + def test_runningmeanstd(): for (x1, x2, x3) in [ @@ -43,4 +97,91 @@ def test_runningmeanstd(): rms.update(x3) ms2 = [rms.mean, rms.var] - assert np.allclose(ms1, ms2) + np.testing.assert_allclose(ms1, ms2) + +def test_tf_runningmeanstd(): + for (x1, x2, x3) in [ + (np.random.randn(3), np.random.randn(4), np.random.randn(5)), + (np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)), + ]: + + rms = TfRunningMeanStd(epsilon=0.0, shape=x1.shape[1:], scope='running_mean_std' + str(np.random.randint(0, 128))) + + x = np.concatenate([x1, x2, x3], axis=0) + ms1 = [x.mean(axis=0), x.var(axis=0)] + rms.update(x1) + rms.update(x2) + rms.update(x3) + ms2 = [rms.mean, rms.var] + + np.testing.assert_allclose(ms1, ms2) + + +def profile_tf_runningmeanstd(): + import time + from baselines.common import tf_util + + tf_util.get_session( config=tf.ConfigProto( + inter_op_parallelism_threads=1, + intra_op_parallelism_threads=1, + allow_soft_placement=True + )) + + x = np.random.random((376,)) + + n_trials = 10000 + rms = RunningMeanStd() + tfrms = TfRunningMeanStd() + + tic1 = time.time() + for _ in range(n_trials): + rms.update(x) + + tic2 = time.time() + for _ in range(n_trials): + tfrms.update(x) + + tic3 = time.time() + + print('rms update time ({} trials): {} s'.format(n_trials, tic2 - tic1)) + print('tfrms update 
time ({} trials): {} s'.format(n_trials, tic3 - tic2)) + + + tic1 = time.time() + for _ in range(n_trials): + z1 = rms.mean + + tic2 = time.time() + for _ in range(n_trials): + z2 = tfrms.mean + + assert z1 == z2 + + tic3 = time.time() + + print('rms get mean time ({} trials): {} s'.format(n_trials, tic2 - tic1)) + print('tfrms get mean time ({} trials): {} s'.format(n_trials, tic3 - tic2)) + + + + ''' + options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) #pylint: disable=E1101 + run_metadata = tf.RunMetadata() + profile_opts = dict(options=options, run_metadata=run_metadata) + + + + from tensorflow.python.client import timeline + fetched_timeline = timeline.Timeline(run_metadata.step_stats) #pylint: disable=E1101 + chrome_trace = fetched_timeline.generate_chrome_trace_format() + outfile = '/tmp/timeline.json' + with open(outfile, 'wt') as f: + f.write(chrome_trace) + print(f'Successfully saved profile to {outfile}. Exiting.') + exit(0) + ''' + + + +if __name__ == '__main__': + profile_tf_runningmeanstd() diff --git a/baselines/common/test_identity.py b/baselines/common/test_identity.py deleted file mode 100644 index a429e0c..0000000 --- a/baselines/common/test_identity.py +++ /dev/null @@ -1,44 +0,0 @@ -import pytest -import tensorflow as tf -import random -import numpy as np -from gym.spaces import np_random - -from baselines.a2c import a2c -from baselines.ppo2 import ppo2 -from baselines.common.identity_env import IdentityEnv -from baselines.common.vec_env.dummy_vec_env import DummyVecEnv -from baselines.ppo2.policies import MlpPolicy - - -learn_func_list = [ - lambda e: a2c.learn(policy=MlpPolicy, env=e, seed=0, total_timesteps=50000), - lambda e: ppo2.learn(policy=MlpPolicy, env=e, total_timesteps=50000, lr=1e-3, nsteps=128, ent_coef=0.01) -] - - -@pytest.mark.slow -@pytest.mark.parametrize("learn_func", learn_func_list) -def test_identity(learn_func): - ''' - Test if the algorithm (with a given policy) - can learn an identity transformation (i.e. 
return observation as an action) - ''' - np.random.seed(0) - np_random.seed(0) - random.seed(0) - - env = DummyVecEnv([lambda: IdentityEnv(10)]) - - with tf.Graph().as_default(), tf.Session().as_default(): - tf.set_random_seed(0) - model = learn_func(env) - - N_TRIALS = 1000 - sum_rew = 0 - obs = env.reset() - for i in range(N_TRIALS): - obs, rew, done, _ = env.step(model.step(obs)[0]) - sum_rew += rew - - assert sum_rew > 0.9 * N_TRIALS diff --git a/baselines/common/tests/__init__.py b/baselines/common/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/baselines/common/tests/envs/__init__.py b/baselines/common/tests/envs/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/baselines/common/tests/envs/fixed_sequence_env.py b/baselines/common/tests/envs/fixed_sequence_env.py new file mode 100644 index 0000000..9f1b03d --- /dev/null +++ b/baselines/common/tests/envs/fixed_sequence_env.py @@ -0,0 +1,44 @@ +import numpy as np +from gym import Env +from gym.spaces import Discrete + + +class FixedSequenceEnv(Env): + def __init__( + self, + n_actions=10, + seed=0, + episode_len=100 + ): + self.np_random = np.random.RandomState() + self.np_random.seed(seed) + self.sequence = [self.np_random.randint(0, n_actions-1) for _ in range(episode_len)] + + self.action_space = Discrete(n_actions) + self.observation_space = Discrete(1) + + self.episode_len = episode_len + self.time = 0 + self.reset() + + def reset(self): + self.time = 0 + return 0 + + def step(self, actions): + rew = self._get_reward(actions) + self._choose_next_state() + done = False + if self.episode_len and self.time >= self.episode_len: + rew = 0 + done = True + + return 0, rew, done, {} + + def _choose_next_state(self): + self.time += 1 + + def _get_reward(self, actions): + return 1 if actions == self.sequence[self.time] else 0 + + diff --git a/baselines/common/tests/envs/identity_env.py b/baselines/common/tests/envs/identity_env.py new file mode 100644 index 0000000..005d3ff --- /dev/null +++ b/baselines/common/tests/envs/identity_env.py @@ -0,0 +1,70 @@ +import numpy as np +from abc import abstractmethod +from gym import Env +from gym.spaces import Discrete, Box + + +class IdentityEnv(Env): + def __init__( + self, + episode_len=None + ): + + self.episode_len = episode_len + self.time = 0 + self.reset() + + def reset(self): + self._choose_next_state() + self.time = 0 + self.observation_space = self.action_space + + return self.state + + def step(self, actions): + rew = self._get_reward(actions) + self._choose_next_state() + done = False + if self.episode_len and self.time >= self.episode_len: + rew = 0 + done = True + + return self.state, rew, done, {} + + def _choose_next_state(self): + self.state = self.action_space.sample() + self.time += 1 + + @abstractmethod + def _get_reward(self, actions): + raise NotImplementedError + + +class DiscreteIdentityEnv(IdentityEnv): + def __init__( + self, + dim, + episode_len=None, + ): + + self.action_space = Discrete(dim) + super().__init__(episode_len=episode_len) + + def _get_reward(self, actions): + return 1 if self.state == actions else 0 + + +class BoxIdentityEnv(IdentityEnv): + def __init__( + self, + shape, + episode_len=None, + ): + + self.action_space = Box(low=-1.0, high=1.0, shape=shape) + super().__init__(episode_len=episode_len) + + def _get_reward(self, actions): + diff = actions - self.state + diff = diff[:] + return -0.5 * np.dot(diff, diff) diff --git a/baselines/common/tests/envs/mnist_env.py b/baselines/common/tests/envs/mnist_env.py new 
file mode 100644 index 0000000..563e215 --- /dev/null +++ b/baselines/common/tests/envs/mnist_env.py @@ -0,0 +1,70 @@ +import os.path as osp +import numpy as np +import tempfile +import filelock +from gym import Env +from gym.spaces import Discrete, Box + + + +class MnistEnv(Env): + def __init__( + self, + seed=0, + episode_len=None, + no_images=None + ): + from tensorflow.examples.tutorials.mnist import input_data + # we could use temporary directory for this with a context manager and + # TemporaryDirecotry, but then each test that uses mnist would re-download the data + # this way the data is not cleaned up, but we only download it once per machine + mnist_path = osp.join(tempfile.gettempdir(), 'MNIST_data') + with filelock.FileLock(mnist_path + '.lock'): + self.mnist = input_data.read_data_sets(mnist_path) + + self.np_random = np.random.RandomState() + self.np_random.seed(seed) + + self.observation_space = Box(low=0.0, high=1.0, shape=(28,28,1)) + self.action_space = Discrete(10) + self.episode_len = episode_len + self.time = 0 + self.no_images = no_images + + self.train_mode() + self.reset() + + def reset(self): + self._choose_next_state() + self.time = 0 + + return self.state[0] + + def step(self, actions): + rew = self._get_reward(actions) + self._choose_next_state() + done = False + if self.episode_len and self.time >= self.episode_len: + rew = 0 + done = True + + return self.state[0], rew, done, {} + + def train_mode(self): + self.dataset = self.mnist.train + + def test_mode(self): + self.dataset = self.mnist.test + + def _choose_next_state(self): + max_index = (self.no_images if self.no_images is not None else self.dataset.num_examples) - 1 + index = self.np_random.randint(0, max_index) + image = self.dataset.images[index].reshape(28,28,1)*255 + label = self.dataset.labels[index] + self.state = (image, label) + self.time += 1 + + def _get_reward(self, actions): + return 1 if self.state[1] == actions else 0 + + diff --git a/baselines/common/tests/test_cartpole.py b/baselines/common/tests/test_cartpole.py new file mode 100644 index 0000000..359006c --- /dev/null +++ b/baselines/common/tests/test_cartpole.py @@ -0,0 +1,40 @@ +import pytest +import gym + +from baselines.run import get_learn_function +from baselines.common.tests.util import reward_per_episode_test + +common_kwargs = dict( + total_timesteps=30000, + network='mlp', + gamma=1.0, + seed=0, +) + +learn_kwargs = { + 'a2c' : dict(nsteps=32, value_network='copy', lr=0.05), + 'acktr': dict(nsteps=32, value_network='copy'), + 'deepq': {}, + 'ppo2': dict(value_network='copy'), + 'trpo_mpi': {} +} + +@pytest.mark.slow +@pytest.mark.parametrize("alg", learn_kwargs.keys()) +def test_cartpole(alg): + ''' + Test if the algorithm (with an mlp policy) + can learn to balance the cartpole + ''' + + kwargs = common_kwargs.copy() + kwargs.update(learn_kwargs[alg]) + + learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) + def env_fn(): + + env = gym.make('CartPole-v0') + env.seed(0) + return env + + reward_per_episode_test(env_fn, learn_fn, 100) diff --git a/baselines/common/tests/test_fixed_sequence.py b/baselines/common/tests/test_fixed_sequence.py new file mode 100644 index 0000000..f15ce0f --- /dev/null +++ b/baselines/common/tests/test_fixed_sequence.py @@ -0,0 +1,51 @@ +import pytest +from baselines.common.tests.envs.fixed_sequence_env import FixedSequenceEnv + +from baselines.common.tests.util import simple_test +from baselines.run import get_learn_function + +common_kwargs = dict( + seed=0, + total_timesteps=50000, +) + 
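Editor's note: these tests all lean on the same pattern; get_learn_function(alg) returns the algorithm's learn(...), which after this refactor takes an env, a network name and keyword overrides, and returns a model whose step() can be queried. A stand-alone sketch of that pattern (CartPole and the hyperparameters are illustrative):

    import gym
    from baselines.run import get_learn_function
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    env = DummyVecEnv([lambda: gym.make('CartPole-v0')])
    learn = get_learn_function('ppo2')
    model = learn(env=env, network='mlp', total_timesteps=30000, seed=0)
    actions, values, state, neglogp = model.step(env.reset())   # state is None for an mlp policy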
+learn_kwargs = { + 'a2c': {}, + 'ppo2': dict(nsteps=10, ent_coef=0.0, nminibatches=1), + # TODO enable sequential models for trpo_mpi (proper handling of nbatch and nsteps) + # github issue: https://github.com/openai/baselines/issues/188 + # 'trpo_mpi': lambda e, p: trpo_mpi.learn(policy_fn=p(env=e), env=e, max_timesteps=30000, timesteps_per_batch=100, cg_iters=10, gamma=0.9, lam=1.0, max_kl=0.001) +} + + +alg_list = learn_kwargs.keys() +rnn_list = ['lstm'] + +@pytest.mark.slow +@pytest.mark.parametrize("alg", alg_list) +@pytest.mark.parametrize("rnn", rnn_list) +def test_fixed_sequence(alg, rnn): + ''' + Test if the algorithm (with a given policy) + can learn an identity transformation (i.e. return observation as an action) + ''' + + kwargs = learn_kwargs[alg] + kwargs.update(common_kwargs) + + episode_len = 5 + env_fn = lambda: FixedSequenceEnv(10, episode_len=episode_len) + learn = lambda e: get_learn_function(alg)( + env=e, + network=rnn, + **kwargs + ) + + simple_test(env_fn, learn, 0.7) + + +if __name__ == '__main__': + test_fixed_sequence('ppo2', 'lstm') + + + diff --git a/baselines/common/tests/test_identity.py b/baselines/common/tests/test_identity.py new file mode 100644 index 0000000..71d5a3e --- /dev/null +++ b/baselines/common/tests/test_identity.py @@ -0,0 +1,55 @@ +import pytest +from baselines.common.tests.envs.identity_env import DiscreteIdentityEnv, BoxIdentityEnv +from baselines.run import get_learn_function +from baselines.common.tests.util import simple_test + +common_kwargs = dict( + total_timesteps=30000, + network='mlp', + gamma=0.9, + seed=0, +) + +learn_kwargs = { + 'a2c' : {}, + 'acktr': {}, + 'deepq': {}, + 'ppo2': dict(lr=1e-3, nsteps=64, ent_coef=0.0), + 'trpo_mpi': dict(timesteps_per_batch=100, cg_iters=10, gamma=0.9, lam=1.0, max_kl=0.01) +} + + +@pytest.mark.slow +@pytest.mark.parametrize("alg", learn_kwargs.keys()) +def test_discrete_identity(alg): + ''' + Test if the algorithm (with an mlp policy) + can learn an identity transformation (i.e. return observation as an action) + ''' + + kwargs = learn_kwargs[alg] + kwargs.update(common_kwargs) + + learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) + env_fn = lambda: DiscreteIdentityEnv(10, episode_len=100) + simple_test(env_fn, learn_fn, 0.9) + +@pytest.mark.slow +@pytest.mark.parametrize("alg", ['a2c', 'ppo2', 'trpo_mpi']) +def test_continuous_identity(alg): + ''' + Test if the algorithm (with an mlp policy) + can learn an identity transformation (i.e. return observation as an action) + to a required precision + ''' + + kwargs = learn_kwargs[alg] + kwargs.update(common_kwargs) + learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) + + env_fn = lambda: BoxIdentityEnv((1,), episode_len=100) + simple_test(env_fn, learn_fn, -0.1) + +if __name__ == '__main__': + test_continuous_identity('a2c') + diff --git a/baselines/common/tests/test_mnist.py b/baselines/common/tests/test_mnist.py new file mode 100644 index 0000000..5489c3a --- /dev/null +++ b/baselines/common/tests/test_mnist.py @@ -0,0 +1,50 @@ +import pytest + +# from baselines.acer import acer_simple as acer +from baselines.common.tests.envs.mnist_env import MnistEnv +from baselines.common.tests.util import simple_test +from baselines.run import get_learn_function + + +# TODO investigate a2c and ppo2 failures - is it due to bad hyperparameters for this problem? 
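Editor's note: for what the identity tests above actually exercise, here is the environment contract in isolation (the size and episode length are illustrative): the observation is itself the correct action, so echoing it back earns a reward of 1.

    from baselines.common.tests.envs.identity_env import DiscreteIdentityEnv

    env = DiscreteIdentityEnv(10, episode_len=100)
    obs = env.reset()                   # an integer in [0, 10)
    _, rew, done, _ = env.step(obs)     # acting with the observation is the "identity" answer
    assert rew == 1 and not done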
+# GitHub issue https://github.com/openai/baselines/issues/189 +common_kwargs = { + 'seed': 0, + 'network':'cnn', + 'gamma':0.9, + 'pad':'SAME' +} + +learn_args = { + 'a2c': dict(total_timesteps=50000), + # TODO need to resolve inference (step) API differences for acer; also slow + # 'acer': dict(seed=0, total_timesteps=1000), + 'deepq': dict(total_timesteps=5000), + 'acktr': dict(total_timesteps=30000), + 'ppo2': dict(total_timesteps=50000, lr=1e-3, nsteps=128, ent_coef=0.0), + 'trpo_mpi': dict(total_timesteps=80000, timesteps_per_batch=100, cg_iters=10, lam=1.0, max_kl=0.001) +} + + +#tests pass, but are too slow on travis. Same algorithms are covered +# by other tests with less compute-hungry nn's and by benchmarks +@pytest.mark.skip +@pytest.mark.slow +@pytest.mark.parametrize("alg", learn_args.keys()) +def test_mnist(alg): + ''' + Test if the algorithm can learn to classify MNIST digits. + Uses CNN policy. + ''' + + learn_kwargs = learn_args[alg] + learn_kwargs.update(common_kwargs) + + learn = get_learn_function(alg) + learn_fn = lambda e: learn(env=e, **learn_kwargs) + env_fn = lambda: MnistEnv(seed=0, episode_len=100) + + simple_test(env_fn, learn_fn, 0.6) + +if __name__ == '__main__': + test_mnist('deepq') diff --git a/baselines/common/tests/test_serialization.py b/baselines/common/tests/test_serialization.py new file mode 100644 index 0000000..ca3d222 --- /dev/null +++ b/baselines/common/tests/test_serialization.py @@ -0,0 +1,97 @@ +import os +import tempfile +import pytest +import tensorflow as tf +import numpy as np + +from baselines.common.tests.envs.mnist_env import MnistEnv +from baselines.common.vec_env.dummy_vec_env import DummyVecEnv +from baselines.run import get_learn_function +from baselines.common.tf_util import make_session, get_session + +from functools import partial + + +learn_kwargs = { + 'deepq': {}, + 'a2c': {}, + 'acktr': {}, + 'ppo2': {'nminibatches': 1, 'nsteps': 10}, + 'trpo_mpi': {}, +} + +network_kwargs = { + 'mlp': {}, + 'cnn': {'pad': 'SAME'}, + 'lstm': {}, + 'cnn_lnlstm': {'pad': 'SAME'} +} + + +@pytest.mark.parametrize("learn_fn", learn_kwargs.keys()) +@pytest.mark.parametrize("network_fn", network_kwargs.keys()) +def test_serialization(learn_fn, network_fn): + ''' + Test if the trained model can be serialized + ''' + + + if network_fn.endswith('lstm') and learn_fn in ['acktr', 'trpo_mpi', 'deepq']: + # TODO make acktr work with recurrent policies + # and test + # github issue: https://github.com/openai/baselines/issues/194 + return + + env = DummyVecEnv([lambda: MnistEnv(10, episode_len=100)]) + ob = env.reset().copy() + learn = get_learn_function(learn_fn) + + kwargs = {} + kwargs.update(network_kwargs[network_fn]) + kwargs.update(learn_kwargs[learn_fn]) + + + learn = partial(learn, env=env, network=network_fn, seed=0, **kwargs) + + with tempfile.TemporaryDirectory() as td: + model_path = os.path.join(td, 'serialization_test_model') + + with tf.Graph().as_default(), make_session().as_default(): + model = learn(total_timesteps=100) + model.save(model_path) + mean1, std1 = _get_action_stats(model, ob) + variables_dict1 = _serialize_variables() + + with tf.Graph().as_default(), make_session().as_default(): + model = learn(total_timesteps=0, load_path=model_path) + mean2, std2 = _get_action_stats(model, ob) + variables_dict2 = _serialize_variables() + + for k, v in variables_dict1.items(): + np.testing.assert_allclose(v, variables_dict2[k], atol=0.01, + err_msg='saved and loaded variable {} value mismatch'.format(k)) + + 
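Editor's note: outside the test harness, the same save/load round trip looks like the sketch below; the path and hyperparameters are illustrative, and each phase uses a fresh graph exactly as test_serialization does above:

    import gym
    import tensorflow as tf
    from baselines.run import get_learn_function
    from baselines.common.tf_util import make_session
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    learn = get_learn_function('ppo2')
    env_fn = lambda: DummyVecEnv([lambda: gym.make('CartPole-v0')])

    with tf.Graph().as_default(), make_session().as_default():
        model = learn(env=env_fn(), network='mlp', total_timesteps=2048, seed=0)
        model.save('/tmp/cartpole_ppo2')   # joblib dump of the trainable variables

    with tf.Graph().as_default(), make_session().as_default():
        model = learn(env=env_fn(), network='mlp', total_timesteps=0, seed=0,
                      load_path='/tmp/cartpole_ppo2')   # rebuild the graph, then restore weights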
np.testing.assert_allclose(mean1, mean2, atol=0.5) + np.testing.assert_allclose(std1, std2, atol=0.5) + + + +def _serialize_variables(): + sess = get_session() + variables = tf.trainable_variables() + values = sess.run(variables) + return {var.name: value for var, value in zip(variables, values)} + + +def _get_action_stats(model, ob): + ntrials = 1000 + if model.initial_state is None or model.initial_state == []: + actions = np.array([model.step(ob)[0] for _ in range(ntrials)]) + else: + actions = np.array([model.step(ob, S=model.initial_state, M=[False])[0] for _ in range(ntrials)]) + + mean = np.mean(actions, axis=0) + std = np.std(actions, axis=0) + + return mean, std + diff --git a/baselines/common/tests/util.py b/baselines/common/tests/util.py new file mode 100644 index 0000000..30b8954 --- /dev/null +++ b/baselines/common/tests/util.py @@ -0,0 +1,91 @@ +import tensorflow as tf +import numpy as np +from gym.spaces import np_random +from baselines.common.vec_env.dummy_vec_env import DummyVecEnv + +N_TRIALS = 10000 +N_EPISODES = 100 + +def simple_test(env_fn, learn_fn, min_reward_fraction, n_trials=N_TRIALS): + np.random.seed(0) + np_random.seed(0) + + env = DummyVecEnv([env_fn]) + + + with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default(): + tf.set_random_seed(0) + + model = learn_fn(env) + + sum_rew = 0 + done = True + + for i in range(n_trials): + if done: + obs = env.reset() + state = model.initial_state + + if state is not None: + a, v, state, _ = model.step(obs, S=state, M=[False]) + else: + a, v, _, _ = model.step(obs) + + obs, rew, done, _ = env.step(a) + sum_rew += float(rew) + + print("Reward in {} trials is {}".format(n_trials, sum_rew)) + assert sum_rew > min_reward_fraction * n_trials, \ + 'sum of rewards {} is less than {} of the total number of trials {}'.format(sum_rew, min_reward_fraction, n_trials) + + + +def reward_per_episode_test(env_fn, learn_fn, min_avg_reward, n_trials=N_EPISODES): + env = DummyVecEnv([env_fn]) + + with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default(): + model = learn_fn(env) + + N_TRIALS = 100 + + observations, actions, rewards = rollout(env, model, N_TRIALS) + rewards = [sum(r) for r in rewards] + + avg_rew = sum(rewards) / N_TRIALS + print("Average reward in {} episodes is {}".format(n_trials, avg_rew)) + assert avg_rew > min_avg_reward, \ + 'average reward in {} episodes ({}) is less than {}'.format(n_trials, avg_rew, min_avg_reward) + +def rollout(env, model, n_trials): + rewards = [] + actions = [] + observations = [] + + for i in range(n_trials): + obs = env.reset() + state = model.initial_state + episode_rew = [] + episode_actions = [] + episode_obs = [] + + while True: + if state is not None: + a, v, state, _ = model.step(obs, S=state, M=[False]) + else: + a,v, _, _ = model.step(obs) + + obs, rew, done, _ = env.step(a) + + episode_rew.append(rew) + episode_actions.append(a) + episode_obs.append(obs) + + if done: + break + + rewards.append(episode_rew) + actions.append(episode_actions) + observations.append(episode_obs) + + return observations, actions, rewards + diff --git a/baselines/common/tf_util.py b/baselines/common/tf_util.py index afcd593..733a15e 100644 --- a/baselines/common/tf_util.py +++ b/baselines/common/tf_util.py @@ -1,3 +1,4 @@ +import joblib import numpy as np import tensorflow as tf # pylint: ignore-module import copy @@ -48,17 +49,28 @@ def huber_loss(x, delta=1.0): # Global session # 
================================================================ -def make_session(num_cpu=None, make_default=False, graph=None): +def get_session(config=None): + """Get default session or create one with a given config""" + sess = tf.get_default_session() + if sess is None: + sess = make_session(config=config, make_default=True) + return sess + +def make_session(config=None, num_cpu=None, make_default=False, graph=None): """Returns a session that will use CPU's only""" if num_cpu is None: num_cpu = int(os.getenv('RCALL_NUM_CPU', multiprocessing.cpu_count())) - tf_config = tf.ConfigProto( - inter_op_parallelism_threads=num_cpu, - intra_op_parallelism_threads=num_cpu) + if config is None: + config = tf.ConfigProto( + allow_soft_placement=True, + inter_op_parallelism_threads=num_cpu, + intra_op_parallelism_threads=num_cpu) + config.gpu_options.allow_growth = True + if make_default: - return tf.InteractiveSession(config=tf_config, graph=graph) + return tf.InteractiveSession(config=config, graph=graph) else: - return tf.Session(config=tf_config, graph=graph) + return tf.Session(config=config, graph=graph) def single_threaded_session(): """Returns a session which will only use a single CPU""" @@ -76,7 +88,7 @@ ALREADY_INITIALIZED = set() def initialize(): """Initialize all the uninitialized variables in the global scope.""" new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED - tf.get_default_session().run(tf.variables_initializer(new_variables)) + get_session().run(tf.variables_initializer(new_variables)) ALREADY_INITIALIZED.update(new_variables) # ================================================================ @@ -85,7 +97,7 @@ def initialize(): def normc_initializer(std=1.0, axis=0): def _initializer(shape, dtype=None, partition_info=None): # pylint: disable=W0613 - out = np.random.randn(*shape).astype(np.float32) + out = np.random.randn(*shape).astype(dtype.as_numpy_dtype) out *= std / np.sqrt(np.square(out).sum(axis=axis, keepdims=True)) return tf.constant(out) return _initializer @@ -179,7 +191,7 @@ class _Function(object): if hasattr(inpt, 'make_feed_dict'): feed_dict.update(inpt.make_feed_dict(value)) else: - feed_dict[inpt] = value + feed_dict[inpt] = adjust_shape(inpt, value) def __call__(self, *args): assert len(args) <= len(self.inputs), "Too many arguments provided" @@ -189,8 +201,8 @@ class _Function(object): self._feed_input(feed_dict, inpt, value) # Update feed dict with givens. 
for inpt in self.givens: - feed_dict[inpt] = feed_dict.get(inpt, self.givens[inpt]) - results = tf.get_default_session().run(self.outputs_update, feed_dict=feed_dict)[:-1] + feed_dict[inpt] = adjust_shape(inpt, feed_dict.get(inpt, self.givens[inpt])) + results = get_session().run(self.outputs_update, feed_dict=feed_dict)[:-1] return results # ================================================================ @@ -243,27 +255,34 @@ class GetFlat(object): def __call__(self): return tf.get_default_session().run(self.op) +def flattenallbut0(x): + return tf.reshape(x, [-1, intprod(x.get_shape().as_list()[1:])]) + +# ============================================================= +# TF placeholders management +# ============================================================ + _PLACEHOLDER_CACHE = {} # name -> (placeholder, dtype, shape) def get_placeholder(name, dtype, shape): if name in _PLACEHOLDER_CACHE: out, dtype1, shape1 = _PLACEHOLDER_CACHE[name] - assert dtype1 == dtype and shape1 == shape - return out - else: - out = tf.placeholder(dtype=dtype, shape=shape, name=name) - _PLACEHOLDER_CACHE[name] = (out, dtype, shape) - return out + if out.graph == tf.get_default_graph(): + assert dtype1 == dtype and shape1 == shape, \ + 'Placeholder with name {} has already been registered and has shape {}, different from requested {}'.format(name, shape1, shape) + return out + + out = tf.placeholder(dtype=dtype, shape=shape, name=name) + _PLACEHOLDER_CACHE[name] = (out, dtype, shape) + return out def get_placeholder_cached(name): return _PLACEHOLDER_CACHE[name][0] -def flattenallbut0(x): - return tf.reshape(x, [-1, intprod(x.get_shape().as_list()[1:])]) # ================================================================ -# Diagnostics +# Diagnostics # ================================================================ def display_var_info(vars): @@ -283,7 +302,7 @@ def display_var_info(vars): def get_available_gpus(): # recipe from here: # https://stackoverflow.com/questions/38559755/how-to-get-current-available-gpus-in-tensorflow?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa - + from tensorflow.python.client import device_lib local_device_protos = device_lib.list_local_devices() return [x.name for x in local_device_protos if x.device_type == 'GPU'] @@ -292,13 +311,95 @@ def get_available_gpus(): # Saving variables # ================================================================ -def load_state(fname): +def load_state(fname, sess=None): + sess = sess or get_session() saver = tf.train.Saver() saver.restore(tf.get_default_session(), fname) -def save_state(fname): +def save_state(fname, sess=None): + sess = sess or get_session() os.makedirs(os.path.dirname(fname), exist_ok=True) saver = tf.train.Saver() saver.save(tf.get_default_session(), fname) +# The methods above and below are clearly doing the same thing, and in a rather similar way +# TODO: ensure there is no subtle differences and remove one + +def save_variables(save_path, variables=None, sess=None): + sess = sess or get_session() + variables = variables or tf.trainable_variables() + + ps = sess.run(variables) + save_dict = {v.name: value for v, value in zip(variables, ps)} + os.makedirs(os.path.dirname(save_path), exist_ok=True) + joblib.dump(save_dict, save_path) + +def load_variables(load_path, variables=None, sess=None): + sess = sess or get_session() + variables = variables or tf.trainable_variables() + + loaded_params = joblib.load(os.path.expanduser(load_path)) + restores = [] + for v in variables: + 
restores.append(v.assign(loaded_params[v.name])) + sess.run(restores) + + +# ================================================================ +# Shape adjustment for feeding into tf placeholders +# ================================================================ +def adjust_shape(placeholder, data): + ''' + adjust shape of the data to the shape of the placeholder if possible. + If shape is incompatible, AssertionError is thrown + + Parameters: + placeholder tensorflow input placeholder + + data input data to be (potentially) reshaped to be fed into placeholder + + Returns: + reshaped data + ''' + + if not isinstance(data, np.ndarray) and not isinstance(data, list): + return data + if isinstance(data, list): + data = np.array(data) + + placeholder_shape = [x or -1 for x in placeholder.shape.as_list()] + + assert _check_shape(placeholder_shape, data.shape), \ + 'Shape of data {} is not compatible with shape of the placeholder {}'.format(data.shape, placeholder_shape) + + return np.reshape(data, placeholder_shape) + + +def _check_shape(placeholder_shape, data_shape): + ''' check if two shapes are compatible (i.e. differ only by dimensions of size 1, or by the batch dimension)''' + + return True + squeezed_placeholder_shape = _squeeze_shape(placeholder_shape) + squeezed_data_shape = _squeeze_shape(data_shape) + + for i, s_data in enumerate(squeezed_data_shape): + s_placeholder = squeezed_placeholder_shape[i] + if s_placeholder != -1 and s_data != s_placeholder: + return False + + return True + + +def _squeeze_shape(shape): + return [x for x in shape if x != 1] + +# Tensorboard interfacing +# ================================================================ + +def launch_tensorboard_in_background(log_dir): + from tensorboard import main as tb + import threading + tf.flags.FLAGS.logdir = log_dir + t = threading.Thread(target=tb.main, args=([])) + t.start() diff --git a/baselines/common/vec_env/dummy_vec_env.py b/baselines/common/vec_env/dummy_vec_env.py index d0ae455..477bf30 100644 --- a/baselines/common/vec_env/dummy_vec_env.py +++ b/baselines/common/vec_env/dummy_vec_env.py @@ -30,15 +30,30 @@ class DummyVecEnv(VecEnv): self.actions = None def step_async(self, actions): - self.actions = actions + listify = True + try: + if len(actions) == self.num_envs: + listify = False + except TypeError: + pass + + if not listify: + self.actions = actions + else: + assert self.num_envs == 1, "actions {} is either not a list or has a wrong size - cannot match to {} environments".format(actions, self.num_envs) + self.actions = [actions] def step_wait(self): for e in range(self.num_envs): - obs, self.buf_rews[e], self.buf_dones[e], self.buf_infos[e] = self.envs[e].step(self.actions[e]) + action = self.actions[e] + if isinstance(self.envs[e].action_space, spaces.Discrete): + action = int(action) + + obs, self.buf_rews[e], self.buf_dones[e], self.buf_infos[e] = self.envs[e].step(action) if self.buf_dones[e]: obs = self.envs[e].reset() self._save_obs(e, obs) - return (self._obs_from_buf(), np.copy(self.buf_rews), np.copy(self.buf_dones), + return (np.copy(self._obs_from_buf()), np.copy(self.buf_rews), np.copy(self.buf_dones), self.buf_infos.copy()) def reset(self): diff --git a/baselines/common/vec_env/subproc_vec_env.py b/baselines/common/vec_env/subproc_vec_env.py index fb55df4..e5b5b32 100644 --- a/baselines/common/vec_env/subproc_vec_env.py +++ b/baselines/common/vec_env/subproc_vec_env.py @@ -7,26 +7,30 @@ from baselines.common.tile_images import tile_images def worker(remote, parent_remote, env_fn_wrapper): 
parent_remote.close() env = env_fn_wrapper.x() - while True: - cmd, data = remote.recv() - if cmd == 'step': - ob, reward, done, info = env.step(data) - if done: + try: + while True: + cmd, data = remote.recv() + if cmd == 'step': + ob, reward, done, info = env.step(data) + if done: + ob = env.reset() + remote.send((ob, reward, done, info)) + elif cmd == 'reset': ob = env.reset() - remote.send((ob, reward, done, info)) - elif cmd == 'reset': - ob = env.reset() - remote.send(ob) - elif cmd == 'render': - remote.send(env.render(mode='rgb_array')) - elif cmd == 'close': - remote.close() - break - elif cmd == 'get_spaces': - remote.send((env.observation_space, env.action_space)) - else: - raise NotImplementedError - + remote.send(ob) + elif cmd == 'render': + remote.send(env.render(mode='rgb_array')) + elif cmd == 'close': + remote.close() + break + elif cmd == 'get_spaces': + remote.send((env.observation_space, env.action_space)) + else: + raise NotImplementedError + except KeyboardInterrupt: + print('SubprocVecEnv worker: got KeyboardInterrupt') + finally: + env.close() class SubprocVecEnv(VecEnv): def __init__(self, env_fns, spaces=None): diff --git a/baselines/common/vec_env/vec_normalize.py b/baselines/common/vec_env/vec_normalize.py index dda767d..5d5c5ad 100644 --- a/baselines/common/vec_env/vec_normalize.py +++ b/baselines/common/vec_env/vec_normalize.py @@ -10,6 +10,8 @@ class VecNormalize(VecEnvWrapper): VecEnvWrapper.__init__(self, venv) self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None self.ret_rms = RunningMeanStd(shape=()) if ret else None + #self.ob_rms = TfRunningMeanStd(shape=self.observation_space.shape, scope='observation_running_mean_std') if ob else None + #self.ret_rms = TfRunningMeanStd(shape=(), scope='return_running_mean_std') if ret else None self.clipob = clipob self.cliprew = cliprew self.ret = np.zeros(self.num_envs) diff --git a/baselines/ddpg/ddpg.py b/baselines/ddpg/ddpg.py index e2d4950..6664cc4 100644 --- a/baselines/ddpg/ddpg.py +++ b/baselines/ddpg/ddpg.py @@ -26,9 +26,9 @@ def reduce_std(x, axis=None, keepdims=False): return tf.sqrt(reduce_var(x, axis=axis, keepdims=keepdims)) def reduce_var(x, axis=None, keepdims=False): - m = tf.reduce_mean(x, axis=axis, keep_dims=True) + m = tf.reduce_mean(x, axis=axis, keepdims=True) devs_squared = tf.square(x - m) - return tf.reduce_mean(devs_squared, axis=axis, keep_dims=keepdims) + return tf.reduce_mean(devs_squared, axis=axis, keepdims=keepdims) def get_target_updates(vars, target_vars, tau): logger.info('setting up target updates ...') diff --git a/baselines/deepq/__init__.py b/baselines/deepq/__init__.py index 4472399..6859c05 100644 --- a/baselines/deepq/__init__.py +++ b/baselines/deepq/__init__.py @@ -1,8 +1,8 @@ from baselines.deepq import models # noqa from baselines.deepq.build_graph import build_act, build_train # noqa -from baselines.deepq.simple import learn, load # noqa +from baselines.deepq.deepq import learn, load_act # noqa from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer # noqa def wrap_atari_dqn(env): from baselines.common.atari_wrappers import wrap_deepmind - return wrap_deepmind(env, frame_stack=True, scale=True) \ No newline at end of file + return wrap_deepmind(env, frame_stack=True, scale=True) diff --git a/baselines/deepq/simple.py b/baselines/deepq/deepq.py similarity index 90% rename from baselines/deepq/simple.py rename to baselines/deepq/deepq.py index 4bad145..7d44acf 100644 --- a/baselines/deepq/simple.py +++ 
b/baselines/deepq/deepq.py @@ -10,20 +10,24 @@ import baselines.common.tf_util as U from baselines.common.tf_util import load_state, save_state from baselines import logger from baselines.common.schedules import LinearSchedule -from baselines.common.input import observation_input +from baselines.common import set_global_seeds from baselines import deepq from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer from baselines.deepq.utils import ObservationInput +from baselines.common.tf_util import get_session +from baselines.deepq.models import build_q_func + class ActWrapper(object): def __init__(self, act, act_params): self._act = act self._act_params = act_params + self.initial_state = None @staticmethod - def load(path): + def load_act(self, path): with open(path, "rb") as f: model_data, act_params = cloudpickle.load(f) act = deepq.build_act(**act_params) @@ -42,7 +46,10 @@ class ActWrapper(object): def __call__(self, *args, **kwargs): return self._act(*args, **kwargs) - def save(self, path=None): + def step(self, observation, **kwargs): + return self._act([observation], **kwargs), None, None, None + + def save_act(self, path=None): """Save model to a pickle located at `path`""" if path is None: path = os.path.join(logger.get_dir(), "model.pkl") @@ -61,8 +68,11 @@ class ActWrapper(object): with open(path, "wb") as f: cloudpickle.dump((model_data, self._act_params), f) + def save(self, path): + save_state(path) -def load(path): + +def load_act(path): """Load act function that was returned by learn function. Parameters @@ -76,13 +86,14 @@ def load(path): function that takes a batch of observations and returns actions. """ - return ActWrapper.load(path) + return ActWrapper.load_act(path) def learn(env, - q_func, + network, + seed=None, lr=5e-4, - max_timesteps=100000, + total_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, @@ -100,7 +111,10 @@ def learn(env, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, - callback=None): + callback=None, + load_path=None, + **network_kwargs + ): """Train a deepq model. Parameters @@ -119,7 +133,7 @@ def learn(env, and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer - max_timesteps: int + total_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer @@ -153,12 +167,16 @@ def learn(env, initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value - to 1.0. If set to None equals to max_timesteps. + to 1.0. If set to None equals to total_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. + load_path: str + path to load the model from. (default: None) + **network_kwargs + additional keyword arguments to pass to the network builder. 
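# An illustrative call (not part of the patch) of the refactored learn() documented
# above: the Q-network is now chosen by name ('mlp', 'cnn', 'conv_only', ...) instead
# of passing a q_func, and total_timesteps replaces max_timesteps. The CartPole env and
# the hyperparameters below are examples, not values prescribed by this change.
import gym
from baselines import deepq

env = gym.make('CartPole-v0')
act = deepq.learn(env, network='mlp', seed=0, lr=1e-3, total_timesteps=100000,
                  exploration_fraction=0.1, exploration_final_eps=0.02, print_freq=10)
act.save('cartpole_model')        # checkpoints TF variables (save_state)
act.save_act('cartpole_act.pkl')  # pickles the act function itself (ActWrapper.save_act)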
Returns ------- @@ -168,8 +186,10 @@ def learn(env, """ # Create all the functions necessary to train the model - sess = tf.Session() - sess.__enter__() + sess = get_session() + set_global_seeds(seed) + + q_func = build_q_func(network, **network_kwargs) # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph @@ -194,12 +214,12 @@ def learn(env, } act = ActWrapper(act, act_params) - + # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: - prioritized_replay_beta_iters = max_timesteps + prioritized_replay_beta_iters = total_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) @@ -207,7 +227,7 @@ def learn(env, replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. - exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), + exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps), initial_p=1.0, final_p=exploration_final_eps) @@ -225,12 +245,17 @@ def learn(env, model_file = os.path.join(td, "model") model_saved = False + if tf.train.latest_checkpoint(td) is not None: load_state(model_file) logger.log('Loaded model from {}'.format(model_file)) model_saved = True + elif load_path is not None: + load_state(load_path) + logger.log('Loaded model from {}'.format(load_path)) + - for t in range(max_timesteps): + for t in range(total_timesteps): if callback is not None: if callback(locals(), globals()): break diff --git a/baselines/deepq/defaults.py b/baselines/deepq/defaults.py new file mode 100644 index 0000000..d41fb18 --- /dev/null +++ b/baselines/deepq/defaults.py @@ -0,0 +1,21 @@ +def atari(): + return dict( + network='conv_only', + lr=1e-4, + buffer_size=10000, + exploration_fraction=0.1, + exploration_final_eps=0.01, + train_freq=4, + learning_starts=10000, + target_network_update_freq=1000, + gamma=0.99, + prioritized_replay=True, + prioritized_replay_alpha=0.6, + checkpoint_freq=10000, + checkpoint_path=None, + dueling=True + ) + +def retro(): + return atari() + diff --git a/baselines/deepq/experiments/enjoy_retro.py b/baselines/deepq/experiments/enjoy_retro.py new file mode 100644 index 0000000..526af16 --- /dev/null +++ b/baselines/deepq/experiments/enjoy_retro.py @@ -0,0 +1,34 @@ +import argparse + +import numpy as np + +from baselines import deepq +from baselines.common import retro_wrappers + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--env', help='environment ID', default='SuperMarioBros-Nes') + parser.add_argument('--gamestate', help='game state to load', default='Level1-1') + parser.add_argument('--model', help='model pickle file from ActWrapper.save', default='model.pkl') + args = parser.parse_args() + + env = retro_wrappers.make_retro(game=args.env, state=args.gamestate, max_episode_steps=None) + env = retro_wrappers.wrap_deepmind_retro(env) + act = deepq.load(args.model) + + while True: + obs, done = env.reset(), False + episode_rew = 0 + while not done: + env.render() + action = act(obs[None])[0] + env_action = np.zeros(env.action_space.n) + env_action[action] = 1 + obs, rew, done, _ = env.step(env_action) + episode_rew += rew + print('Episode reward', episode_rew) + + +if __name__ == '__main__': + main() diff --git a/baselines/deepq/experiments/run_retro.py 
b/baselines/deepq/experiments/run_retro.py new file mode 100644 index 0000000..0338361 --- /dev/null +++ b/baselines/deepq/experiments/run_retro.py @@ -0,0 +1,49 @@ +import argparse + +from baselines import deepq +from baselines.common import set_global_seeds +from baselines import bench +from baselines import logger +from baselines.common import retro_wrappers +import retro + + +def main(): + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('--env', help='environment ID', default='SuperMarioBros-Nes') + parser.add_argument('--gamestate', help='game state to load', default='Level1-1') + parser.add_argument('--seed', help='seed', type=int, default=0) + parser.add_argument('--num-timesteps', type=int, default=int(10e6)) + args = parser.parse_args() + logger.configure() + set_global_seeds(args.seed) + env = retro_wrappers.make_retro(game=args.env, state=args.gamestate, max_episode_steps=10000, use_restricted_actions=retro.Actions.DISCRETE) + env.seed(args.seed) + env = bench.Monitor(env, logger.get_dir()) + env = retro_wrappers.wrap_deepmind_retro(env) + + model = deepq.models.cnn_to_mlp( + convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], + hiddens=[256], + dueling=True + ) + act = deepq.learn( + env, + q_func=model, + lr=1e-4, + max_timesteps=args.num_timesteps, + buffer_size=10000, + exploration_fraction=0.1, + exploration_final_eps=0.01, + train_freq=4, + learning_starts=10000, + target_network_update_freq=1000, + gamma=0.99, + prioritized_replay=True + ) + act.save() + env.close() + + +if __name__ == '__main__': + main() diff --git a/baselines/deepq/models.py b/baselines/deepq/models.py index 198d795..c41b707 100644 --- a/baselines/deepq/models.py +++ b/baselines/deepq/models.py @@ -89,3 +89,41 @@ def cnn_to_mlp(convs, hiddens, dueling=False, layer_norm=False): return lambda *args, **kwargs: _cnn_to_mlp(convs, hiddens, dueling, layer_norm=layer_norm, *args, **kwargs) + + +def build_q_func(network, hiddens=[256], dueling=True, layer_norm=False, **network_kwargs): + if isinstance(network, str): + from baselines.common.models import get_network_builder + network = get_network_builder(network)(**network_kwargs) + + def q_func_builder(input_placeholder, num_actions, scope, reuse=False): + with tf.variable_scope(scope, reuse=reuse): + latent, _ = network(input_placeholder) + latent = layers.flatten(latent) + + with tf.variable_scope("action_value"): + action_out = latent + for hidden in hiddens: + action_out = layers.fully_connected(action_out, num_outputs=hidden, activation_fn=None) + if layer_norm: + action_out = layers.layer_norm(action_out, center=True, scale=True) + action_out = tf.nn.relu(action_out) + action_scores = layers.fully_connected(action_out, num_outputs=num_actions, activation_fn=None) + + if dueling: + with tf.variable_scope("state_value"): + state_out = latent + for hidden in hiddens: + state_out = layers.fully_connected(state_out, num_outputs=hidden, activation_fn=None) + if layer_norm: + state_out = layers.layer_norm(state_out, center=True, scale=True) + state_out = tf.nn.relu(state_out) + state_score = layers.fully_connected(state_out, num_outputs=1, activation_fn=None) + action_scores_mean = tf.reduce_mean(action_scores, 1) + action_scores_centered = action_scores - tf.expand_dims(action_scores_mean, 1) + q_out = state_score + action_scores_centered + else: + q_out = action_scores + return q_out + + return q_func_builder diff --git a/baselines/deepq/test_identity.py b/baselines/deepq/test_identity.py deleted file 
mode 100644 index ef57e70..0000000 --- a/baselines/deepq/test_identity.py +++ /dev/null @@ -1,43 +0,0 @@ -import tensorflow as tf -import random - -from baselines import deepq -from baselines.common.identity_env import IdentityEnv - - -def test_identity(): - - with tf.Graph().as_default(): - env = IdentityEnv(10) - random.seed(0) - - tf.set_random_seed(0) - - param_noise = False - model = deepq.models.mlp([32]) - act = deepq.learn( - env, - q_func=model, - lr=1e-3, - max_timesteps=10000, - buffer_size=50000, - exploration_fraction=0.1, - exploration_final_eps=0.02, - print_freq=10, - param_noise=param_noise, - ) - - tf.set_random_seed(0) - - N_TRIALS = 1000 - sum_rew = 0 - obs = env.reset() - for i in range(N_TRIALS): - obs, rew, done, _ = env.step(act([obs])) - sum_rew += rew - - assert sum_rew > 0.9 * N_TRIALS - - -if __name__ == '__main__': - test_identity() diff --git a/baselines/deepq/utils.py b/baselines/deepq/utils.py index 90b932e..2914f43 100644 --- a/baselines/deepq/utils.py +++ b/baselines/deepq/utils.py @@ -1,4 +1,5 @@ from baselines.common.input import observation_input +from baselines.common.tf_util import adjust_shape import tensorflow as tf @@ -36,7 +37,7 @@ class PlaceholderTfInput(TfInput): return self._placeholder def make_feed_dict(self, data): - return {self._placeholder: data} + return {self._placeholder: adjust_shape(self._placeholder, data)} class Uint8Input(PlaceholderTfInput): diff --git a/baselines/ppo1/run_atari.py b/baselines/ppo1/run_atari.py index 17941c6..96e3482 100644 --- a/baselines/ppo1/run_atari.py +++ b/baselines/ppo1/run_atari.py @@ -18,7 +18,7 @@ def train(env_id, num_timesteps, seed): logger.configure() else: logger.configure(format_strs=[]) - workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() + workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() if seed is not None else None set_global_seeds(workerseed) env = make_atari(env_id) def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613 diff --git a/baselines/ppo2/defaults.py b/baselines/ppo2/defaults.py new file mode 100644 index 0000000..447a68d --- /dev/null +++ b/baselines/ppo2/defaults.py @@ -0,0 +1,22 @@ +def mujoco(): + return dict( + nsteps=2048, + nminibatches=32, + lam=0.95, + gamma=0.99, + noptepochs=10, + log_interval=1, + ent_coef=0.0, + lr=lambda f: 3e-4 * f, + cliprange=0.2, + value_network='copy' + ) + +def atari(): + return dict( + nsteps=128, nminibatches=4, + lam=0.95, gamma=0.99, noptepochs=4, log_interval=1, + ent_coef=.01, + lr=lambda f : f * 2.5e-4, + cliprange=lambda f : f * 0.1, + ) diff --git a/baselines/ppo2/policies.py b/baselines/ppo2/policies.py deleted file mode 100644 index 6fbbb14..0000000 --- a/baselines/ppo2/policies.py +++ /dev/null @@ -1,146 +0,0 @@ -import numpy as np -import tensorflow as tf -from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm -from baselines.common.distributions import make_pdtype -from baselines.common.input import observation_input - -def nature_cnn(unscaled_images, **conv_kwargs): - """ - CNN from Nature paper. - """ - scaled_images = tf.cast(unscaled_images, tf.float32) / 255. 
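# A sketch (not part of the patch) of how the ppo2/defaults.py entries above are
# consumed: baselines/run.py (added later in this patch) calls the function named
# after the env type and feeds the resulting dict into learn(); command-line extra
# args can then override individual values. The override below is hypothetical.
from baselines.run import get_learn_function, get_learn_function_defaults

kwargs = get_learn_function_defaults('ppo2', 'atari')  # {'nsteps': 128, 'nminibatches': 4, ...}
kwargs['ent_coef'] = 0.001                             # hypothetical override
ppo2_learn = get_learn_function('ppo2')
# with a frame-stacked Atari VecEnv `env` in hand:
#     model = ppo2_learn(env=env, total_timesteps=int(1e6), **kwargs)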
- activ = tf.nn.relu - h = activ(conv(scaled_images, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2), - **conv_kwargs)) - h2 = activ(conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), **conv_kwargs)) - h3 = activ(conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), **conv_kwargs)) - h3 = conv_to_fc(h3) - return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))) - -class LnLstmPolicy(object): - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False): - nenv = nbatch // nsteps - X, processed_x = observation_input(ob_space, nbatch) - M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) - S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states - self.pdtype = make_pdtype(ac_space) - with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(processed_x) - xs = batch_to_seq(h, nenv, nsteps) - ms = batch_to_seq(M, nenv, nsteps) - h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm) - h5 = seq_to_batch(h5) - vf = fc(h5, 'v', 1) - self.pd, self.pi = self.pdtype.pdfromlatent(h5) - - v0 = vf[:, 0] - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32) - - def step(ob, state, mask): - return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask}) - - def value(ob, state, mask): - return sess.run(v0, {X:ob, S:state, M:mask}) - - self.X = X - self.M = M - self.S = S - self.vf = vf - self.step = step - self.value = value - -class LstmPolicy(object): - - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False): - nenv = nbatch // nsteps - self.pdtype = make_pdtype(ac_space) - X, processed_x = observation_input(ob_space, nbatch) - - M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) - S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states - with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(X) - xs = batch_to_seq(h, nenv, nsteps) - ms = batch_to_seq(M, nenv, nsteps) - h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm) - h5 = seq_to_batch(h5) - vf = fc(h5, 'v', 1) - self.pd, self.pi = self.pdtype.pdfromlatent(h5) - - v0 = vf[:, 0] - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32) - - def step(ob, state, mask): - return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask}) - - def value(ob, state, mask): - return sess.run(v0, {X:ob, S:state, M:mask}) - - self.X = X - self.M = M - self.S = S - self.vf = vf - self.step = step - self.value = value - -class CnnPolicy(object): - - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613 - self.pdtype = make_pdtype(ac_space) - X, processed_x = observation_input(ob_space, nbatch) - with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(processed_x, **conv_kwargs) - vf = fc(h, 'v', 1)[:,0] - self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01) - - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = None - - def step(ob, *_args, **_kwargs): - a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob}) - return a, v, self.initial_state, neglogp - - def value(ob, *_args, **_kwargs): - return sess.run(vf, {X:ob}) - - self.X = X - self.vf = vf - self.step = step - self.value = value - -class MlpPolicy(object): - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613 - self.pdtype = make_pdtype(ac_space) - with tf.variable_scope("model", reuse=reuse): - X, processed_x = 
observation_input(ob_space, nbatch) - activ = tf.tanh - processed_x = tf.layers.flatten(processed_x) - pi_h1 = activ(fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2))) - pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2))) - vf_h1 = activ(fc(processed_x, 'vf_fc1', nh=64, init_scale=np.sqrt(2))) - vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2))) - vf = fc(vf_h2, 'vf', 1)[:,0] - - self.pd, self.pi = self.pdtype.pdfromlatent(pi_h2, init_scale=0.01) - - - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = None - - def step(ob, *_args, **_kwargs): - a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob}) - return a, v, self.initial_state, neglogp - - def value(ob, *_args, **_kwargs): - return sess.run(vf, {X:ob}) - - self.X = X - self.vf = vf - self.step = step - self.value = value diff --git a/baselines/ppo2/ppo2.py b/baselines/ppo2/ppo2.py index fd34f52..3e14f26 100644 --- a/baselines/ppo2/ppo2.py +++ b/baselines/ppo2/ppo2.py @@ -1,21 +1,29 @@ import os import time -import joblib +import functools import numpy as np import os.path as osp import tensorflow as tf from baselines import logger from collections import deque -from baselines.common import explained_variance +from baselines.common import explained_variance, set_global_seeds +from baselines.common.policies import build_policy from baselines.common.runners import AbstractEnvRunner +from baselines.common.tf_util import get_session, save_variables, load_variables +from baselines.common.mpi_adam_optimizer import MpiAdamOptimizer + +from mpi4py import MPI +from baselines.common.tf_util import initialize +from baselines.common.mpi_util import sync_from_root class Model(object): def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm): - sess = tf.get_default_session() + sess = get_session() - act_model = policy(sess, ob_space, ac_space, nbatch_act, 1, reuse=False) - train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps, reuse=True) + with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): + act_model = policy(nbatch_act, 1, sess) + train_model = policy(nbatch_train, nsteps, sess) A = train_model.pdtype.sample_placeholder([None]) ADV = tf.placeholder(tf.float32, [None]) @@ -40,14 +48,16 @@ class Model(object): approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef - with tf.variable_scope('model'): - params = tf.trainable_variables() - grads = tf.gradients(loss, params) + params = tf.trainable_variables('ppo2_model') + trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) + grads_and_var = trainer.compute_gradients(loss, params) + grads, var = zip(*grads_and_var) + if max_grad_norm is not None: grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) - grads = list(zip(grads, params)) - trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) - _train = trainer.apply_gradients(grads) + grads_and_var = list(zip(grads, var)) + + _train = trainer.apply_gradients(grads_and_var) def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None): advs = returns - values @@ -63,17 +73,6 @@ class Model(object): )[:-1] self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac'] - def save(save_path): - ps = sess.run(params) - joblib.dump(ps, save_path) - - def load(load_path): - 
loaded_params = joblib.load(load_path) - restores = [] - for p, loaded_p in zip(params, loaded_params): - restores.append(p.assign(loaded_p)) - sess.run(restores) - # If you want to load weights, also save/load observation scaling inside VecNormalize self.train = train self.train_model = train_model @@ -81,9 +80,14 @@ class Model(object): self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state - self.save = save - self.load = load - tf.global_variables_initializer().run(session=sess) #pylint: disable=E1101 + + self.save = functools.partial(save_variables, sess=sess) + self.load = functools.partial(load_variables, sess=sess) + + if MPI.COMM_WORLD.Get_rank() == 0: + initialize() + global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") + sync_from_root(sess, global_variables) #pylint: disable=E1101 class Runner(AbstractEnvRunner): @@ -97,7 +101,7 @@ class Runner(AbstractEnvRunner): mb_states = self.states epinfos = [] for _ in range(self.nsteps): - actions, values, self.states, neglogpacs = self.model.step(self.obs, self.states, self.dones) + actions, values, self.states, neglogpacs = self.model.step(self.obs, S=self.states, M=self.dones) mb_obs.append(self.obs.copy()) mb_actions.append(actions) mb_values.append(values) @@ -115,7 +119,7 @@ class Runner(AbstractEnvRunner): mb_values = np.asarray(mb_values, dtype=np.float32) mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32) mb_dones = np.asarray(mb_dones, dtype=np.bool) - last_values = self.model.value(self.obs, self.states, self.dones) + last_values = self.model.value(self.obs, S=self.states, M=self.dones) #discount/bootstrap off value fn mb_returns = np.zeros_like(mb_rewards) mb_advs = np.zeros_like(mb_rewards) @@ -145,10 +149,65 @@ def constfn(val): return val return f -def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr, +def learn(*, network, env, total_timesteps, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, - save_interval=0, load_path=None): + save_interval=0, load_path=None, **network_kwargs): + ''' + Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347) + + Parameters: + ---------- + + network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) + specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns + tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward + neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. + See baselines.common/policies.py/lstm for more details on using recurrent nets in policies + + env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation. + The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. + + + nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where + nenv is number of environment copies simulated in parallel) + + total_timesteps: int number of timesteps (i.e. 
number of actions taken in the environment) + + ent_coef: float policy entropy coefficient in the optimization objective + + lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the + training and 0 is the end of the training. + + vf_coef: float value function loss coefficient in the optimization objective + + max_grad_norm: float or None gradient norm clipping coefficient + + gamma: float discounting factor + + lam: float advantage estimation discounting factor (lambda in the paper) + + log_interval: int number of timesteps between logging events + + nminibatches: int number of training minibatches per update + + noptepochs: int number of training epochs per update + + cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training + and 0 is the end of the training + + save_interval: int number of timesteps between saving events + + load_path: str path to load the model from + + **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network + For instance, 'mlp' network architecture has arguments num_hidden and num_layers. + + + + ''' + + set_global_seeds(seed) if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) @@ -156,6 +215,8 @@ def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr, else: assert callable(cliprange) total_timesteps = int(total_timesteps) + policy = build_policy(env, network, **network_kwargs) + nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space @@ -180,7 +241,6 @@ def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr, nupdates = total_timesteps//nbatch for update in range(1, nupdates+1): assert nbatch % nminibatches == 0 - nbatch_train = nbatch // nminibatches tstart = time.time() frac = 1.0 - (update - 1.0) / nupdates lrnow = lr(frac) @@ -228,8 +288,9 @@ def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr, logger.logkv('time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv(lossname, lossval) - logger.dumpkvs() - if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir(): + if MPI.COMM_WORLD.Get_rank() == 0: + logger.dumpkvs() + if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and MPI.COMM_WORLD.Get_rank() == 0: checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i'%update) @@ -240,3 +301,6 @@ def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr, def safemean(xs): return np.nan if len(xs) == 0 else np.mean(xs) + + + diff --git a/baselines/ppo2/run_atari.py b/baselines/ppo2/run_atari.py deleted file mode 100644 index 322837a..0000000 --- a/baselines/ppo2/run_atari.py +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env python3 -import sys -from baselines import logger -from baselines.common.cmd_util import make_atari_env, atari_arg_parser -from baselines.common.vec_env.vec_frame_stack import VecFrameStack -from baselines.ppo2 import ppo2 -from baselines.ppo2.policies import CnnPolicy, LstmPolicy, LnLstmPolicy, MlpPolicy -import multiprocessing -import tensorflow as tf - - -def train(env_id, num_timesteps, seed, policy): - - ncpu = multiprocessing.cpu_count() - if sys.platform == 'darwin': ncpu //= 2 - config = tf.ConfigProto(allow_soft_placement=True, - intra_op_parallelism_threads=ncpu, - 
inter_op_parallelism_threads=ncpu) - config.gpu_options.allow_growth = True #pylint: disable=E1101 - tf.Session(config=config).__enter__() - - env = VecFrameStack(make_atari_env(env_id, 8, seed), 4) - policy = {'cnn' : CnnPolicy, 'lstm' : LstmPolicy, 'lnlstm' : LnLstmPolicy, 'mlp': MlpPolicy}[policy] - ppo2.learn(policy=policy, env=env, nsteps=128, nminibatches=4, - lam=0.95, gamma=0.99, noptepochs=4, log_interval=1, - ent_coef=.01, - lr=lambda f : f * 2.5e-4, - cliprange=lambda f : f * 0.1, - total_timesteps=int(num_timesteps * 1.1)) - -def main(): - parser = atari_arg_parser() - parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm', 'mlp'], default='cnn') - args = parser.parse_args() - logger.configure() - train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, - policy=args.policy) - -if __name__ == '__main__': - main() diff --git a/baselines/ppo2/run_mujoco.py b/baselines/ppo2/run_mujoco.py deleted file mode 100644 index 282aa3f..0000000 --- a/baselines/ppo2/run_mujoco.py +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env python3 -import numpy as np -from baselines.common.cmd_util import mujoco_arg_parser -from baselines import bench, logger - - -def train(env_id, num_timesteps, seed): - from baselines.common import set_global_seeds - from baselines.common.vec_env.vec_normalize import VecNormalize - from baselines.ppo2 import ppo2 - from baselines.ppo2.policies import MlpPolicy - import gym - import tensorflow as tf - from baselines.common.vec_env.dummy_vec_env import DummyVecEnv - ncpu = 1 - config = tf.ConfigProto(allow_soft_placement=True, - intra_op_parallelism_threads=ncpu, - inter_op_parallelism_threads=ncpu) - tf.Session(config=config).__enter__() - - def make_env(): - env = gym.make(env_id) - env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True) - return env - - env = DummyVecEnv([make_env]) - env = VecNormalize(env) - - set_global_seeds(seed) - policy = MlpPolicy - model = ppo2.learn(policy=policy, env=env, nsteps=2048, nminibatches=32, - lam=0.95, gamma=0.99, noptepochs=10, log_interval=1, - ent_coef=0.0, - lr=3e-4, - cliprange=0.2, - total_timesteps=num_timesteps) - - return model, env - - -def main(): - args = mujoco_arg_parser().parse_args() - logger.configure() - model, env = train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) - - if args.play: - logger.log("Running trained model") - obs = np.zeros((env.num_envs,) + env.observation_space.shape) - obs[:] = env.reset() - while True: - actions = model.step(obs)[0] - obs[:] = env.step(actions)[0] - env.render() - - -if __name__ == '__main__': - main() diff --git a/baselines/run.py b/baselines/run.py new file mode 100644 index 0000000..cba8515 --- /dev/null +++ b/baselines/run.py @@ -0,0 +1,230 @@ +import sys +import multiprocessing +import os +import os.path as osp +import gym +from collections import defaultdict +import tensorflow as tf + +from baselines.common.vec_env.vec_frame_stack import VecFrameStack +from baselines.common.cmd_util import common_arg_parser, parse_unknown_args, make_mujoco_env, make_atari_env +from baselines.common.tf_util import save_state, load_state, get_session +from baselines import bench, logger +from importlib import import_module + +from baselines.common.vec_env.vec_normalize import VecNormalize +from baselines.common.vec_env.dummy_vec_env import DummyVecEnv +from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv +from baselines.common import atari_wrappers, retro_wrappers + +try: + from mpi4py import MPI +except 
ImportError: + MPI = None + +_game_envs = defaultdict(set) +for env in gym.envs.registry.all(): + # solve this with regexes + env_type = env._entry_point.split(':')[0].split('.')[-1] + _game_envs[env_type].add(env.id) + +# reading benchmark names directly from retro requires +# importing retro here, and for some reason that crashes tensorflow +# in ubuntu +_game_envs['retro'] = set([ + 'BubbleBobble-Nes', + 'SuperMarioBros-Nes', + 'TwinBee3PokoPokoDaimaou-Nes', + 'SpaceHarrier-Nes', + 'SonicTheHedgehog-Genesis', + 'Vectorman-Genesis', + 'FinalFight-Snes', + 'SpaceInvaders-Snes', +]) + + +def train(args, extra_args): + env_type, env_id = get_env_type(args.env) + + total_timesteps = int(args.num_timesteps) + seed = args.seed + + learn = get_learn_function(args.alg) + alg_kwargs = get_learn_function_defaults(args.alg, env_type) + alg_kwargs.update(extra_args) + + env = build_env(args) + + if args.network: + alg_kwargs['network'] = args.network + else: + if alg_kwargs.get('network') is None: + alg_kwargs['network'] = get_default_network(env_type) + + + + print('Training {} on {}:{} with arguments \n{}'.format(args.alg, env_type, env_id, alg_kwargs)) + + model = learn( + env=env, + seed=seed, + total_timesteps=total_timesteps, + **alg_kwargs + ) + + return model, env + + +def build_env(args, render=False): + ncpu = multiprocessing.cpu_count() + if sys.platform == 'darwin': ncpu //= 2 + nenv = args.num_env or ncpu if not render else 1 + alg = args.alg + rank = MPI.COMM_WORLD.Get_rank() if MPI else 0 + seed = args.seed + + env_type, env_id = get_env_type(args.env) + if env_type == 'mujoco': + get_session(tf.ConfigProto(allow_soft_placement=True, + intra_op_parallelism_threads=1, + inter_op_parallelism_threads=1)) + + if args.num_env: + env = SubprocVecEnv([lambda: make_mujoco_env(env_id, seed + i if seed is not None else None, args.reward_scale) for i in range(args.num_env)]) + else: + env = DummyVecEnv([lambda: make_mujoco_env(env_id, seed, args.reward_scale)]) + + env = VecNormalize(env) + + elif env_type == 'atari': + if alg == 'acer': + env = make_atari_env(env_id, nenv, seed) + elif alg == 'deepq': + env = atari_wrappers.make_atari(env_id) + env.seed(seed) + env = bench.Monitor(env, logger.get_dir()) + env = atari_wrappers.wrap_deepmind(env, frame_stack=True, scale=True) + elif alg == 'trpo_mpi': + env = atari_wrappers.make_atari(env_id) + env.seed(seed) + env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank))) + env = atari_wrappers.wrap_deepmind(env) + # TODO check if the second seeding is necessary, and eventually remove + env.seed(seed) + else: + frame_stack_size = 4 + env = VecFrameStack(make_atari_env(env_id, nenv, seed), frame_stack_size) + + elif env_type == 'retro': + import retro + gamestate = args.gamestate or 'Level1-1' + env = retro_wrappers.make_retro(game=args.env, state=gamestate, max_episode_steps=10000, use_restricted_actions=retro.Actions.DISCRETE) + env.seed(args.seed) + env = bench.Monitor(env, logger.get_dir()) + env = retro_wrappers.wrap_deepmind_retro(env) + + elif env_type == 'classic': + def make_env(): + e = gym.make(env_id) + e.seed(seed) + return e + + env = DummyVecEnv([make_env]) + + return env + + +def get_env_type(env_id): + if env_id in _game_envs.keys(): + env_type = env_id + env_id = [g for g in _game_envs[env_type]][0] + else: + env_type = None + for g, e in _game_envs.items(): + if env_id in e: + env_type = g + break + assert env_type is not None, 'env_id {} is not recognized in env types'.format(env_id, _game_envs.keys()) + 
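# An illustrative programmatic use (not part of the patch) of the helpers that
# train() above relies on; the Pong env id and algorithm choice are examples only.
from baselines.run import get_env_type, get_learn_function, get_learn_function_defaults

env_type, env_id = get_env_type('PongNoFrameskip-v4')       # -> ('atari', 'PongNoFrameskip-v4')
alg_kwargs = get_learn_function_defaults('ppo2', env_type)  # per-env-type defaults, if defined
ppo2_learn = get_learn_function('ppo2')                     # resolves to baselines.ppo2.ppo2.learn
# build_env(args) would normally construct the vectorized (and, for atari, frame-stacked)
# env from the parsed CLI args before calling ppo2_learn(env=env, seed=0, **alg_kwargs).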
+ return env_type, env_id + +def get_default_network(env_type): + if env_type == 'mujoco' or env_type=='classic': + return 'mlp' + if env_type == 'atari': + return 'cnn' + + raise ValueError('Unknown env_type {}'.format(env_type)) + +def get_alg_module(alg, submodule=None): + submodule = submodule or alg + try: + # first try to import the alg module from baselines + alg_module = import_module('.'.join(['baselines', alg, submodule])) + except ImportError: + # then from rl_algs + alg_module = import_module('.'.join(['rl_' + 'algs', alg, submodule])) + + return alg_module + + +def get_learn_function(alg): + return get_alg_module(alg).learn + +def get_learn_function_defaults(alg, env_type): + try: + alg_defaults = get_alg_module(alg, 'defaults') + kwargs = getattr(alg_defaults, env_type)() + except (ImportError, AttributeError): + kwargs = {} + return kwargs + +def parse(v): + ''' + convert value of a command-line arg to a python object if possible, othewise, keep as string + ''' + + assert isinstance(v, str) + try: + return eval(v) + except (NameError, SyntaxError): + return v + + +def main(): + # configure logger, disable logging in child MPI processes (with rank > 0) + + arg_parser = common_arg_parser() + args, unknown_args = arg_parser.parse_known_args() + extra_args = {k: parse(v) for k,v in parse_unknown_args(unknown_args).items()} + + + if MPI is None or MPI.COMM_WORLD.Get_rank() == 0: + rank = 0 + logger.configure() + else: + logger.configure(format_strs = []) + rank = MPI.COMM_WORLD.Get_rank() + + model, _ = train(args, extra_args) + + if args.save_path is not None and rank == 0: + save_path = osp.expanduser(args.save_path) + model.save(save_path) + + + if args.play: + logger.log("Running trained model") + env = build_env(args, render=True) + obs = env.reset() + while True: + actions = model.step(obs)[0] + obs, _, done, _ = env.step(actions) + env.render() + if done: + obs = env.reset() + + + +if __name__ == '__main__': + main() diff --git a/baselines/trpo_mpi/defaults.py b/baselines/trpo_mpi/defaults.py new file mode 100644 index 0000000..96b6cb3 --- /dev/null +++ b/baselines/trpo_mpi/defaults.py @@ -0,0 +1,30 @@ +from rl_common.models import mlp, cnn_small + + +def atari(): + return dict( + network = cnn_small(), + timesteps_per_batch=512, + max_kl=0.001, + cg_iters=10, + cg_damping=1e-3, + gamma=0.98, + lam=1.0, + vf_iters=3, + vf_stepsize=1e-4, + entcoeff=0.00, + ) + +def mujoco(): + return dict( + network = mlp(num_hidden=32, num_layers=2), + timesteps_per_batch=1024, + max_kl=0.01, + cg_iters=10, + cg_damping=0.1, + gamma=0.99, + lam=0.98, + vf_iters=5, + vf_stepsize=1e-3, + normalize_observations=True, + ) diff --git a/baselines/trpo_mpi/nosharing_cnn_policy.py b/baselines/trpo_mpi/nosharing_cnn_policy.py deleted file mode 100644 index 97b2dcd..0000000 --- a/baselines/trpo_mpi/nosharing_cnn_policy.py +++ /dev/null @@ -1,56 +0,0 @@ -import baselines.common.tf_util as U -import tensorflow as tf -import gym -from baselines.common.distributions import make_pdtype - -class CnnPolicy(object): - recurrent = False - def __init__(self, name, ob_space, ac_space): - with tf.variable_scope(name): - self._init(ob_space, ac_space) - self.scope = tf.get_variable_scope().name - - def _init(self, ob_space, ac_space): - assert isinstance(ob_space, gym.spaces.Box) - - self.pdtype = pdtype = make_pdtype(ac_space) - sequence_length = None - - ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) - - obscaled = ob / 255.0 - - with 
tf.variable_scope("pol"): - x = obscaled - x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID")) - x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID")) - x = U.flattenallbut0(x) - x = tf.nn.relu(tf.layers.dense(x, 128, name='lin', kernel_initializer=U.normc_initializer(1.0))) - logits = tf.layers.dense(x, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01)) - self.pd = pdtype.pdfromflat(logits) - with tf.variable_scope("vf"): - x = obscaled - x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID")) - x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID")) - x = U.flattenallbut0(x) - x = tf.nn.relu(tf.layers.dense(x, 128, name='lin', kernel_initializer=U.normc_initializer(1.0))) - self.vpred = tf.layers.dense(x, 1, name='value', kernel_initializer=U.normc_initializer(1.0)) - self.vpredz = self.vpred - - self.state_in = [] - self.state_out = [] - - stochastic = tf.placeholder(dtype=tf.bool, shape=()) - ac = self.pd.sample() - self._act = U.function([stochastic, ob], [ac, self.vpred]) - - def act(self, stochastic, ob): - ac1, vpred1 = self._act(stochastic, ob[None]) - return ac1[0], vpred1[0] - def get_variables(self): - return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) - def get_trainable_variables(self): - return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) - def get_initial_state(self): - return [] - diff --git a/baselines/trpo_mpi/run_atari.py b/baselines/trpo_mpi/run_atari.py deleted file mode 100644 index f31ebfd..0000000 --- a/baselines/trpo_mpi/run_atari.py +++ /dev/null @@ -1,43 +0,0 @@ - #!/usr/bin/env python3 -from mpi4py import MPI -from baselines.common import set_global_seeds -import os.path as osp -import gym, logging -from baselines import logger -from baselines import bench -from baselines.common.atari_wrappers import make_atari, wrap_deepmind -from baselines.common.cmd_util import atari_arg_parser - -def train(env_id, num_timesteps, seed): - from baselines.trpo_mpi.nosharing_cnn_policy import CnnPolicy - from baselines.trpo_mpi import trpo_mpi - import baselines.common.tf_util as U - rank = MPI.COMM_WORLD.Get_rank() - sess = U.single_threaded_session() - sess.__enter__() - if rank == 0: - logger.configure() - else: - logger.configure(format_strs=[]) - - workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() - set_global_seeds(workerseed) - env = make_atari(env_id) - def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613 - return CnnPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space) - env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank))) - env.seed(workerseed) - - env = wrap_deepmind(env) - env.seed(workerseed) - - trpo_mpi.learn(env, policy_fn, timesteps_per_batch=512, max_kl=0.001, cg_iters=10, cg_damping=1e-3, - max_timesteps=int(num_timesteps * 1.1), gamma=0.98, lam=1.0, vf_iters=3, vf_stepsize=1e-4, entcoeff=0.00) - env.close() - -def main(): - args = atari_arg_parser().parse_args() - train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) - -if __name__ == "__main__": - main() diff --git a/baselines/trpo_mpi/run_mujoco.py b/baselines/trpo_mpi/run_mujoco.py deleted file mode 100644 index 220bb91..0000000 --- a/baselines/trpo_mpi/run_mujoco.py +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env python3 -# noinspection PyUnresolvedReferences -from mpi4py import MPI -from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser -from baselines import logger -from 
baselines.ppo1.mlp_policy import MlpPolicy -from baselines.trpo_mpi import trpo_mpi - -def train(env_id, num_timesteps, seed): - import baselines.common.tf_util as U - sess = U.single_threaded_session() - sess.__enter__() - - rank = MPI.COMM_WORLD.Get_rank() - if rank == 0: - logger.configure() - else: - logger.configure(format_strs=[]) - logger.set_level(logger.DISABLED) - workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() - def policy_fn(name, ob_space, ac_space): - return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, - hid_size=32, num_hid_layers=2) - env = make_mujoco_env(env_id, workerseed) - trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1, - max_timesteps=num_timesteps, gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3) - env.close() - -def main(): - args = mujoco_arg_parser().parse_args() - train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) - - -if __name__ == '__main__': - main() - diff --git a/baselines/trpo_mpi/trpo_mpi.py b/baselines/trpo_mpi/trpo_mpi.py index e23d9ac..d84b0fc 100644 --- a/baselines/trpo_mpi/trpo_mpi.py +++ b/baselines/trpo_mpi/trpo_mpi.py @@ -6,8 +6,11 @@ import time from baselines.common import colorize from mpi4py import MPI from collections import deque +from baselines.common import set_global_seeds from baselines.common.mpi_adam import MpiAdam from baselines.common.cg import cg +from baselines.common.input import observation_placeholder +from baselines.common.policies import build_policy from contextlib import contextmanager def traj_segment_generator(pi, env, horizon, stochastic): @@ -33,7 +36,7 @@ def traj_segment_generator(pi, env, horizon, stochastic): while True: prevac = ac - ac, vpred = pi.act(stochastic, ob) + ac, vpred, _, _ = pi.step(ob, stochastic=stochastic) # Slight weirdness here because we need value function at time T # before returning segment [0, T-1] so we get the correct # terminal value @@ -41,7 +44,7 @@ def traj_segment_generator(pi, env, horizon, stochastic): yield {"ob" : obs, "rew" : rews, "vpred" : vpreds, "new" : news, "ac" : acs, "prevac" : prevacs, "nextvpred": vpred * (1 - new), "ep_rets" : ep_rets, "ep_lens" : ep_lens} - _, vpred = pi.act(stochastic, ob) + _, vpred, _, _ = pi.step(ob, stochastic=stochastic) # Be careful!!! if you change the downstream algorithm to aggregate # several of these batches, then be sure to do a deepcopy ep_rets = [] @@ -79,30 +82,100 @@ def add_vtarg_and_adv(seg, gamma, lam): gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam seg["tdlamret"] = seg["adv"] + seg["vpred"] -def learn(env, policy_fn, *, - timesteps_per_batch, # what to train on - max_kl, cg_iters, - gamma, lam, # advantage estimation +def learn(*, + network, + env, + total_timesteps, + timesteps_per_batch=1024, # what to train on + max_kl=0.001, + cg_iters=10, + gamma=0.99, + lam=1.0, # advantage estimation + seed=None, entcoeff=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters =3, - max_timesteps=0, max_episodes=0, max_iters=0, # time constraint - callback=None + max_episodes=0, max_iters=0, # time constraint + callback=None, + load_path=None, + **network_kwargs ): + ''' + learn a policy function with TRPO algorithm + + Parameters: + ---------- + + network neural network to learn. 
+                              Can be either a string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types)
+                              or a function that takes an input placeholder and returns a tuple (output, None) for feedforward nets,
+                              or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets
+
+    env                       environment (one of the gym environments, or wrapped via a baselines.common.vec_env.VecEnv-type class)
+
+    timesteps_per_batch       timesteps per gradient estimation batch
+
+    max_kl                    max KL divergence between old policy and new policy (KL(pi_old || pi))
+
+    entcoeff                  coefficient of policy entropy term in the optimization objective
+
+    cg_iters                  number of iterations of conjugate gradient algorithm
+
+    cg_damping                conjugate gradient damping
+
+    vf_stepsize               learning rate for adam optimizer used to optimize value function loss
+
+    vf_iters                  number of value function optimization iterations per policy optimization step
+
+    total_timesteps           max number of timesteps
+
+    max_episodes              max number of episodes
+
+    max_iters                 maximum number of policy optimization iterations
+
+    callback                  function to be called with (locals(), globals()) at each policy optimization step
+
+    load_path                 str, path to load the model from (default: None, i.e. no model is loaded)
+
+    **network_kwargs          keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy
+                              and the arguments to a particular type of network
+
+    Returns:
+    -------
+
+    learnt model
+
+    '''
+
+
     nworkers = MPI.COMM_WORLD.Get_size()
     rank = MPI.COMM_WORLD.Get_rank()
+
+    cpus_per_worker = 1
+    U.get_session(config=tf.ConfigProto(
+            allow_soft_placement=True,
+            inter_op_parallelism_threads=cpus_per_worker,
+            intra_op_parallelism_threads=cpus_per_worker
+    ))
+
+
+    policy = build_policy(env, network, value_network='copy', **network_kwargs)
+    set_global_seeds(seed)
+
     np.set_printoptions(precision=3)
     # Setup losses and stuff
     # ----------------------------------------
     ob_space = env.observation_space
     ac_space = env.action_space
-    pi = policy_fn("pi", ob_space, ac_space)
-    oldpi = policy_fn("oldpi", ob_space, ac_space)
+
+    ob = observation_placeholder(ob_space)
+    with tf.variable_scope("pi"):
+        pi = policy(observ_placeholder=ob)
+    with tf.variable_scope("oldpi"):
+        oldpi = policy(observ_placeholder=ob)
+
     atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable)
     ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return
 
-    ob = U.get_placeholder_cached(name="ob")
     ac = pi.pdtype.sample_placeholder([None])
 
     kloldnew = oldpi.pd.kl(pi.pd)
@@ -111,7 +184,7 @@ def learn(env, policy_fn, *,
     meanent = tf.reduce_mean(ent)
     entbonus = entcoeff * meanent
 
-    vferr = tf.reduce_mean(tf.square(pi.vpred - ret))
+    vferr = tf.reduce_mean(tf.square(pi.vf - ret))
 
     ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold
     surrgain = tf.reduce_mean(ratio * atarg)
@@ -122,9 +195,12 @@ def learn(env, policy_fn, *,
 
     dist = meankl
 
-    all_var_list = pi.get_trainable_variables()
-    var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
-    vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
+    all_var_list = get_trainable_variables("pi")
+    # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
+    # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
+    var_list = get_pi_trainable_variables("pi")
+    vf_var_list = get_vf_trainable_variables("pi")
+
     vfadam = MpiAdam(vf_var_list)
 
     get_flat = U.GetFlat(var_list)
@@ -142,7 +218,8 @@ def learn(env, policy_fn, *,
     fvp = U.flatgrad(gvp, var_list)
 
     assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv)
-        for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
+        for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi"))])
+
     compute_losses = U.function([ob, ac, atarg], losses)
     compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)])
     compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
@@ -166,6 +243,9 @@ def learn(env, policy_fn, *,
         return out
 
     U.initialize()
+    if load_path is not None:
+        pi.load(load_path)
+
     th_init = get_flat()
     MPI.COMM_WORLD.Bcast(th_init, root=0)
     set_from_flat(th_init)
@@ -183,11 +263,16 @@ def learn(env, policy_fn, *,
     lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths
     rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards
 
-    assert sum([max_iters>0, max_timesteps>0, max_episodes>0])==1
+    if sum([max_iters>0, total_timesteps>0, max_episodes>0])==0:
+        # nothing to be done
+        return pi
+
+    assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \
+        'out of max_iters, total_timesteps, and max_episodes only one should be specified'
 
     while True:
         if callback: callback(locals(), globals())
-        if max_timesteps and timesteps_so_far >= max_timesteps:
+        if total_timesteps and timesteps_so_far >= total_timesteps:
             break
         elif max_episodes and episodes_so_far >= max_episodes:
             break
@@ -287,5 +372,20 @@ def learn(env, policy_fn, *,
         if rank==0:
             logger.dump_tabular()
 
+    return pi
+
 def flatten_lists(listoflists):
-    return [el for list_ in listoflists for el in list_]
\ No newline at end of file
+    return [el for list_ in listoflists for el in list_]
+
+def get_variables(scope):
+    return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope)
+
+def get_trainable_variables(scope):
+    return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
+
+def get_vf_trainable_variables(scope):
+    return [v for v in get_trainable_variables(scope) if 'vf' in v.name[len(scope):].split('/')]
+
+def get_pi_trainable_variables(scope):
+    return [v for v in get_trainable_variables(scope) if 'pi' in v.name[len(scope):].split('/')]
+
diff --git a/conftest.py b/conftest.py
new file mode 100644
index 0000000..3493c45
--- /dev/null
+++ b/conftest.py
@@ -0,0 +1,19 @@
+import pytest
+
+
+def pytest_addoption(parser):
+    parser.addoption('--runslow', action='store_true', default=False, help='run slow tests')
+
+
+def pytest_collection_modifyitems(config, items):
+    if config.getoption('--runslow'):
+        # --runslow given in cli: do not skip slow tests
+        return
+    skip_slow = pytest.mark.skip(reason='need --runslow option to run')
+    slow_tests = []
+    for item in items:
+        if 'slow' in item.keywords:
+            slow_tests.append(item.name)
+            item.add_marker(skip_slow)
+
+    print('skipping slow tests', ' '.join(slow_tests), 'use --runslow to run them')
diff --git a/setup.py b/setup.py
index bf8badc..35673ce 100644
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,6 @@ setup(name='baselines',
          'scipy',
          'tqdm',
          'joblib',
-          'zmq',
          'dill',
          'progressbar2',
          'mpi4py',
@@ -23,6 +22,12 @@ setup(name='baselines',
          'click',
          'opencv-python'
      ],
+      extras_require={
+          'test': [
+              'filelock',
+              'pytest'
+          ]
+      },
      description='OpenAI baselines: high quality implementations of reinforcement learning algorithms',
      author='OpenAI',
      url='https://github.com/openai/baselines',
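
For reference, a minimal sketch of calling the refactored, keyword-only trpo_mpi.learn described in the docstring above. The environment id, hyperparameter values, and the DummyVecEnv wrapper are illustrative assumptions rather than part of this patch; in normal use the environment construction is handled by the new launcher script instead of by hand.

    # Illustrative sketch only -- assumes the post-refactor keyword-only learn() signature shown above.
    import gym
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
    from baselines.trpo_mpi import trpo_mpi

    # The docstring accepts either a plain gym env or a VecEnv-type wrapper;
    # here a single env is wrapped in a DummyVecEnv.
    env = DummyVecEnv([lambda: gym.make('Hopper-v2')])

    # network='mlp' selects the built-in MLP policy builder; any extra keyword
    # arguments would be forwarded to it via **network_kwargs.
    pi = trpo_mpi.learn(
        network='mlp',
        env=env,
        total_timesteps=int(1e6),
        timesteps_per_batch=1024,
        max_kl=0.01,
        gamma=0.99,
        lam=0.98,
        seed=0,
    )
    env.close()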
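The new conftest.py gates slow tests behind --runslow: a test opts in by carrying the pytest 'slow' marker, which is what the 'slow' in item.keywords check looks for. A minimal sketch of such a test (the test name is hypothetical):

    import pytest

    @pytest.mark.slow
    def test_full_training_run():
        # Collected as usual, but skipped unless pytest is invoked with --runslow,
        # per the marker check in conftest.py above.
        assert True

The test-only dependencies declared under extras_require can be installed with pip install -e .[test].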