Import internal repo (#409)

* import rl-algs from commit 2e3a166

* extra import of the baselines badge

* exported commit with identity test

* proper RNG seeding in test_identity
pzhokhov committed 2018-05-21 15:24:00 -07:00 (via GitHub)
parent 9cb7ece338
commit 24fe3d6576
22 changed files with 321 additions and 116 deletions

View File

@@ -52,6 +52,8 @@ Install baselines package
```bash
pip install -e .
```
### MuJoCo
Some of the baselines examples use [MuJoCo](http://www.mujoco.org) (multi-joint dynamics in contact), a proprietary physics simulator that requires binaries and a license (a temporary 30-day license can be obtained from [www.mujoco.org](http://www.mujoco.org)). Instructions on setting up MuJoCo can be found [here](https://github.com/openai/mujoco-py).
## Testing the installation
All unit tests in baselines can be run using the pytest runner:

View File

@@ -131,7 +131,6 @@ class Runner(AbstractEnvRunner):
return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
def learn(policy, env, seed, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100):
- tf.reset_default_graph()
set_global_seeds(seed)
nenvs = env.num_envs
@@ -158,3 +157,4 @@ def learn(policy, env, seed, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5, e
logger.record_tabular("explained_variance", float(ev))
logger.dump_tabular()
env.close()
+ return model
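`learn` now returns the trained model rather than discarding it, which is what lets the new identity test keep acting with the policy after training. A minimal sketch of the intended usage (hyperparameters and the `MlpPolicy` import follow the new test; not part of this diff):

```python
from baselines.a2c import a2c
from baselines.ppo2.policies import MlpPolicy
from baselines.common.identity_env import IdentityEnv
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

env = DummyVecEnv([lambda: IdentityEnv(10)])
model = a2c.learn(policy=MlpPolicy, env=env, seed=0, total_timesteps=10000)

obs = env.reset()
actions = model.step(obs)[0]  # step() returns (actions, values, states, neglogp)
```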

View File

@@ -2,6 +2,7 @@ import numpy as np
import tensorflow as tf
from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm
from baselines.common.distributions import make_pdtype
+ from baselines.common.input import observation_input
def nature_cnn(unscaled_images, **conv_kwargs):
"""
@@ -19,14 +20,12 @@ def nature_cnn(unscaled_images, **conv_kwargs):
class LnLstmPolicy(object):
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
nenv = nbatch // nsteps
- nh, nw, nc = ob_space.shape
- ob_shape = (nbatch, nh, nw, nc)
- X = tf.placeholder(tf.uint8, ob_shape) #obs
+ X, processed_x = observation_input(ob_space, nbatch)
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
self.pdtype = make_pdtype(ac_space)
with tf.variable_scope("model", reuse=reuse):
- h = nature_cnn(X)
+ h = nature_cnn(processed_x)
xs = batch_to_seq(h, nenv, nsteps)
ms = batch_to_seq(M, nenv, nsteps)
h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
@@ -56,11 +55,9 @@ class LstmPolicy(object):
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
nenv = nbatch // nsteps
- nh, nw, nc = ob_space.shape
- ob_shape = (nbatch, nh, nw, nc)
self.pdtype = make_pdtype(ac_space)
- X = tf.placeholder(tf.uint8, ob_shape) #obs
+ X, processed_x = observation_input(ob_space, nbatch)
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
with tf.variable_scope("model", reuse=reuse):
@@ -93,12 +90,10 @@ class LstmPolicy(object):
class CnnPolicy(object):
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613
- nh, nw, nc = ob_space.shape
- ob_shape = (nbatch, nh, nw, nc)
self.pdtype = make_pdtype(ac_space)
- X = tf.placeholder(tf.uint8, ob_shape) #obs
+ X, processed_x = observation_input(ob_space, nbatch)
with tf.variable_scope("model", reuse=reuse):
- h = nature_cnn(X, **conv_kwargs)
+ h = nature_cnn(processed_x, **conv_kwargs)
vf = fc(h, 'v', 1)[:,0]
self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)
@@ -120,15 +115,14 @@ class CnnPolicy(object):
class MlpPolicy(object):
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
- ob_shape = (nbatch,) + ob_space.shape
self.pdtype = make_pdtype(ac_space)
- X = tf.placeholder(tf.float32, ob_shape, name='Ob') #obs
with tf.variable_scope("model", reuse=reuse):
+ X, processed_x = observation_input(ob_space, nbatch)
activ = tf.tanh
- flatten = tf.layers.flatten
- pi_h1 = activ(fc(flatten(X), 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
+ processed_x = tf.layers.flatten(processed_x)
+ pi_h1 = activ(fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
- vf_h1 = activ(fc(flatten(X), 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
+ vf_h1 = activ(fc(processed_x, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
vf = fc(vf_h2, 'vf', 1)[:,0]

View File

@@ -9,6 +9,8 @@ _atariexpl7 = ['Freeway', 'Gravitar', 'MontezumaRevenge', 'Pitfall', 'PrivateEye
_BENCHMARKS = []
+ remove_version_re = re.compile(r'-v\d+$')
def register_benchmark(benchmark):
for b in _BENCHMARKS:
if b['name'] == benchmark['name']:
@@ -138,3 +140,11 @@ register_benchmark({
'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atari50]
})
+ # HER DDPG
+ register_benchmark({
+ 'name': 'HerDdpg',
+ 'description': 'Smoke-test only benchmark of HER',
+ 'tasks': [{'trials': 1, 'env_id': 'FetchReach-v1'}]
+ })
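As a consumer-side sketch, the registered entry can be looked up by name through the module's existing `get_benchmark` helper (usage illustrative, not part of this diff):

```python
from baselines.bench import benchmarks

her_bench = benchmarks.get_benchmark('HerDdpg')  # the entry registered above
for task in her_bench['tasks']:
    print(task['env_id'], task['trials'])        # FetchReach-v1 1
```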

View File

@@ -74,6 +74,7 @@ def mujoco_arg_parser():
parser.add_argument('--env', help='environment ID', type=str, default='Reacher-v2')
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
parser.add_argument('--num-timesteps', type=int, default=int(1e6))
+ parser.add_argument('--play', default=False, action='store_true')
return parser
def robotics_arg_parser():

View File

@@ -0,0 +1,30 @@
from gym import Env
from gym.spaces import Discrete
class IdentityEnv(Env):
def __init__(
self,
dim,
ep_length=100,
):
self.action_space = Discrete(dim)
self.reset()
def reset(self):
self._choose_next_state()
self.observation_space = self.action_space
return self.state
def step(self, actions):
rew = self._get_reward(actions)
self._choose_next_state()
return self.state, rew, False, {}
def _choose_next_state(self):
self.state = self.action_space.sample()
def _get_reward(self, actions):
return 1 if self.state == actions else 0
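`IdentityEnv` samples a fresh observation every step and pays reward 1 only when the action echoes the observation just seen; `done` is always `False` (the `ep_length` argument is accepted but unused in this version), so episode length is up to the caller. A short interaction sketch:

```python
from baselines.common.identity_env import IdentityEnv

env = IdentityEnv(10)
obs = env.reset()
for _ in range(5):
    obs, rew, done, _ = env.step(obs)  # echoing the observation back is optimal
    assert rew == 1                    # the echo always earns reward 1
```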

baselines/common/input.py (new file, 30 lines)
View File

@@ -0,0 +1,30 @@
import tensorflow as tf
from gym.spaces import Discrete, Box
def observation_input(ob_space, batch_size=None, name='Ob'):
'''
Build observation input with encoding depending on the
observation space type
Params:
ob_space: observation space (should be one of gym.spaces)
batch_size: batch size for input (default is None, so that resulting input placeholder can take tensors with any batch size)
name: tensorflow variable name for input placeholder
returns: tuple (input_placeholder, processed_input_tensor)
'''
if isinstance(ob_space, Discrete):
input_x = tf.placeholder(shape=(batch_size,), dtype=tf.int32, name=name)
processed_x = tf.to_float(tf.one_hot(input_x, ob_space.n))
return input_x, processed_x
elif isinstance(ob_space, Box):
input_shape = (batch_size,) + ob_space.shape
input_x = tf.placeholder(shape=input_shape, dtype=ob_space.dtype, name=name)
processed_x = tf.to_float(input_x)
return input_x, processed_x
else:
raise NotImplementedError
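`observation_input` returns both the raw placeholder (for feeding) and a float tensor (for the network): `Discrete` observations are fed as integer ids and consumed as one-hot vectors, while `Box` observations keep the space's dtype and pass through a float cast. A quick sketch (TF1 session API, as used throughout baselines):

```python
import numpy as np
import tensorflow as tf
from gym.spaces import Box, Discrete
from baselines.common.input import observation_input

inpt, processed = observation_input(Discrete(5), name='disc_ob')
with tf.Session() as sess:
    print(sess.run(processed, {inpt: [3]}))  # [[0. 0. 0. 1. 0.]]

# Box observations: the placeholder keeps uint8 pixels, the network sees floats.
box_inpt, box_processed = observation_input(
    Box(low=0, high=255, shape=(84, 84, 4), dtype=np.uint8), name='box_ob')
```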

View File

@@ -7,7 +7,7 @@ class AbstractEnvRunner(ABC):
self.model = model
nenv = env.num_envs
self.batch_ob_shape = (nenv*nsteps,) + env.observation_space.shape
- self.obs = np.zeros((nenv,) + env.observation_space.shape, dtype=model.train_model.X.dtype.name)
+ self.obs = np.zeros((nenv,) + env.observation_space.shape, dtype=env.observation_space.dtype.name)
self.obs[:] = env.reset()
self.nsteps = nsteps
self.states = model.initial_state

View File

@@ -0,0 +1,44 @@
import pytest
import tensorflow as tf
import random
import numpy as np
from gym.spaces import np_random
from baselines.a2c import a2c
from baselines.ppo2 import ppo2
from baselines.common.identity_env import IdentityEnv
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.ppo2.policies import MlpPolicy
learn_func_list = [
lambda e: a2c.learn(policy=MlpPolicy, env=e, seed=0, total_timesteps=50000),
lambda e: ppo2.learn(policy=MlpPolicy, env=e, total_timesteps=50000, lr=1e-3, nsteps=128, ent_coef=0.01)
]
@pytest.mark.slow
@pytest.mark.parametrize("learn_func", learn_func_list)
def test_identity(learn_func):
'''
Test if the algorithm (with a given policy)
can learn an identity transformation (i.e. return observation as an action)
'''
np.random.seed(0)
np_random.seed(0)
random.seed(0)
env = DummyVecEnv([lambda: IdentityEnv(10)])
with tf.Graph().as_default(), tf.Session().as_default():
tf.set_random_seed(0)
model = learn_func(env)
N_TRIALS = 1000
sum_rew = 0
obs = env.reset()
for i in range(N_TRIALS):
obs, rew, done, _ = env.step(model.step(obs)[0])
sum_rew += rew
assert sum_rew > 0.9 * N_TRIALS
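The test carries a `slow` marker, so whether it runs by default depends on the project's pytest configuration. A sketch of invoking it explicitly (assuming pytest is installed):

```python
import pytest

# Select only the slow-marked tests in this file; -x stops at the first failure.
pytest.main(['-m', 'slow', '-x', 'baselines/common/tests/test_identity.py'])
```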

View File

@@ -279,3 +279,27 @@ def display_var_info(vars):
logger.info(" %s%s %i params %s" % (name, " "*(55-len(name)), v_params, str(v.shape)))
logger.info("Total model parameters: %0.2f million" % (count_params*1e-6))
+ def get_available_gpus():
+ # recipe from here:
+ # https://stackoverflow.com/questions/38559755/how-to-get-current-available-gpus-in-tensorflow?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa
+ from tensorflow.python.client import device_lib
+ local_device_protos = device_lib.list_local_devices()
+ return [x.name for x in local_device_protos if x.device_type == 'GPU']
+ # ================================================================
+ # Saving variables
+ # ================================================================
+ def load_state(fname):
+ saver = tf.train.Saver()
+ saver.restore(tf.get_default_session(), fname)
+ def save_state(fname):
+ os.makedirs(os.path.dirname(fname), exist_ok=True)
+ saver = tf.train.Saver()
+ saver.save(tf.get_default_session(), fname)
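These helpers wrap a plain `tf.train.Saver` over the default session; deepq's new `checkpoint_path` resume logic (below) is built on them. A minimal sketch, assuming an initialized graph and a default session (the path is illustrative):

```python
import tensorflow as tf
from baselines.common.tf_util import load_state, save_state

v = tf.get_variable('v', shape=(), initializer=tf.zeros_initializer())
with tf.Session().as_default() as sess:
    sess.run(tf.global_variables_initializer())
    save_state('/tmp/tf_util_demo/model')  # creates the directory if needed
    load_state('/tmp/tf_util_demo/model')  # restores into the default session
```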

View File

@@ -11,18 +11,18 @@ class DummyVecEnv(VecEnv):
shapes, dtypes = {}, {}
self.keys = []
obs_space = env.observation_space
if isinstance(obs_space, spaces.Dict):
assert isinstance(obs_space.spaces, OrderedDict)
- for key, box in obs_space.spaces.items():
- assert isinstance(box, spaces.Box)
- shapes[key] = box.shape
- dtypes[key] = box.dtype
- self.keys.append(key)
+ subspaces = obs_space.spaces
else:
- box = obs_space
- assert isinstance(box, spaces.Box)
- self.keys = [None]
- shapes, dtypes = { None: box.shape }, { None: box.dtype }
+ subspaces = {None: obs_space}
+ for key, box in subspaces.items():
+ shapes[key] = box.shape
+ dtypes[key] = box.dtype
+ self.keys.append(key)
self.buf_obs = { k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys }
self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool)
self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32)
@@ -50,6 +50,9 @@ class DummyVecEnv(VecEnv):
def close(self):
return
+ def render(self):
+ return [e.render() for e in self.envs]
def _save_obs(self, e, obs):
for k in self.keys:
if k is None:
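With the `Box`/`Dict` cases both funneled through `subspaces`, buffer allocation is a single loop, and `render()` now fans out to every wrapped env. A short usage sketch (env choice illustrative):

```python
import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

env = DummyVecEnv([lambda: gym.make('CartPole-v0')])
obs = env.reset()      # shape: (num_envs,) + observation_space.shape
frames = env.render()  # new: one render result per wrapped env
```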

View File

@@ -9,7 +9,7 @@ import baselines.common.tf_util as U
from baselines import logger
from baselines import deepq
from baselines.deepq.replay_buffer import ReplayBuffer
- from baselines.deepq.utils import BatchInput
+ from baselines.deepq.utils import ObservationInput
from baselines.common.schedules import LinearSchedule
@@ -28,7 +28,7 @@ if __name__ == '__main__':
env = gym.make("CartPole-v0")
# Create all the functions necessary to train the model
act, train, update_target, debug = deepq.build_train(
- make_obs_ph=lambda name: BatchInput(env.observation_space.shape, name=name),
+ make_obs_ph=lambda name: ObservationInput(env.observation_space, name=name),
q_func=model,
num_actions=env.action_space.n,
optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),

View File

@@ -14,6 +14,9 @@ def main():
parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6)
parser.add_argument('--dueling', type=int, default=1)
parser.add_argument('--num-timesteps', type=int, default=int(10e6))
+ parser.add_argument('--checkpoint-freq', type=int, default=10000)
+ parser.add_argument('--checkpoint-path', type=str, default=None)
args = parser.parse_args()
logger.configure()
set_global_seeds(args.seed)
@@ -39,7 +42,9 @@ def main():
target_network_update_freq=1000,
gamma=0.99,
prioritized_replay=bool(args.prioritized),
- prioritized_replay_alpha=args.prioritized_replay_alpha
+ prioritized_replay_alpha=args.prioritized_replay_alpha,
+ checkpoint_freq=args.checkpoint_freq,
+ checkpoint_path=args.checkpoint_path,
)
env.close()

View File

@@ -6,13 +6,15 @@ import zipfile
import cloudpickle
import numpy as np
import gym
import baselines.common.tf_util as U
+ from baselines.common.tf_util import load_state, save_state
from baselines import logger
from baselines.common.schedules import LinearSchedule
+ from baselines.common.input import observation_input
from baselines import deepq
from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer
- from baselines.deepq.utils import BatchInput, load_state, save_state
+ from baselines.deepq.utils import ObservationInput
class ActWrapper(object):
@@ -88,6 +90,7 @@ def learn(env,
batch_size=32,
print_freq=100,
checkpoint_freq=10000,
+ checkpoint_path=None,
learning_starts=1000,
gamma=1.0,
target_network_update_freq=500,
@@ -170,9 +173,9 @@ def learn(env,
# capture the shape outside the closure so that the env object is not serialized
# by cloudpickle when serializing make_obs_ph
- observation_space_shape = env.observation_space.shape
def make_obs_ph(name):
- return BatchInput(observation_space_shape, name=name)
+ return ObservationInput(env.observation_space, name=name)
act, train, update_target, debug = deepq.build_train(
make_obs_ph=make_obs_ph,
@@ -216,9 +219,17 @@ def learn(env,
saved_mean_reward = None
obs = env.reset()
reset = True
with tempfile.TemporaryDirectory() as td:
- model_saved = False
+ td = checkpoint_path or td
model_file = os.path.join(td, "model")
+ model_saved = False
+ if tf.train.latest_checkpoint(td) is not None:
+ load_state(model_file)
+ logger.log('Loaded model from {}'.format(model_file))
+ model_saved = True
for t in range(max_timesteps):
if callback is not None:
if callback(locals(), globals()):
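Passing `checkpoint_path` both persists checkpoints outside the temporary directory and, if one already exists there, resumes from it before training. A hedged sketch of a resume-able call (env, network size, and path are illustrative):

```python
import gym
from baselines import deepq

env = gym.make('CartPole-v0')
act = deepq.learn(
    env,
    q_func=deepq.models.mlp([64]),
    max_timesteps=100000,
    checkpoint_freq=10000,            # new flag: how often to checkpoint
    checkpoint_path='/tmp/cartpole',  # new flag: resume from here if a checkpoint exists
)
```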

View File

@@ -0,0 +1,43 @@
import tensorflow as tf
import random
from baselines import deepq
from baselines.common.identity_env import IdentityEnv
def test_identity():
with tf.Graph().as_default():
env = IdentityEnv(10)
random.seed(0)
tf.set_random_seed(0)
param_noise = False
model = deepq.models.mlp([32])
act = deepq.learn(
env,
q_func=model,
lr=1e-3,
max_timesteps=10000,
buffer_size=50000,
exploration_fraction=0.1,
exploration_final_eps=0.02,
print_freq=10,
param_noise=param_noise,
)
tf.set_random_seed(0)
N_TRIALS = 1000
sum_rew = 0
obs = env.reset()
for i in range(N_TRIALS):
obs, rew, done, _ = env.step(act([obs]))
sum_rew += rew
assert sum_rew > 0.9 * N_TRIALS
if __name__ == '__main__':
test_identity()

View File

@@ -1,24 +1,12 @@
- import os
+ from baselines.common.input import observation_input
import tensorflow as tf
- # ================================================================
- # Saving variables
- # ================================================================
- def load_state(fname):
- saver = tf.train.Saver()
- saver.restore(tf.get_default_session(), fname)
- def save_state(fname):
- os.makedirs(os.path.dirname(fname), exist_ok=True)
- saver = tf.train.Saver()
- saver.save(tf.get_default_session(), fname)
# ================================================================
# Placeholders
# ================================================================
class TfInput(object):
def __init__(self, name="(unnamed)"):
"""Generalized Tensorflow placeholder. The main differences are:
@@ -50,20 +38,6 @@ class PlaceholderTfInput(TfInput):
def make_feed_dict(self, data):
return {self._placeholder: data}
- class BatchInput(PlaceholderTfInput):
- def __init__(self, shape, dtype=tf.float32, name=None):
- """Creates a placeholder for a batch of tensors of a given shape and dtype
- Parameters
- ----------
- shape: [int]
- shape of a single elemenet of the batch
- dtype: tf.dtype
- number representation used for tensor contents
- name: str
- name of the underlying placeholder
- """
- super().__init__(tf.placeholder(dtype, [None] + list(shape), name=name))
class Uint8Input(PlaceholderTfInput):
def __init__(self, shape, name=None):
@@ -85,4 +59,25 @@ class Uint8Input(PlaceholderTfInput):
self._output = tf.cast(super().get(), tf.float32) / 255.0
def get(self):
- return self._output
+ return self._output
+ class ObservationInput(PlaceholderTfInput):
+ def __init__(self, observation_space, name=None):
+ """Creates an input placeholder tailored to a specific observation space
+ Parameters
+ ----------
+ observation_space:
+ observation space of the environment. Should be one of the gym.spaces types
+ name: str
+ tensorflow name of the underlying placeholder
+ """
+ inpt, self.processed_inpt = observation_input(observation_space, name=name)
+ super().__init__(inpt)
+ def get(self):
+ return self.processed_inpt
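Unlike the removed `BatchInput`, which needed an explicit shape, `ObservationInput` derives both the placeholder and its encoding from the space itself, so `get()` already yields the float tensor a Q-network should consume. A small sketch:

```python
import tensorflow as tf
from gym.spaces import Discrete
from baselines.deepq.utils import ObservationInput

obs_ph = ObservationInput(Discrete(4), name='obs')
with tf.Session() as sess:
    # feed raw integer ids; get() returns the one-hot float encoding
    print(sess.run(obs_ph.get(), obs_ph.make_feed_dict([2])))  # [[0. 0. 1. 0.]]
```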

View File

@@ -1,7 +1,4 @@
- from copy import deepcopy
import numpy as np
- import json
- import os
import gym
from baselines import logger
@@ -10,7 +7,7 @@ from baselines.her.her import make_sample_her_transitions
DEFAULT_ENV_PARAMS = {
- 'FetchReach-v0': {
+ 'FetchReach-v1': {
'n_cycles': 10,
},
}
@@ -51,6 +48,8 @@ DEFAULT_PARAMS = {
CACHED_ENVS = {}
def cached_make_env(make_env):
"""
Only creates a new environment from the provided function if one has not yet already been
@@ -68,6 +67,7 @@ def prepare_params(kwargs):
ddpg_params = dict()
env_name = kwargs['env_name']
def make_env():
return gym.make(env_name)
kwargs['make_env'] = make_env
@@ -75,7 +75,7 @@ def prepare_params(kwargs):
assert hasattr(tmp_env, '_max_episode_steps')
kwargs['T'] = tmp_env._max_episode_steps
tmp_env.reset()
- kwargs['max_u'] = np.array(kwargs['max_u']) if type(kwargs['max_u']) == list else kwargs['max_u']
+ kwargs['max_u'] = np.array(kwargs['max_u']) if isinstance(kwargs['max_u'], list) else kwargs['max_u']
kwargs['gamma'] = 1. - 1. / kwargs['T']
if 'lr' in kwargs:
kwargs['pi_lr'] = kwargs['lr']
@@ -83,7 +83,7 @@ def prepare_params(kwargs):
del kwargs['lr']
for name in ['buffer_size', 'hidden', 'layers',
'network_class',
- 'polyak',
+ 'polyak',
'batch_size', 'Q_lr', 'pi_lr',
'norm_eps', 'norm_clip', 'max_u',
'action_l2', 'clip_obs', 'scope', 'relative_goals']:
@@ -103,6 +103,7 @@ def log_params(params, logger=logger):
def configure_her(params):
env = cached_make_env(params['make_env'])
env.reset()
def reward_fun(ag_2, g, info): # vectorized
return env.compute_reward(achieved_goal=ag_2, desired_goal=g, info=info)
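`reward_fun` simply defers to the environment's own goal-based reward, and gym's robotics envs accept batched goal arrays, which is what makes it vectorized. An illustrative sketch (env and batch contents assumed):

```python
import gym
import numpy as np

env = gym.make('FetchReach-v1').unwrapped
achieved = np.zeros((128, 3))  # batch of achieved goal positions
desired = np.ones((128, 3))    # batch of desired goal positions
rewards = env.compute_reward(achieved_goal=achieved, desired_goal=desired, info=None)
```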

View File

@@ -13,6 +13,8 @@ import baselines.her.experiment.config as config
from baselines.her.rollout import RolloutWorker
from baselines.her.util import mpi_fork
+ from subprocess import CalledProcessError
def mpi_average(value):
if value == []:
@@ -81,12 +83,17 @@ def train(policy, rollout_worker, evaluator,
def launch(
- env_name, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return,
+ env, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return,
override_params={}, save_policies=True
):
# Fork for multi-CPU MPI implementation.
if num_cpu > 1:
- whoami = mpi_fork(num_cpu)
+ try:
+ whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
+ except CalledProcessError:
+ # fancy version of mpi call failed, try simple version
+ whoami = mpi_fork(num_cpu)
if whoami == 'parent':
sys.exit(0)
import baselines.common.tf_util as U
@@ -109,10 +116,10 @@ def launch(
# Prepare params.
params = config.DEFAULT_PARAMS
- params['env_name'] = env_name
+ params['env_name'] = env
params['replay_strategy'] = replay_strategy
- if env_name in config.DEFAULT_ENV_PARAMS:
- params.update(config.DEFAULT_ENV_PARAMS[env_name]) # merge env-specific parameters in
+ if env in config.DEFAULT_ENV_PARAMS:
+ params.update(config.DEFAULT_ENV_PARAMS[env]) # merge env-specific parameters in
params.update(**override_params) # makes it possible to override any parameter
with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
json.dump(params, f)
@@ -126,7 +133,7 @@ def launch(
'You are running HER with just a single MPI worker. This will work, but the ' +
'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
- 'are looking to reproduce those results, be aware of this. Please also refer to ' +
+ 'are looking to reproduce those results, be aware of this. Please also refer to ' +
'https://github.com/openai/baselines/issues/314 for further details.')
logger.warn('****************')
logger.warn()
@@ -168,7 +175,7 @@ def launch(
@click.command()
- @click.option('--env_name', type=str, default='FetchReach-v0', help='the name of the OpenAI Gym environment that you want to train on')
+ @click.option('--env', type=str, default='FetchReach-v1', help='the name of the OpenAI Gym environment that you want to train on')
@click.option('--logdir', type=str, default=None, help='the path to where logs and policy pickles should go. If not specified, creates a folder in /tmp/')
@click.option('--n_epochs', type=int, default=50, help='the number of training epochs to run')
@click.option('--num_cpu', type=int, default=1, help='the number of CPU cores to use (using MPI)')

View File

@@ -58,12 +58,12 @@ def nn(input, layers_sizes, reuse=None, flatten=False, name=""):
"""Creates a simple neural network
"""
for i, size in enumerate(layers_sizes):
- activation = tf.nn.relu if i < len(layers_sizes)-1 else None
+ activation = tf.nn.relu if i < len(layers_sizes) - 1 else None
input = tf.layers.dense(inputs=input,
units=size,
kernel_initializer=tf.contrib.layers.xavier_initializer(),
reuse=reuse,
- name=name+'_'+str(i))
+ name=name + '_' + str(i))
if activation:
input = activation(input)
if flatten:
@@ -85,7 +85,7 @@ def install_mpi_excepthook():
sys.excepthook = new_hook
- def mpi_fork(n):
+ def mpi_fork(n, extra_mpi_args=[]):
"""Re-launches the current script with workers
Returns "parent" for original parent, "child" for MPI children
"""
@@ -99,14 +99,10 @@ def mpi_fork(n):
IN_MPI="1"
)
# "-bind-to core" is crucial for good performance
- args = [
- "mpirun",
- "-np",
- str(n),
- "-bind-to",
- "core",
- sys.executable
- ]
+ args = ["mpirun", "-np", str(n)] + \
+ extra_mpi_args + \
+ [sys.executable]
args += sys.argv
subprocess.check_call(args, env=env)
return "parent"
@@ -140,5 +136,5 @@ def reshape_for_broadcasting(source, target):
before broadcasting it with MPI.
"""
dim = len(target.get_shape())
- shape = ([1] * (dim-1)) + [-1]
+ shape = ([1] * (dim - 1)) + [-1]
return tf.reshape(tf.cast(source, target.dtype), shape)
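`reshape_for_broadcasting` pads a 1-D statistics vector with leading singleton dimensions so it broadcasts against a batch tensor of any rank. A tiny worked sketch:

```python
import tensorflow as tf
from baselines.her.util import reshape_for_broadcasting

target = tf.zeros([32, 5, 10])                       # batch of sequences
source = tf.range(10)                                # per-feature statistic, shape (10,)
reshaped = reshape_for_broadcasting(source, target)  # shape (1, 1, 10), cast to float32
result = target + reshaped                           # broadcasts over the batch dims
```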

View File

@@ -2,6 +2,7 @@ import numpy as np
import tensorflow as tf
from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm
from baselines.common.distributions import make_pdtype
+ from baselines.common.input import observation_input
def nature_cnn(unscaled_images, **conv_kwargs):
"""
@@ -19,14 +20,12 @@ def nature_cnn(unscaled_images, **conv_kwargs):
class LnLstmPolicy(object):
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
nenv = nbatch // nsteps
- nh, nw, nc = ob_space.shape
- ob_shape = (nbatch, nh, nw, nc)
- X = tf.placeholder(tf.uint8, ob_shape) #obs
+ X, processed_x = observation_input(ob_space, nbatch)
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
self.pdtype = make_pdtype(ac_space)
with tf.variable_scope("model", reuse=reuse):
- h = nature_cnn(X)
+ h = nature_cnn(processed_x)
xs = batch_to_seq(h, nenv, nsteps)
ms = batch_to_seq(M, nenv, nsteps)
h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
@@ -56,11 +55,9 @@ class LstmPolicy(object):
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
nenv = nbatch // nsteps
- nh, nw, nc = ob_space.shape
- ob_shape = (nbatch, nh, nw, nc)
self.pdtype = make_pdtype(ac_space)
- X = tf.placeholder(tf.uint8, ob_shape) #obs
+ X, processed_x = observation_input(ob_space, nbatch)
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
with tf.variable_scope("model", reuse=reuse):
@@ -93,12 +90,10 @@ class LstmPolicy(object):
class CnnPolicy(object):
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613
- nh, nw, nc = ob_space.shape
- ob_shape = (nbatch, nh, nw, nc)
self.pdtype = make_pdtype(ac_space)
- X = tf.placeholder(tf.uint8, ob_shape) #obs
+ X, processed_x = observation_input(ob_space, nbatch)
with tf.variable_scope("model", reuse=reuse):
- h = nature_cnn(X, **conv_kwargs)
+ h = nature_cnn(processed_x, **conv_kwargs)
vf = fc(h, 'v', 1)[:,0]
self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)
@@ -120,15 +115,14 @@ class CnnPolicy(object):
class MlpPolicy(object):
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
- ob_shape = (nbatch,) + ob_space.shape
self.pdtype = make_pdtype(ac_space)
- X = tf.placeholder(tf.float32, ob_shape, name='Ob') #obs
with tf.variable_scope("model", reuse=reuse):
+ X, processed_x = observation_input(ob_space, nbatch)
activ = tf.tanh
- flatten = tf.layers.flatten
- pi_h1 = activ(fc(flatten(X), 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
+ processed_x = tf.layers.flatten(processed_x)
+ pi_h1 = activ(fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
- vf_h1 = activ(fc(flatten(X), 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
+ vf_h1 = activ(fc(processed_x, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
vf = fc(vf_h2, 'vf', 1)[:,0]

View File

@@ -236,6 +236,7 @@ def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr,
print('Saving to', savepath)
model.save(savepath)
env.close()
+ return model
def safemean(xs):
return np.nan if len(xs) == 0 else np.mean(xs)

View File

@@ -1,8 +1,9 @@
#!/usr/bin/env python3
import argparse
+ import numpy as np
from baselines.common.cmd_util import mujoco_arg_parser
from baselines import bench, logger
def train(env_id, num_timesteps, seed):
from baselines.common import set_global_seeds
from baselines.common.vec_env.vec_normalize import VecNormalize
@@ -16,27 +17,40 @@ def train(env_id, num_timesteps, seed):
intra_op_parallelism_threads=ncpu,
inter_op_parallelism_threads=ncpu)
tf.Session(config=config).__enter__()
def make_env():
env = gym.make(env_id)
- env = bench.Monitor(env, logger.get_dir())
+ env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)
return env
env = DummyVecEnv([make_env])
env = VecNormalize(env)
set_global_seeds(seed)
policy = MlpPolicy
- ppo2.learn(policy=policy, env=env, nsteps=2048, nminibatches=32,
- lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
- ent_coef=0.0,
- lr=3e-4,
- cliprange=0.2,
- total_timesteps=num_timesteps)
+ model = ppo2.learn(policy=policy, env=env, nsteps=2048, nminibatches=32,
+ lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
+ ent_coef=0.0,
+ lr=3e-4,
+ cliprange=0.2,
+ total_timesteps=num_timesteps)
+ return model, env
def main():
args = mujoco_arg_parser().parse_args()
logger.configure()
- train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
+ model, env = train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
+ if args.play:
+ logger.log("Running trained model")
+ obs = np.zeros((env.num_envs,) + env.observation_space.shape)
+ obs[:] = env.reset()
+ while True:
+ actions = model.step(obs)[0]
+ obs[:] = env.step(actions)[0]
+ env.render()
if __name__ == '__main__':