Compare commits


1 commit

Author: Jonathan Raiman
SHA1: 401a89e515
Message: add tuple pdtype
Date: 2019-05-23 15:43:48 -07:00
28 changed files with 316 additions and 193 deletions

View File

@@ -11,7 +11,7 @@ WORKDIR $CODE_DIR/baselines
# Clean up pycache and pyc files
RUN rm -rf __pycache__ && \
find . -name "*.pyc" -delete && \
pip install 'tensorflow < 2' && \
pip install tensorflow && \
pip install -e .[test]

View File

@@ -1,4 +1,4 @@
**Status:** Maintenance (expect bug fixes and minor updates)
**Status:** Active (under active development, breaking changes may occur)
<img src="data/logo.jpg" width=25% align="right" /> [![Build status](https://travis-ci.org/openai/baselines.svg?branch=master)](https://travis-ci.org/openai/baselines)
@@ -39,24 +39,21 @@ To activate a virtualenv:
A more thorough tutorial on virtualenvs and options can be found [here](https://virtualenv.pypa.io/en/stable/)
## Tensorflow versions
The master branch supports Tensorflow from version 1.4 to 1.14. For Tensorflow 2.0 support, please use tf2 branch.
## Installation
- Clone the repo and cd into it:
```bash
git clone https://github.com/openai/baselines.git
cd baselines
```
- If you don't have TensorFlow installed already, install your favourite flavor of TensorFlow. In most cases, you may use
- If you don't have TensorFlow installed already, install your favourite flavor of TensorFlow. In most cases,
```bash
pip install tensorflow-gpu==1.14 # if you have a CUDA-compatible gpu and proper drivers
pip install tensorflow-gpu # if you have a CUDA-compatible gpu and proper drivers
```
or
```bash
pip install tensorflow==1.14
pip install tensorflow
```
to install Tensorflow 1.14, which is the latest version of Tensorflow supported by the master branch. Refer to [TensorFlow installation guide](https://www.tensorflow.org/install/)
should be sufficient. Refer to [TensorFlow installation guide](https://www.tensorflow.org/install/)
for more details.
- Install baselines package
@@ -101,8 +98,6 @@ python -m baselines.run --alg=deepq --env=PongNoFrameskip-v4 --num_timesteps=1e6
```
## Saving, loading and visualizing models
### Saving and loading the model
The algorithms' serialization API is not properly unified yet; however, there is a simple method to save and restore trained models.
The `--load_path` and `--save_path` command-line options load the tensorflow state from a given path before training and save it after training, respectively.
Let's imagine you'd like to train ppo2 on Atari Pong, save the model, and then later visualize what it has learned.
@@ -116,17 +111,8 @@ python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num_timesteps=0 --
*NOTE:* Mujoco environments require normalization to work properly, so we wrap them with the VecNormalize wrapper. Currently, to ensure the models are saved with normalization (so that trained models can be restored and run without further training) the normalization coefficients are saved as tensorflow variables. This can decrease performance somewhat, so if you require high-throughput steps with Mujoco and do not need to save/restore models, it may make sense to use numpy normalization instead. To do that, set `use_tf=False` in [baselines/run.py](baselines/run.py#L116).
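Concretely, the save-then-replay pair of commands looks like this (paths are illustrative):
```bash
# train PPO2 on Atari Pong for 20M timesteps and save the final weights
python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num_timesteps=2e7 --save_path=~/models/pong_20M_ppo2
# reload the weights without further training (num_timesteps=0) and watch the agent play
python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num_timesteps=0 --load_path=~/models/pong_20M_ppo2 --play
```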
### Logging and visualizing learning curves and other training metrics
By default, all summary data, including progress and standard output, is saved to a unique directory in a temp folder, specified by a call to Python's [tempfile.gettempdir()](https://docs.python.org/3/library/tempfile.html#tempfile.gettempdir).
The directory can be changed with the `--log_path` command-line option.
```bash
python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num_timesteps=2e7 --save_path=~/models/pong_20M_ppo2 --log_path=~/logs/Pong/
```
*NOTE:* Please be aware that the logger will overwrite files of the same name in an existing directory; it is therefore recommended to give folder names a unique timestamp so that logs are not overwritten.
Another way the temp directory can be changed is through the use of the `$OPENAI_LOGDIR` environment variable.
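A minimal sketch of the environment-variable route (POSIX shell assumed; `OPENAI_LOG_FORMAT` optionally selects the output formats):
```bash
export OPENAI_LOGDIR=$HOME/logs/Pong
export OPENAI_LOG_FORMAT=stdout,csv,tensorboard
python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num_timesteps=2e7
```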
For examples on how to load and display the training data, see [here](docs/viz/viz.ipynb).
## Loading and visualizing learning curves and other training metrics
See [here](docs/viz/viz.ipynb) for instructions on how to load and display the training data.
## Subpackages

View File

@@ -6,7 +6,7 @@ from baselines import logger
from baselines.common import set_global_seeds
from baselines.common.policies import build_policy
from baselines.common.tf_util import get_session, save_variables, load_variables
from baselines.common.tf_util import get_session, save_variables
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
from baselines.a2c.utils import batch_to_seq, seq_to_batch
@@ -216,8 +216,7 @@ class Model(object):
self.train = train
self.save = functools.partial(save_variables, sess=sess)
self.load = functools.partial(load_variables, sess=sess)
self.save = functools.partial(save_variables, sess=sess, variables=params)
self.train_model = train_model
self.step_model = step_model
self._step = _step
@@ -359,9 +358,6 @@ def learn(network, env, seed=None, nsteps=20, total_timesteps=int(80e6), q_coef=
total_timesteps=total_timesteps, lrschedule=lrschedule, c=c,
trust_region=trust_region, alpha=alpha, delta=delta)
if load_path is not None:
model.load(load_path)
runner = Runner(env=env, model=model, nsteps=nsteps)
if replay_ratio > 0:
buffer = Buffer(env=env, nsteps=nsteps, size=buffer_size)

View File

@@ -1,3 +1,2 @@
# flake8: noqa F403
from baselines.bench.benchmarks import *
from baselines.bench.monitor import *

View File

@@ -1,4 +1,5 @@
import re
import os.path as osp
import os
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

View File

@@ -1,11 +1,13 @@
__all__ = ['Monitor', 'get_monitor_files', 'load_results']
import gym
from gym.core import Wrapper
import time
from glob import glob
import csv
import os.path as osp
import json
import numpy as np
class Monitor(Wrapper):
EXT = "monitor.csv"
@@ -77,7 +79,6 @@ class Monitor(Wrapper):
self.total_steps += 1
def close(self):
super(Monitor, self).close()
if self.f is not None:
self.f.close()
@@ -161,3 +162,27 @@ def load_results(dir):
df['t'] -= min(header['t_start'] for header in headers)
df.headers = headers # HACK to preserve backwards compatibility
return df
def test_monitor():
import pandas
import os
import uuid
env = gym.make("CartPole-v1")
env.seed(0)
mon_file = "/tmp/baselines-test-%s.monitor.csv" % uuid.uuid4()
menv = Monitor(env, mon_file)
menv.reset()
for _ in range(1000):
_, _, done, _ = menv.step(0)
if done:
menv.reset()
f = open(mon_file, 'rt')
firstline = f.readline()
assert firstline.startswith('#')
metadata = json.loads(firstline[1:])
assert metadata['env_id'] == "CartPole-v1"
assert set(metadata.keys()) == {'env_id', 'gym_version', 't_start'}, "Incorrect keys in monitor metadata"
last_logline = pandas.read_csv(f, index_col=None)
assert set(last_logline.keys()) == {'l', 't', 'r'}, "Incorrect keys in monitor logline"
f.close()
os.remove(mon_file)

View File

@@ -1,31 +0,0 @@
from .monitor import Monitor
import gym
import json
def test_monitor():
import pandas
import os
import uuid
env = gym.make("CartPole-v1")
env.seed(0)
mon_file = "/tmp/baselines-test-%s.monitor.csv" % uuid.uuid4()
menv = Monitor(env, mon_file)
menv.reset()
for _ in range(1000):
_, _, done, _ = menv.step(0)
if done:
menv.reset()
f = open(mon_file, 'rt')
firstline = f.readline()
assert firstline.startswith('#')
metadata = json.loads(firstline[1:])
assert metadata['env_id'] == "CartPole-v1"
assert set(metadata.keys()) == {'env_id', 't_start'}, "Incorrect keys in monitor metadata"
last_logline = pandas.read_csv(f, index_col=None)
assert set(last_logline.keys()) == {'l', 't', 'r'}, "Incorrect keys in monitor logline"
f.close()
os.remove(mon_file)

View File

@@ -254,13 +254,6 @@ class LazyFrames(object):
return len(self._force())
def __getitem__(self, i):
return self._force()[i]
def count(self):
frames = self._force()
return frames.shape[frames.ndim - 1]
def frame(self, i):
return self._force()[..., i]
def make_atari(env_id, max_episode_steps=None):

View File

@@ -9,7 +9,7 @@ except ImportError:
MPI = None
import gym
from gym.wrappers import FlattenObservation, FilterObservation
from gym.wrappers import FlattenDictWrapper
from baselines import logger
from baselines.bench import Monitor
from baselines.common import set_global_seeds
@@ -81,7 +81,8 @@ def make_env(env_id, env_type, mpi_rank=0, subrank=0, seed=None, reward_scale=1.
env = gym.make(env_id, **env_kwargs)
if flatten_dict_observations and isinstance(env.observation_space, gym.spaces.Dict):
env = FlattenObservation(env)
keys = env.observation_space.spaces.keys()
env = gym.wrappers.FlattenDictWrapper(env, dict_keys=list(keys))
env.seed(seed + subrank if seed is not None else None)
env = Monitor(env,
@@ -127,7 +128,7 @@ def make_robotics_env(env_id, seed, rank=0):
"""
set_global_seeds(seed)
env = gym.make(env_id)
env = FlattenObservation(FilterObservation(env, ['observation', 'desired_goal']))
env = FlattenDictWrapper(env, ['observation', 'desired_goal'])
env = Monitor(
env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
info_keywords=('is_success',))
@@ -169,7 +170,6 @@ def common_arg_parser():
parser.add_argument('--save_path', help='Path to save trained model to', default=None, type=str)
parser.add_argument('--save_video_interval', help='Save video every x steps (0 = disabled)', default=0, type=int)
parser.add_argument('--save_video_length', help='Length of recorded video. Default: 200', default=200, type=int)
parser.add_argument('--log_path', help='Directory to save learning curve data.', default=None, type=str)
parser.add_argument('--play', default=False, action='store_true')
return parser
@@ -186,7 +186,7 @@ def robotics_arg_parser():
def parse_unknown_args(args):
"""
Parse arguments not consumed by arg parser into a dictionary
Parse arguments not consumed by arg parser into a dicitonary
"""
retval = {}
preceded_by_key = False
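On a hypothetical input the intended behavior looks like this (values stay strings; both `--key value` and `--key=value` forms are assumed to be accepted):
```python
parse_unknown_args(['--lr', '3e-4', '--gamma=0.99'])
# -> {'lr': '3e-4', 'gamma': '0.99'}
```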

View File

@@ -275,6 +275,133 @@ class BernoulliPd(Pd):
def fromflat(cls, flat):
return cls(flat)
def _np_cast(x, dtype):
"""Numpy cast, equivalent to tf.cast"""
return x.astype(dtype)
def decode_tuple_sample(pdtypes, x):
"""
Cast and convert a sample from its dense concatenated state back to constituent parts.
Arguments
---------
:param pdtypes: list<PdType>, a TuplePdType's child PdTypes.
:param x: np.ndarray or tf.Tensor.
Shape is [..., sum(pdtype.sample_shape for pdtype in pdtypes)]
:return output, list<np.ndarray> or list<tf.Tensor>, the split and correctly cast
policy samples.
"""
if isinstance(x, np.ndarray):
cast_fn = _np_cast
numpy_casting = True
else:
cast_fn = tf.cast
numpy_casting = False
so_far = 0
xs = []
for pdtype in pdtypes:
sample_size = pdtype.sample_shape()[0] if len(pdtype.sample_shape()) > 0 else 1
if len(pdtype.sample_shape()) == 0:
slided_x = x[..., so_far]
else:
slided_x = x[..., so_far:so_far + sample_size]
desired_dtype = pdtype.sample_dtype()
if numpy_casting:
desired_dtype = desired_dtype.as_numpy_dtype
if desired_dtype != x.dtype:
slided_x = cast_fn(slided_x, desired_dtype)
xs.append(slided_x)
so_far += sample_size
return xs
class TuplePd(Pd):
def __init__(self, sample_dtype, pdtypes, logits):
self.pdtypes = pdtypes
self.sample_dtype = sample_dtype
self.pds = []
so_far = 0
for pdtype in self.pdtypes:
param_shape = pdtype.param_shape()[0]
self.pds.append(pdtype.pdfromflat(logits[..., so_far:so_far + param_shape]))
so_far += param_shape
def flatparam(self):
return tf.concat([pd.flatparam() for pd in self.pds], axis=-1)
def mode(self):
return self.tuple_sample_concat([pd.mode() for pd in self.pds])
def tuple_sample_concat(self, samples):
out = []
for sample, pdtype in zip(samples, self.pdtypes):
if len(pdtype.sample_shape()) == 0:
sample = tf.expand_dims(sample, axis=-1)
if sample.dtype != self.sample_dtype:
sample = tf.cast(sample, self.sample_dtype)
out.append(sample)
return tf.concat(out, axis=-1)
def sample(self):
return self.tuple_sample_concat([pd.sample() for pd in self.pds])
def neglogp(self, x):
return tf.add_n([pd.neglogp(xi) for pd, xi in zip(self.pds, decode_tuple_sample(self.pdtypes, x))])
def entropy(self):
return tf.add_n([pd.entropy() for pd in self.pds])
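Because the constituent distributions are treated as independent, the joint density factorizes, so the negative log-probability of a concatenated sample is the sum of the per-component terms, `-log p(x) = -sum_i log p_i(x_i)`; independence likewise makes entropies additive, which is why both `neglogp` and `entropy` reduce with `tf.add_n`.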
def _dtype_promotion(old, new):
"""
Find the highest precision common ground between two tensorflow datatypes.
if old is None, it is ignored.
"""
if old is None or (new.is_floating and old.is_integer):
return new
if old.is_floating and new.is_integer:
return old
if (old.is_floating and new.is_floating) or (old.is_integer and new.is_integer):
# take the largest type (e.g. float64 over float32)
return old if old.size > new.size else new
raise ValueError("No idea how to promote {} and {}.".format(old, new))
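A few concrete promotions under these rules (a sketch; sizes come from `tf.DType.size`):
```python
_dtype_promotion(None, tf.int32)          # -> tf.int32   (old is None, take new)
_dtype_promotion(tf.int32, tf.float32)    # -> tf.float32 (float wins over int)
_dtype_promotion(tf.float32, tf.int64)    # -> tf.float32 (float wins over int)
_dtype_promotion(tf.float32, tf.float64)  # -> tf.float64 (larger of the two floats)
```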
class TuplePdType(PdType):
def __init__(self, space):
self.internal_pdtypes = [make_pdtype(space) for space in space.spaces]
def decode_sample(self, x):
return decode_tuple_sample(self.internal_pdtypes, x)
def pdclass(self):
return TuplePd
def pdfromflat(self, flat):
return TuplePd(self.sample_dtype(), self.internal_pdtypes, flat)
def param_shape(self):
return [sum([pdtype.param_shape()[0]
for pdtype in self.internal_pdtypes])]
def sample_shape(self):
return [sum([pdtype.sample_shape()[0] if len(pdtype.sample_shape()) > 0 else 1
for pdtype in self.internal_pdtypes])]
def sample_dtype(self):
dtype = None
for pdtype in self.internal_pdtypes:
dtype = _dtype_promotion(dtype, pdtype.sample_dtype())
return dtype
def make_pdtype(ac_space):
from gym import spaces
if isinstance(ac_space, spaces.Box):
@@ -286,9 +413,12 @@ def make_pdtype(ac_space):
return MultiCategoricalPdType(ac_space.nvec)
elif isinstance(ac_space, spaces.MultiBinary):
return BernoulliPdType(ac_space.n)
elif isinstance(ac_space, spaces.Tuple):
return TuplePdType(ac_space)
else:
raise NotImplementedError
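A minimal end-to-end sketch of the new tuple path (assumes TensorFlow 1.x and gym; the shapes follow the `param_shape`/`sample_shape` definitions above):
```python
import numpy as np
from gym import spaces
from baselines.common.distributions import make_pdtype

space = spaces.Tuple((spaces.Discrete(4),
                      spaces.Box(low=-1.0, high=1.0, shape=(2,), dtype=np.float32)))
pdtype = make_pdtype(space)   # -> TuplePdType
pdtype.param_shape()          # [8]: 4 categorical logits + 2*2 gaussian mean/logstd params
pdtype.sample_shape()         # [3]: the scalar discrete sample is counted as length 1
pdtype.sample_dtype()         # float32: promotion of the int and float component dtypes

# a dense sample splits back into per-space pieces, cast to each component's dtype
x = np.array([[2.0, 0.5, -0.25]], dtype=np.float32)
parts = pdtype.decode_sample(x)  # [int array of shape (1,), float32 array of shape (1, 2)]
```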
def shape_el(v, i):
maybe = v.get_shape()[i]
if maybe is not None:

View File

@@ -65,7 +65,7 @@ def check_synced(localval, comm=None):
vals = comm.gather(localval)
if comm.rank == 0:
assert all(val==vals[0] for val in vals[1:]),\
'MpiAdamOptimizer detected that different workers have different weights: {}'.format(vals)
f'MpiAdamOptimizer detected that different workers have different weights: {vals}'
@with_mpi(timeout=5)
def test_nonfreeze():

View File

@@ -12,9 +12,8 @@ def mpi_mean(x, axis=0, comm=None, keepdims=False):
localsum = np.zeros(n+1, x.dtype)
localsum[:n] = xsum.ravel()
localsum[n] = x.shape[axis]
# globalsum = np.zeros_like(localsum)
# comm.Allreduce(localsum, globalsum, op=MPI.SUM)
globalsum = comm.allreduce(localsum, op=MPI.SUM)
globalsum = np.zeros_like(localsum)
comm.Allreduce(localsum, globalsum, op=MPI.SUM)
return globalsum[:n].reshape(xsum.shape) / globalsum[n], globalsum[n]
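The trick is to pack the local sum and the local element count into a single buffer so that one `Allreduce` yields both the global numerator and denominator; a standalone sketch with mpi4py (file name hypothetical):
```python
# run under e.g. `mpiexec -n 4 python mean_demo.py`
from mpi4py import MPI
import numpy as np

comm = MPI.COMM_WORLD
x = np.random.randn(3, 5)                        # each rank holds different rows
xsum = x.sum(axis=0)                             # local sum along the reduced axis
localsum = np.concatenate([xsum.ravel(), [x.shape[0]]])
globalsum = np.zeros_like(localsum)
comm.Allreduce(localsum, globalsum, op=MPI.SUM)  # elementwise sum across ranks
mean = globalsum[:-1].reshape(xsum.shape) / globalsum[-1]
count = globalsum[-1]                            # total row count across all ranks
```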
def mpi_moments(x, axis=0, comm=None, keepdims=False):

View File

@@ -70,11 +70,9 @@ class ShmemVecEnv(VecEnv):
assert len(actions) == len(self.parent_pipes)
for pipe, act in zip(self.parent_pipes, actions):
pipe.send(('step', act))
self.waiting_step = True
def step_wait(self):
outs = [pipe.recv() for pipe in self.parent_pipes]
self.waiting_step = False
obs, rews, dones, infos = zip(*outs)
return self._decode_obses(obs), np.array(rews), np.array(dones), infos

View File

@@ -4,36 +4,33 @@ import numpy as np
from .vec_env import VecEnv, CloudpickleWrapper, clear_mpi_env_vars
def worker(remote, parent_remote, env_fn_wrappers):
def step_env(env, action):
ob, reward, done, info = env.step(action)
if done:
ob = env.reset()
return ob, reward, done, info
def worker(remote, parent_remote, env_fn_wrapper):
parent_remote.close()
envs = [env_fn_wrapper() for env_fn_wrapper in env_fn_wrappers.x]
env = env_fn_wrapper.x()
try:
while True:
cmd, data = remote.recv()
if cmd == 'step':
remote.send([step_env(env, action) for env, action in zip(envs, data)])
ob, reward, done, info = env.step(data)
if done:
ob = env.reset()
remote.send((ob, reward, done, info))
elif cmd == 'reset':
remote.send([env.reset() for env in envs])
ob = env.reset()
remote.send(ob)
elif cmd == 'render':
remote.send([env.render(mode='rgb_array') for env in envs])
remote.send(env.render(mode='rgb_array'))
elif cmd == 'close':
remote.close()
break
elif cmd == 'get_spaces_spec':
remote.send(CloudpickleWrapper((envs[0].observation_space, envs[0].action_space, envs[0].spec)))
remote.send((env.observation_space, env.action_space, env.spec))
else:
raise NotImplementedError
except KeyboardInterrupt:
print('SubprocVecEnv worker: got KeyboardInterrupt')
finally:
for env in envs:
env.close()
env.close()
class SubprocVecEnv(VecEnv):
@@ -41,23 +38,17 @@ class SubprocVecEnv(VecEnv):
VecEnv that runs multiple environments in parallel in subprocesses and communicates with them via pipes.
Recommended to use when num_envs > 1 and step() can be a bottleneck.
"""
def __init__(self, env_fns, spaces=None, context='spawn', in_series=1):
def __init__(self, env_fns, spaces=None, context='spawn'):
"""
Arguments:
env_fns: iterable of callables - functions that create environments to run in subprocesses. Need to be cloud-pickleable
in_series: number of environments to run in series in a single process
(e.g. when len(env_fns) == 12 and in_series == 3, it will run 4 processes, each running 3 envs in series)
"""
self.waiting = False
self.closed = False
self.in_series = in_series
nenvs = len(env_fns)
assert nenvs % in_series == 0, "Number of envs must be divisible by number of envs to run in series"
self.nremotes = nenvs // in_series
env_fns = np.array_split(env_fns, self.nremotes)
ctx = mp.get_context(context)
self.remotes, self.work_remotes = zip(*[ctx.Pipe() for _ in range(self.nremotes)])
self.remotes, self.work_remotes = zip(*[ctx.Pipe() for _ in range(nenvs)])
self.ps = [ctx.Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn)))
for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)]
for p in self.ps:
@@ -68,13 +59,12 @@ class SubprocVecEnv(VecEnv):
remote.close()
self.remotes[0].send(('get_spaces_spec', None))
observation_space, action_space, self.spec = self.remotes[0].recv().x
observation_space, action_space, self.spec = self.remotes[0].recv()
self.viewer = None
VecEnv.__init__(self, nenvs, observation_space, action_space)
VecEnv.__init__(self, len(env_fns), observation_space, action_space)
def step_async(self, actions):
self._assert_not_closed()
actions = np.array_split(actions, self.nremotes)
for remote, action in zip(self.remotes, actions):
remote.send(('step', action))
self.waiting = True
@@ -82,7 +72,6 @@ class SubprocVecEnv(VecEnv):
def step_wait(self):
self._assert_not_closed()
results = [remote.recv() for remote in self.remotes]
results = _flatten_list(results)
self.waiting = False
obs, rews, dones, infos = zip(*results)
return _flatten_obs(obs), np.stack(rews), np.stack(dones), infos
@@ -91,9 +80,7 @@ class SubprocVecEnv(VecEnv):
self._assert_not_closed()
for remote in self.remotes:
remote.send(('reset', None))
obs = [remote.recv() for remote in self.remotes]
obs = _flatten_list(obs)
return _flatten_obs(obs)
return _flatten_obs([remote.recv() for remote in self.remotes])
def close_extras(self):
self.closed = True
@@ -110,7 +97,6 @@ class SubprocVecEnv(VecEnv):
for pipe in self.remotes:
pipe.send(('render', None))
imgs = [pipe.recv() for pipe in self.remotes]
imgs = _flatten_list(imgs)
return imgs
def _assert_not_closed(self):
@@ -129,10 +115,3 @@ def _flatten_obs(obs):
return {k: np.stack([o[k] for o in obs]) for k in keys}
else:
return np.stack(obs)
def _flatten_list(l):
assert isinstance(l, (list, tuple))
assert len(l) > 0
assert all([len(l_) > 0 for l_ in l])
return [l__ for l_ in l for l__ in l_]
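For reference, the series-batching arithmetic that `in_series` relies on is chunk-then-flatten; a self-contained sketch of the bookkeeping:
```python
import numpy as np

num_envs, in_series = 12, 3
nremotes = num_envs // in_series                 # 4 worker processes, 3 envs each
actions = list(range(num_envs))
per_remote = np.array_split(actions, nremotes)   # one chunk of actions per process

# each worker returns one result per env it runs in series...
results = [[('obs', int(a)) for a in chunk] for chunk in per_remote]
# ...and flattening restores the flat per-env order (what _flatten_list does)
flat = [r for rs in results for r in rs]
assert len(flat) == num_envs
```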

View File

@@ -67,50 +67,6 @@ def test_vec_env(klass, dtype): # pylint: disable=R0914
assert_venvs_equal(env1, env2, num_steps=num_steps)
@pytest.mark.parametrize('dtype', ('uint8', 'float32'))
@pytest.mark.parametrize('num_envs_in_series', (3, 4, 6))
def test_sync_sampling(dtype, num_envs_in_series):
"""
Test that a SubprocVecEnv running with envs in series
outputs the same as DummyVecEnv.
"""
num_envs = 12
num_steps = 100
shape = (3, 8)
def make_fn(seed):
"""
Get an environment constructor with a seed.
"""
return lambda: SimpleEnv(seed, shape, dtype)
fns = [make_fn(i) for i in range(num_envs)]
env1 = DummyVecEnv(fns)
env2 = SubprocVecEnv(fns, in_series=num_envs_in_series)
assert_venvs_equal(env1, env2, num_steps=num_steps)
@pytest.mark.parametrize('dtype', ('uint8', 'float32'))
@pytest.mark.parametrize('num_envs_in_series', (3, 4, 6))
def test_sync_sampling_sanity(dtype, num_envs_in_series):
"""
Test that a SubprocVecEnv running with envs in series
outputs the same as SubprocVecEnv without running in series.
"""
num_envs = 12
num_steps = 100
shape = (3, 8)
def make_fn(seed):
"""
Get an environment constructor with a seed.
"""
return lambda: SimpleEnv(seed, shape, dtype)
fns = [make_fn(i) for i in range(num_envs)]
env1 = SubprocVecEnv(fns)
env2 = SubprocVecEnv(fns, in_series=num_envs_in_series)
assert_venvs_equal(env1, env2, num_steps=num_steps)
class SimpleEnv(gym.Env):
"""
An environment with a pre-determined observation space

View File

@@ -38,9 +38,6 @@ def obs_space_info(obs_space):
if isinstance(obs_space, gym.spaces.Dict):
assert isinstance(obs_space.spaces, OrderedDict)
subspaces = obs_space.spaces
elif isinstance(obs_space, gym.spaces.Tuple):
assert isinstance(obs_space.spaces, tuple)
subspaces = {i: obs_space.spaces[i] for i in range(len(obs_space.spaces))}
else:
subspaces = {None: obs_space}
keys = []

View File

@@ -378,6 +378,11 @@ class DDPG(object):
self.param_noise_stddev: self.param_noise.current_stddev,
})
if MPI is not None:
mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
else:
mean_distance = distance
if MPI is not None:
mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
else:

View File

@@ -13,7 +13,7 @@ The functions in this file are used to create the following functions:
stochastic: bool
if set to False all the actions are always deterministic (default False)
update_eps_ph: float
update epsilon a new value, if negative no update happens
update epsilon a new value, if negative not update happens
(default: no update)
Returns

View File

@@ -142,8 +142,9 @@ def learn(env,
final value of random action probability
train_freq: int
update the model every `train_freq` steps.
set to None to disable printing
batch_size: int
size of a batch sampled from replay buffer for training
size of a batched sampled from replay buffer for training
print_freq: int
how often to print out training progress
set to None to disable printing

View File

@@ -2,6 +2,101 @@ import tensorflow as tf
import tensorflow.contrib.layers as layers
def _mlp(hiddens, input_, num_actions, scope, reuse=False, layer_norm=False):
with tf.variable_scope(scope, reuse=reuse):
out = input_
for hidden in hiddens:
out = layers.fully_connected(out, num_outputs=hidden, activation_fn=None)
if layer_norm:
out = layers.layer_norm(out, center=True, scale=True)
out = tf.nn.relu(out)
q_out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None)
return q_out
def mlp(hiddens=[], layer_norm=False):
"""This model takes as input an observation and returns values of all actions.
Parameters
----------
hiddens: [int]
list of sizes of hidden layers
layer_norm: bool
if true applies layer normalization for every layer
as described in https://arxiv.org/abs/1607.06450
Returns
-------
q_func: function
q_function for DQN algorithm.
"""
return lambda *args, **kwargs: _mlp(hiddens, layer_norm=layer_norm, *args, **kwargs)
def _cnn_to_mlp(convs, hiddens, dueling, input_, num_actions, scope, reuse=False, layer_norm=False):
with tf.variable_scope(scope, reuse=reuse):
out = input_
with tf.variable_scope("convnet"):
for num_outputs, kernel_size, stride in convs:
out = layers.convolution2d(out,
num_outputs=num_outputs,
kernel_size=kernel_size,
stride=stride,
activation_fn=tf.nn.relu)
conv_out = layers.flatten(out)
with tf.variable_scope("action_value"):
action_out = conv_out
for hidden in hiddens:
action_out = layers.fully_connected(action_out, num_outputs=hidden, activation_fn=None)
if layer_norm:
action_out = layers.layer_norm(action_out, center=True, scale=True)
action_out = tf.nn.relu(action_out)
action_scores = layers.fully_connected(action_out, num_outputs=num_actions, activation_fn=None)
if dueling:
with tf.variable_scope("state_value"):
state_out = conv_out
for hidden in hiddens:
state_out = layers.fully_connected(state_out, num_outputs=hidden, activation_fn=None)
if layer_norm:
state_out = layers.layer_norm(state_out, center=True, scale=True)
state_out = tf.nn.relu(state_out)
state_score = layers.fully_connected(state_out, num_outputs=1, activation_fn=None)
action_scores_mean = tf.reduce_mean(action_scores, 1)
action_scores_centered = action_scores - tf.expand_dims(action_scores_mean, 1)
q_out = state_score + action_scores_centered
else:
q_out = action_scores
return q_out
def cnn_to_mlp(convs, hiddens, dueling=False, layer_norm=False):
"""This model takes as input an observation and returns values of all actions.
Parameters
----------
convs: [(int, int, int)]
list of convolutional layers in form of
(num_outputs, kernel_size, stride)
hiddens: [int]
list of sizes of hidden layers
dueling: bool
if true double the output MLP to compute a baseline
for action scores
layer_norm: bool
if true applies layer normalization for every layer
as described in https://arxiv.org/abs/1607.06450
Returns
-------
q_func: function
q_function for DQN algorithm.
"""
return lambda *args, **kwargs: _cnn_to_mlp(convs, hiddens, dueling, layer_norm=layer_norm, *args, **kwargs)
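A typical instantiation (the conv stack below is the common Atari configuration; illustrative rather than prescriptive):
```python
# dueling CNN-to-MLP Q-function template for an 84x84 Atari-style input
q_func = cnn_to_mlp(
    convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],  # (num_outputs, kernel_size, stride)
    hiddens=[256],
    dueling=True,
)
# the returned builder is later called as q_func(input_placeholder, num_actions, scope)
```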
def build_q_func(network, hiddens=[256], dueling=True, layer_norm=False, **network_kwargs):
if isinstance(network, str):
from baselines.common.models import get_network_builder

View File

@@ -23,7 +23,7 @@ from baselines.gail.dataset.mujoco_dset import Mujoco_Dset
def argsparser():
parser = argparse.ArgumentParser("Tensorflow Implementation of Behavior Cloning")
parser.add_argument('--env_id', help='environment ID', default='Hopper-v2')
parser.add_argument('--env_id', help='environment ID', default='Hopper-v1')
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
parser.add_argument('--expert_path', type=str, default='data/deterministic.trpo.Hopper.0.00.npz')
parser.add_argument('--checkpoint_dir', help='the directory to save model', default='checkpoint')
@@ -73,7 +73,7 @@ def learn(env, policy_func, dataset, optim_batch_size=128, max_iters=1e4,
savedir_fname = tempfile.TemporaryDirectory().name
else:
savedir_fname = osp.join(ckpt_dir, task_name)
U.save_variables(savedir_fname, variables=pi.get_variables())
U.save_state(savedir_fname, var_list=pi.get_variables())
return savedir_fname

View File

@@ -77,7 +77,7 @@ class Mujoco_Dset(object):
self.log_info()
def log_info(self):
logger.log("Total trajectories: %d" % self.num_traj)
logger.log("Total trajectorues: %d" % self.num_traj)
logger.log("Total transitions: %d" % self.num_transition)
logger.log("Average returns: %f" % self.avg_ret)
logger.log("Std for returns: %f" % self.std_ret)

View File

@@ -165,7 +165,7 @@ def runner(env, policy_func, load_model_path, timesteps_per_batch, number_trajs,
U.initialize()
# Prepare for rollouts
# ----------------------------------------
U.load_variables(load_model_path)
U.load_state(load_model_path)
obs_list = []
acs_list = []

View File

@@ -15,7 +15,8 @@ class RolloutWorker:
"""Rollout worker generates experience by interacting with one or many environments.
Args:
venv: vectorized gym environments.
make_env (function): a factory function that creates a new instance of the environment
when called
policy (object): the policy that is used to act
dims (dict of ints): the dimensions for observations (o), goals (g), and actions (u)
logger (object): the logger that is used by the rollout worker

View File

@@ -379,8 +379,7 @@ def configure(dir=None, format_strs=None, comm=None, log_suffix=''):
dir = osp.join(tempfile.gettempdir(),
datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"))
assert isinstance(dir, str)
dir = os.path.expanduser(dir)
os.makedirs(os.path.expanduser(dir), exist_ok=True)
os.makedirs(dir, exist_ok=True)
rank = get_rank_without_mpi_import()
if rank > 0:
@@ -395,8 +394,7 @@ def configure(dir=None, format_strs=None, comm=None, log_suffix=''):
output_formats = [make_output_format(f, dir, log_suffix) for f in format_strs]
Logger.CURRENT = Logger(dir=dir, output_formats=output_formats, comm=comm)
if output_formats:
log('Logging to %s'%dir)
log('Logging to %s'%dir)
def _configure_default_logger():
configure()

View File

@@ -32,7 +32,7 @@ except ImportError:
_game_envs = defaultdict(set)
for env in gym.envs.registry.all():
# TODO: solve this with regexes
env_type = env.entry_point.split(':')[0].split('.')[-1]
env_type = env._entry_point.split(':')[0].split('.')[-1]
_game_envs[env_type].add(env.id)
# reading benchmark names directly from retro requires
@@ -126,7 +126,7 @@ def get_env_type(args):
# Re-parse the gym registry, since we could have new envs since last time.
for env in gym.envs.registry.all():
env_type = env.entry_point.split(':')[0].split('.')[-1]
env_type = env._entry_point.split(':')[0].split('.')[-1]
_game_envs[env_type].add(env.id) # This is a set so add is idempotent
if env_id in _game_envs.keys():
@@ -192,12 +192,6 @@ def parse_cmdline_kwargs(args):
return {k: parse(v) for k,v in parse_unknown_args(args).items()}
def configure_logger(log_path, **kwargs):
if log_path is not None:
logger.configure(log_path)
else:
logger.configure(**kwargs)
def main(args):
# configure logger, disable logging in child MPI processes (with rank > 0)
@@ -208,10 +202,10 @@ def main(args):
if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
rank = 0
configure_logger(args.log_path)
logger.configure()
else:
logger.configure(format_strs=[])
rank = MPI.COMM_WORLD.Get_rank()
configure_logger(args.log_path, format_strs=[])
model, env = train(args, extra_args)
@@ -226,7 +220,7 @@ def main(args):
state = model.initial_state if hasattr(model, 'initial_state') else None
dones = np.zeros((1,))
episode_rew = np.zeros(env.num_envs) if isinstance(env, VecEnv) else np.zeros(1)
episode_rew = 0
while True:
if state is not None:
actions, _, state, _ = model.step(obs,S=state, M=dones)
@@ -234,13 +228,13 @@ def main(args):
actions, _, _, _ = model.step(obs)
obs, rew, done, _ = env.step(actions)
episode_rew += rew
episode_rew += rew[0] if isinstance(env, VecEnv) else rew
env.render()
done_any = done.any() if isinstance(done, np.ndarray) else done
if done_any:
for i in np.nonzero(done)[0]:
print('episode_rew={}'.format(episode_rew[i]))
episode_rew[i] = 0
done = done.any() if isinstance(done, np.ndarray) else done
if done:
print('episode_rew={}'.format(episode_rew))
episode_rew = 0
obs = env.reset()
env.close()
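Put together, the vectorized reward bookkeeping above reads as follows (a sketch assuming a `VecEnv` `env` and a trained `model`):
```python
episode_rew = np.zeros(env.num_envs)
obs = env.reset()
while True:
    actions, _, _, _ = model.step(obs)
    obs, rew, done, _ = env.step(actions)
    episode_rew += rew                    # rew is a vector, one entry per env
    env.render()
    for i in np.nonzero(done)[0]:         # envs that just finished an episode
        print('episode_rew={}'.format(episode_rew[i]))
        episode_rew[i] = 0                # reset only the finished envs
```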

View File

@@ -4,3 +4,4 @@ exclude =
.git,
__pycache__,
baselines/ppo1,
baselines/bench,

View File

@@ -31,7 +31,7 @@ setup(name='baselines',
packages=[package for package in find_packages()
if package.startswith('baselines')],
install_requires=[
'gym>=0.15.4, <0.16.0',
'gym>=0.10.0, <1.0.0',
'scipy',
'tqdm',
'joblib',
@@ -44,7 +44,7 @@ setup(name='baselines',
author='OpenAI',
url='https://github.com/openai/baselines',
author_email='gym@openai.com',
version='0.1.6')
version='0.1.5')
# ensure there is some tensorflow build with version above 1.4