lstm network builders using tf lstm

merged refactor
2018-08-10 14:23:45 -07:00 · 2018-08-10 14:14:46 -07:00
18 changed files with 93 additions and 18396 deletions
--- a/.benchmark_pattern
+++ b/.benchmark_pattern
@@ -1 +1 @@
-
+ppo2
--- a/README.md
+++ b/README.md
@@ -112,6 +112,10 @@ This should get to the mean reward per episode about 5k. To load and visualize t
 *NOTE:* At the moment Mujoco training uses VecNormalize wrapper for the environment which is not being saved correctly; so loading the models trained on Mujoco will not work well if the environment is recreated. If necessary, you can work around that by replacing RunningMeanStd by TfRunningMeanStd in [baselines/common/vec_env/vec_normalize.py](baselines/common/vec_env/vec_normalize.py#L12). This way, mean and std of environment normalizing wrapper will be saved in tensorflow variables and included in the model file; however, training is slower that way - hence not including it by default


+
+
+
+
 ## Subpackages

 - [A2C](baselines/a2c)
@@ -121,19 +125,10 @@ This should get to the mean reward per episode about 5k. To load and visualize t
 - [DQN](baselines/deepq)
 - [GAIL](baselines/gail)
 - [HER](baselines/her)
- [PPO1](baselines/ppo1) (obsolete version, left here temporarily)
- [PPO2](baselines/ppo2) 
+- [PPO1](baselines/ppo1) (Multi-CPU using MPI)
+- [PPO2](baselines/ppo2) (Optimized for GPU)
 - [TRPO](baselines/trpo_mpi)

-
-
-## Benchmarks
-Results of benchmarks on Mujoco (1M timesteps) and Atari (10M timesteps) are available 
-[here for Mujoco](https://htmlpreview.github.com/?https://github.com/openai/baselines/blob/master/benchmarks_mujoco1M.htm) 
-and
-[here for Atari](https://htmlpreview.github.com/?https://github.com/openai/baselines/blob/master/benchmarks_atari10M.htm) 
-respectively. Note that these results may be not on the latest version of the code, particular commit hash with which results were obtained is specified on the benchmarks page. 
-
 To cite this repository in publications:

    @misc{baselines,
@@ -144,4 +139,3 @@ To cite this repository in publications:
      journal = {GitHub repository},
      howpublished = {\url{https://github.com/openai/baselines}},
    }
-
--- a/baselines/common/atari_wrappers.py
+++ b/baselines/common/atari_wrappers.py
@@ -156,7 +156,7 @@ class FrameStack(gym.Wrapper):
        self.k = k
        self.frames = deque([], maxlen=k)
        shp = env.observation_space.shape
-        self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=env.observation_space.dtype)
+        self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=np.uint8)

    def reset(self):
        ob = self.env.reset()
@@ -176,7 +176,6 @@ class FrameStack(gym.Wrapper):
 class ScaledFloatFrame(gym.ObservationWrapper):
    def __init__(self, env):
        gym.ObservationWrapper.__init__(self, env)
-        self.observation_space = gym.spaces.Box(low=0, high=1, shape=env.observation_space.shape, dtype=np.float32)

    def observation(self, observation):
        # careful! This undoes the memory optimization, use
--- a/baselines/common/models.py
+++ b/baselines/common/models.py
@@ -138,7 +138,7 @@ def conv_only(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], **conv_kwargs):
    '''

    def network_fn(X):
-        out = tf.cast(X, tf.float32) / 255.
+        out = X
        with tf.variable_scope("convnet"):
            for num_outputs, kernel_size, stride in convs:
                out = layers.convolution2d(out,
--- a/baselines/common/tests/test_fixed_sequence.py
+++ b/baselines/common/tests/test_fixed_sequence.py
@@ -6,7 +6,8 @@ from baselines.run import get_learn_function

 common_kwargs = dict(
    seed=0,
-    total_timesteps=50000,
+    total_timesteps=20000,
+    nlstm=64
 )
    
 learn_kwargs = {
@@ -19,7 +20,7 @@ learn_kwargs = {


 alg_list = learn_kwargs.keys()
-rnn_list = ['lstm']
+rnn_list = ['lstm', 'tflstm', 'tflstm_static']

@pytest.mark.slow
@pytest.mark.parametrize("alg", alg_list)
@@ -41,11 +42,11 @@ def test_fixed_sequence(alg, rnn):
        **kwargs
    )

-    simple_test(env_fn, learn, 0.7)
+    simple_test(env_fn, learn, 0.3)


 if __name__ == '__main__':
-    test_fixed_sequence('ppo2', 'lstm')
+    test_fixed_sequence('ppo2', 'tflstm')

    

--- a/baselines/common/tests/util.py
+++ b/baselines/common/tests/util.py
@@ -2,6 +2,7 @@ import tensorflow as tf
 import numpy as np
 from gym.spaces import np_random
 from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
+from baselines.bench.monitor import Monitor

 N_TRIALS = 10000
 N_EPISODES = 100
@@ -10,7 +11,7 @@ def simple_test(env_fn, learn_fn, min_reward_fraction, n_trials=N_TRIALS):
    np.random.seed(0)
    np_random.seed(0)

-    env = DummyVecEnv([env_fn])
+    env = DummyVecEnv([lambda: Monitor(env_fn(), None, allow_early_resets=True)])


    with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default():
--- a/baselines/common/vec_env/init.py
+++ b/baselines/common/vec_env/init.py
@@ -1,34 +1,28 @@
 from abc import ABC, abstractmethod
 from baselines import logger

-
 class AlreadySteppingError(Exception):
    """
    Raised when an asynchronous step is running while
    step_async() is called again.
    """
-
    def __init__(self):
        msg = 'already running an async step'
        Exception.__init__(self, msg)

-
 class NotSteppingError(Exception):
    """
    Raised when an asynchronous step is not running but
    step_wait() is called.
    """
-
    def __init__(self):
        msg = 'not running an async step'
        Exception.__init__(self, msg)

-
 class VecEnv(ABC):
    """
    An abstract asynchronous, vectorized environment.
    """
-
    def __init__(self, num_envs, observation_space, action_space):
        self.num_envs = num_envs
        self.observation_space = observation_space
@@ -38,7 +32,7 @@ class VecEnv(ABC):
    def reset(self):
        """
        Reset all the environments and return an array of
-        observations, or a dict of observation arrays.
+        observations, or a tuple of observation arrays.

        If step_async is still doing work, that work will
        be cancelled and step_wait() should not be called
@@ -64,7 +58,7 @@ class VecEnv(ABC):
        Wait for the step taken with step_async().

        Returns (obs, rews, dones, infos):
-         - obs: an array of observations, or a dict of
+         - obs: an array of observations, or a tuple of
                arrays of observations.
         - rews: an array of rewards
         - dones: an array of "episode done" booleans
@@ -80,16 +74,11 @@ class VecEnv(ABC):
        pass

    def step(self, actions):
-        """
-        Step the environments synchronously.
-
-        This is available for backwards compatibility.
-        """
        self.step_async(actions)
        return self.step_wait()

    def render(self, mode='human'):
-        logger.warn('Render not defined for %s' % self)
+        logger.warn('Render not defined for %s'%self)

    @property
    def unwrapped(self):
@@ -98,19 +87,13 @@ class VecEnv(ABC):
        else:
            return self

-
 class VecEnvWrapper(VecEnv):
-    """
-    An environment wrapper that applies to an entire batch
-    of environments at once.
-    """
-
    def __init__(self, venv, observation_space=None, action_space=None):
        self.venv = venv
-        VecEnv.__init__(self,
-                        num_envs=venv.num_envs,
-                        observation_space=observation_space or venv.observation_space,
-                        action_space=action_space or venv.action_space)
+        VecEnv.__init__(self, 
+            num_envs=venv.num_envs,
+            observation_space=observation_space or venv.observation_space, 
+            action_space=action_space or venv.action_space)

    def step_async(self, actions):
        self.venv.step_async(actions)
@@ -129,19 +112,15 @@ class VecEnvWrapper(VecEnv):
    def render(self):
        self.venv.render()

-
 class CloudpickleWrapper(object):
    """
    Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle)
    """
-
    def __init__(self, x):
        self.x = x
-
    def __getstate__(self):
        import cloudpickle
        return cloudpickle.dumps(self.x)
-
    def __setstate__(self, ob):
        import pickle
        self.x = pickle.loads(ob)
--- a/baselines/common/vec_env/dummy_vec_env.py
+++ b/baselines/common/vec_env/dummy_vec_env.py
@@ -1,40 +1,59 @@
 import numpy as np
+from gym import spaces
+from collections import OrderedDict
 from . import VecEnv
-from .util import copy_obs_dict, dict_to_obs, obs_space_info
-

 class DummyVecEnv(VecEnv):
-    """
-    A VecEnv that wraps raw gym.Envs.
-
-    This can be used when an algorithm requires a VecEnv
-    but you want to use a vanilla gym.Env instance.
-    It is also useful for avoiding IPC overhead when you
-    don't need to run environments in parallel.
-    """
-
    def __init__(self, env_fns):
        self.envs = [fn() for fn in env_fns]
        env = self.envs[0]
        VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space)
+        shapes, dtypes = {}, {}
+        self.keys = []
        obs_space = env.observation_space
-        self.keys, shapes, dtypes = obs_space_info(obs_space)
-        self.buf_obs = {k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys}
+
+        if isinstance(obs_space, spaces.Dict):
+            assert isinstance(obs_space.spaces, OrderedDict)
+            subspaces = obs_space.spaces
+        else:
+            subspaces = {None: obs_space}
+
+        for key, box in subspaces.items():
+            shapes[key] = box.shape
+            dtypes[key] = box.dtype
+            self.keys.append(key)
+        
+        self.buf_obs = { k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys }
        self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool)
-        self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32)
+        self.buf_rews  = np.zeros((self.num_envs,), dtype=np.float32)
        self.buf_infos = [{} for _ in range(self.num_envs)]
        self.actions = None

    def step_async(self, actions):
-        self.actions = actions
+        listify = True
+        try:
+            if len(actions) == self.num_envs:
+                listify = False
+        except TypeError:
+            pass
+
+        if not listify:
+            self.actions = actions
+        else:
+            assert self.num_envs == 1, "actions {} is either not a list or has a wrong size - cannot match to {} environments".format(actions, self.num_envs)
+            self.actions = [actions]

    def step_wait(self):
        for e in range(self.num_envs):
-            obs, self.buf_rews[e], self.buf_dones[e], self.buf_infos[e] = self.envs[e].step(self.actions[e])
+            action = self.actions[e]
+            if isinstance(self.envs[e].action_space, spaces.Discrete):
+                action = int(action)
+
+            obs, self.buf_rews[e], self.buf_dones[e], self.buf_infos[e] = self.envs[e].step(action)
            if self.buf_dones[e]:
                obs = self.envs[e].reset()
            self._save_obs(e, obs)
-        return (self._obs_from_buf(), np.copy(self.buf_rews), np.copy(self.buf_dones),
+        return (np.copy(self._obs_from_buf()), np.copy(self.buf_rews), np.copy(self.buf_dones),
                self.buf_infos.copy())

    def reset(self):
@@ -44,8 +63,7 @@ class DummyVecEnv(VecEnv):
        return self._obs_from_buf()

    def close(self):
-        for e in self.envs:
-            e.close()
+        return

    def render(self, mode='human'):
        return [e.render(mode=mode) for e in self.envs]
@@ -58,4 +76,7 @@ class DummyVecEnv(VecEnv):
                self.buf_obs[k][e] = obs[k]

    def _obs_from_buf(self):
-        return dict_to_obs(copy_obs_dict(self.buf_obs))
+        if self.keys==[None]:
+            return self.buf_obs[None]
+        else:
+            return self.buf_obs
--- a/baselines/common/vec_env/shmem_vec_env.py
+++ b/baselines/common/vec_env/shmem_vec_env.py
@@ -1,146 +0,0 @@
-"""
-An interface for asynchronous vectorized environments.
-"""
-
-from multiprocessing import Pipe, Array, Process
-import numpy as np
-from . import VecEnv, CloudpickleWrapper
-import ctypes
-from baselines import logger
-from baselines.common.tile_images import tile_images
-
-from .util import dict_to_obs, obs_space_info, obs_to_dict
-
-_NP_TO_CT = {np.float32: ctypes.c_float,
-             np.int32: ctypes.c_int32,
-             np.int8: ctypes.c_int8,
-             np.uint8: ctypes.c_char,
-             np.bool: ctypes.c_bool}
-
-
-class ShmemVecEnv(VecEnv):
-    """
-    An AsyncEnv that uses multiprocessing to run multiple
-    environments in parallel.
-    """
-
-    def __init__(self, env_fns, spaces=None):
-        """
-        If you don't specify observation_space, we'll have to create a dummy
-        environment to get it.
-        """
-        if spaces:
-            observation_space, action_space = spaces
-        else:
-            logger.log('Creating dummy env object to get spaces')
-            with logger.scoped_configure(format_strs=[]):
-                dummy = env_fns[0]()
-                observation_space, action_space = dummy.observation_space, dummy.action_space
-                dummy.close()
-                del dummy
-        VecEnv.__init__(self, len(env_fns), observation_space, action_space)
-        self.obs_keys, self.obs_shapes, self.obs_dtypes = obs_space_info(observation_space)
-        self.obs_bufs = [
-            {k: Array(_NP_TO_CT[self.obs_dtypes[k].type], int(np.prod(self.obs_shapes[k]))) for k in self.obs_keys}
-            for _ in env_fns]
-        self.parent_pipes = []
-        self.procs = []
-        for env_fn, obs_buf in zip(env_fns, self.obs_bufs):
-            wrapped_fn = CloudpickleWrapper(env_fn)
-            parent_pipe, child_pipe = Pipe()
-            proc = Process(target=_subproc_worker,
-                           args=(child_pipe, parent_pipe, wrapped_fn, obs_buf, self.obs_shapes, self.obs_dtypes, self.obs_keys))
-            proc.daemon = True
-            self.procs.append(proc)
-            self.parent_pipes.append(parent_pipe)
-            proc.start()
-            child_pipe.close()
-        self.waiting_step = False
-
-    def reset(self):
-        if self.waiting_step:
-            logger.warn('Called reset() while waiting for the step to complete')
-            self.step_wait()
-        for pipe in self.parent_pipes:
-            pipe.send(('reset', None))
-        return self._decode_obses([pipe.recv() for pipe in self.parent_pipes])
-
-    def step_async(self, actions):
-        assert len(actions) == len(self.parent_pipes)
-        for pipe, act in zip(self.parent_pipes, actions):
-            pipe.send(('step', act))
-
-    def step_wait(self):
-        outs = [pipe.recv() for pipe in self.parent_pipes]
-        obs, rews, dones, infos = zip(*outs)
-        return self._decode_obses(obs), np.array(rews), np.array(dones), infos
-
-    def close(self):
-        if self.waiting_step:
-            self.step_wait()
-        for pipe in self.parent_pipes:
-            pipe.send(('close', None))
-        for pipe in self.parent_pipes:
-            pipe.recv()
-            pipe.close()
-        for proc in self.procs:
-            proc.join()
-
-    def render(self, mode='human'):
-        for pipe in self.parent_pipes:
-            pipe.send(('render', None))
-        imgs = [pipe.recv() for pipe in self.parent_pipes]
-        bigimg = tile_images(imgs)
-        if mode == 'human':
-            import cv2
-            cv2.imshow('vecenv', bigimg[:, :, ::-1])
-            cv2.waitKey(1)
-        elif mode == 'rgb_array':
-            return bigimg
-        else:
-            raise NotImplementedError
-
-    def _decode_obses(self, obs):
-        result = {}
-        for k in self.obs_keys:
-            bufs = [b[k] for b in self.obs_bufs]
-            o = [np.frombuffer(b.get_obj(), dtype=self.obs_dtypes[k]).reshape(self.obs_shapes[k]) for b in bufs]
-            result[k] = np.array(o)
-        return dict_to_obs(result)
-
-
-def _subproc_worker(pipe, parent_pipe, env_fn_wrapper, obs_bufs, obs_shapes, obs_dtypes, keys):
-    """
-    Control a single environment instance using IPC and
-    shared memory.
-    """
-    def _write_obs(maybe_dict_obs):
-        flatdict = obs_to_dict(maybe_dict_obs)
-        for k in keys:
-            dst = obs_bufs[k].get_obj()
-            dst_np = np.frombuffer(dst, dtype=obs_dtypes[k]).reshape(obs_shapes[k])  # pylint: disable=W0212
-            np.copyto(dst_np, flatdict[k])
-
-    env = env_fn_wrapper.x()
-    parent_pipe.close()
-    try:
-        while True:
-            cmd, data = pipe.recv()
-            if cmd == 'reset':
-                pipe.send(_write_obs(env.reset()))
-            elif cmd == 'step':
-                obs, reward, done, info = env.step(data)
-                if done:
-                    obs = env.reset()
-                pipe.send((_write_obs(obs), reward, done, info))
-            elif cmd == 'render':
-                pipe.send(env.render(mode='rgb_array'))
-            elif cmd == 'close':
-                pipe.send(None)
-                break
-            else:
-                raise RuntimeError('Got unrecognized cmd %s' % cmd)
-    except KeyboardInterrupt:
-        print('ShmemVecEnv worker: got KeyboardInterrupt')
-    finally:
-        env.close()
--- a/baselines/common/vec_env/subproc_vec_env.py
+++ b/baselines/common/vec_env/subproc_vec_env.py
@@ -1,6 +1,6 @@
 import numpy as np
 from multiprocessing import Process, Pipe
-from . import VecEnv, CloudpickleWrapper
+from baselines.common.vec_env import VecEnv, CloudpickleWrapper
 from baselines.common.tile_images import tile_images


@@ -32,7 +32,6 @@ def worker(remote, parent_remote, env_fn_wrapper):
    finally:
        env.close()

-
 class SubprocVecEnv(VecEnv):
    def __init__(self, env_fns, spaces=None):
        """
@@ -43,9 +42,9 @@ class SubprocVecEnv(VecEnv):
        nenvs = len(env_fns)
        self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)])
        self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn)))
-                   for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)]
+            for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)]
        for p in self.ps:
-            p.daemon = True  # if the main process crashes, we should not cause things to hang
+            p.daemon = True # if the main process crashes, we should not cause things to hang
            p.start()
        for remote in self.work_remotes:
            remote.close()
@@ -79,7 +78,7 @@ class SubprocVecEnv(VecEnv):
        if self.closed:
            return
        if self.waiting:
-            for remote in self.remotes:
+            for remote in self.remotes:            
                remote.recv()
        for remote in self.remotes:
            remote.send(('close', None))
@@ -94,9 +93,9 @@ class SubprocVecEnv(VecEnv):
        bigimg = tile_images(imgs)
        if mode == 'human':
            import cv2
-            cv2.imshow('vecenv', bigimg[:, :, ::-1])
+            cv2.imshow('vecenv', bigimg[:,:,::-1])
            cv2.waitKey(1)
        elif mode == 'rgb_array':
            return bigimg
        else:
-            raise NotImplementedError
+            raise NotImplementedError
--- a/baselines/common/vec_env/test_vec_env.py
+++ b/baselines/common/vec_env/test_vec_env.py
@@ -1,85 +0,0 @@
-"""
-Tests for asynchronous vectorized environments.
-"""
-
-import gym
-import numpy as np
-import pytest
-from .dummy_vec_env import DummyVecEnv
-from .shmem_vec_env import ShmemVecEnv
-from .subproc_vec_env import SubprocVecEnv
-
-
-@pytest.mark.parametrize('klass', (ShmemVecEnv, SubprocVecEnv))
-@pytest.mark.parametrize('dtype', ('uint8', 'float32'))
-def test_vec_env(klass, dtype):  # pylint: disable=R0914
-    """
-    Test that a vectorized environment is equivalent to
-    DummyVecEnv, since DummyVecEnv is less likely to be
-    error prone.
-    """
-    num_envs = 3
-    num_steps = 100
-    shape = (3, 8)
-
-    def make_fn(seed):
-        """
-        Get an environment constructor with a seed.
-        """
-        return lambda: _SimpleEnv(seed, shape, dtype)
-    fns = [make_fn(i) for i in range(num_envs)]
-    env1 = DummyVecEnv(fns)
-    env2 = klass(fns)
-    try:
-        obs1, obs2 = env1.reset(), env2.reset()
-        assert np.array(obs1).shape == np.array(obs2).shape
-        assert np.allclose(obs1, obs2)
-        np.random.seed(1337)
-        for _ in range(num_steps):
-            joint_shape = (len(fns),) + shape
-            actions = np.array(np.random.randint(0, 0x100, size=joint_shape),
-                               dtype=dtype)
-            for env in [env1, env2]:
-                env.step_async(actions)
-            outs1 = env1.step_wait()
-            outs2 = env2.step_wait()
-            for out1, out2 in zip(outs1[:3], outs2[:3]):
-                assert np.array(out1).shape == np.array(out2).shape
-                assert np.allclose(out1, out2)
-            assert list(outs1[3]) == list(outs2[3])
-    finally:
-        env1.close()
-        env2.close()
-
-
-class _SimpleEnv(gym.Env):
-    """
-    An environment with a pre-determined observation space
-    and RNG seed.
-    """
-
-    def __init__(self, seed, shape, dtype):
-        np.random.seed(seed)
-        self._dtype = dtype
-        self._start_obs = np.array(np.random.randint(0, 0x100, size=shape),
-                                   dtype=dtype)
-        self._max_steps = seed + 1
-        self._cur_obs = None
-        self._cur_step = 0
-        self.action_space = gym.spaces.Box(low=0, high=100, shape=shape, dtype=dtype)
-        self.observation_space = self.action_space
-
-    def step(self, action):
-        self._cur_obs += np.array(action, dtype=self._dtype)
-        self._cur_step += 1
-        done = self._cur_step >= self._max_steps
-        reward = self._cur_step / self._max_steps
-        return self._cur_obs, reward, done, {'foo': 'bar' + str(reward)}
-
-    def reset(self):
-        self._cur_obs = self._start_obs
-        self._cur_step = 0
-        return self._cur_obs
-
-    def render(self, mode=None):
-        raise NotImplementedError
--- a/baselines/common/vec_env/util.py
+++ b/baselines/common/vec_env/util.py
@@ -1,59 +0,0 @@
-"""
-Helpers for dealing with vectorized environments.
-"""
-
-from collections import OrderedDict
-
-import gym
-import numpy as np
-
-
-def copy_obs_dict(obs):
-    """
-    Deep-copy an observation dict.
-    """
-    return {k: np.copy(v) for k, v in obs.items()}
-
-
-def dict_to_obs(obs_dict):
-    """
-    Convert an observation dict into a raw array if the
-    original observation space was not a Dict space.
-    """
-    if set(obs_dict.keys()) == {None}:
-        return obs_dict[None]
-    return obs_dict
-
-
-def obs_space_info(obs_space):
-    """
-    Get dict-structured information about a gym.Space.
-
-    Returns:
-      A tuple (keys, shapes, dtypes):
-        keys: a list of dict keys.
-        shapes: a dict mapping keys to shapes.
-        dtypes: a dict mapping keys to dtypes.
-    """
-    if isinstance(obs_space, gym.spaces.Dict):
-        assert isinstance(obs_space.spaces, OrderedDict)
-        subspaces = obs_space.spaces
-    else:
-        subspaces = {None: obs_space}
-    keys = []
-    shapes = {}
-    dtypes = {}
-    for key, box in subspaces.items():
-        keys.append(key)
-        shapes[key] = box.shape
-        dtypes[key] = box.dtype
-    return keys, shapes, dtypes
-
-
-def obs_to_dict(obs):
-    """
-    Convert an observation into a dict.
-    """
-    if isinstance(obs, dict):
-        return obs
-    return {None: obs}
--- a/baselines/common/vec_env/vec_frame_stack.py
+++ b/baselines/common/vec_env/vec_frame_stack.py
@@ -1,16 +1,18 @@
-from . import VecEnvWrapper
+from baselines.common.vec_env import VecEnvWrapper
 import numpy as np
 from gym import spaces

-
 class VecFrameStack(VecEnvWrapper):
+    """
+    Vectorized environment base class
+    """
    def __init__(self, venv, nstack):
        self.venv = venv
        self.nstack = nstack
-        wos = venv.observation_space  # wrapped ob space
+        wos = venv.observation_space # wrapped ob space
        low = np.repeat(wos.low, self.nstack, axis=-1)
        high = np.repeat(wos.high, self.nstack, axis=-1)
-        self.stackedobs = np.zeros((venv.num_envs,) + low.shape, low.dtype)
+        self.stackedobs = np.zeros((venv.num_envs,)+low.shape, low.dtype)
        observation_space = spaces.Box(low=low, high=high, dtype=venv.observation_space.dtype)
        VecEnvWrapper.__init__(self, venv, observation_space=observation_space)

@@ -24,6 +26,9 @@ class VecFrameStack(VecEnvWrapper):
        return self.stackedobs, rews, news, infos

    def reset(self):
+        """
+        Reset all environments
+        """
        obs = self.venv.reset()
        self.stackedobs[...] = 0
        self.stackedobs[..., -obs.shape[-1]:] = obs
--- a/baselines/common/vec_env/vec_monitor.py
+++ b/baselines/common/vec_env/vec_monitor.py
@@ -1,29 +0,0 @@
-from . import VecEnvWrapper
-import numpy as np
-
-
-class VecMonitor(VecEnvWrapper):
-    def __init__(self, venv):
-        VecEnvWrapper.__init__(self, venv)
-        self.eprets = None
-        self.eplens = None
-
-    def reset(self):
-        obs = self.venv.reset()
-        self.eprets = np.zeros(self.num_envs, 'f')
-        self.eplens = np.zeros(self.num_envs, 'i')
-        return obs
-
-    def step_wait(self):
-        obs, rews, dones, infos = self.venv.step_wait()
-        self.eprets += rews
-        self.eplens += 1
-        newinfos = []
-        for (i, (done, ret, eplen, info)) in enumerate(zip(dones, self.eprets, self.eplens, infos)):
-            info = info.copy()
-            if done:
-                info['episode'] = {'r': ret, 'l': eplen}
-                self.eprets[i] = 0
-                self.eplens[i] = 0
-            newinfos.append(info)
-        return obs, rews, dones, newinfos
--- a/baselines/common/vec_env/vec_normalize.py
+++ b/baselines/common/vec_env/vec_normalize.py
@@ -1,18 +1,17 @@
-from . import VecEnvWrapper
+from baselines.common.vec_env import VecEnvWrapper
 from baselines.common.running_mean_std import RunningMeanStd
 import numpy as np

-
 class VecNormalize(VecEnvWrapper):
    """
-    A vectorized wrapper that normalizes the observations
-    and returns from an environment.
+    Vectorized environment base class
    """
-
    def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
+        #self.ob_rms = TfRunningMeanStd(shape=self.observation_space.shape, scope='observation_running_mean_std') if ob else None
+        #self.ret_rms = TfRunningMeanStd(shape=(), scope='return_running_mean_std') if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
@@ -20,6 +19,12 @@ class VecNormalize(VecEnvWrapper):
        self.epsilon = epsilon

    def step_wait(self):
+        """
+        Apply sequence of actions to sequence of environments
+        actions -> (observations, rewards, news)
+
+        where 'news' is a boolean vector indicating whether each element is new.
+        """
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
@@ -37,5 +42,8 @@ class VecNormalize(VecEnvWrapper):
            return obs

    def reset(self):
+        """
+        Reset all environments
+        """
        obs = self.venv.reset()
        return self._obfilt(obs)
--- a/baselines/deepq/README.md
+++ b/baselines/deepq/README.md
@@ -32,7 +32,7 @@ In particular notice that once `deepq.learn` finishes training it returns `act`


 - [baselines/deepq/experiments/custom_cartpole.py](experiments/custom_cartpole.py) - Cartpole training with more fine grained control over the internals of DQN algorithm.
- [baselines/deepq/experiments/run_atari.py](experiments/run_atari.py) - more robust setup for training at scale.
+- [baselines/deepq/experiments/atari/train.py](experiments/atari/train.py) - more robust setup for training at scale.


 ##### Download a pretrained Atari agent
--- a/benchmarks_atari10M.htm
+++ b/benchmarks_atari10M.htm
--- a/benchmarks_mujoco1M.htm
+++ b/benchmarks_mujoco1M.htm
Author	SHA1	Message	Date
Peter Zhokhov	dbcc4e0252	lstm network builders using tf lstm	2018-08-10 14:23:45 -07:00
Peter Zhokhov	217b111c88	merged refactor	2018-08-10 14:14:46 -07:00