Compare commits
49 Commits
peterz_ale
...
peterz_ben
Author | SHA1 | Date | |
---|---|---|---|
|
ea68f3b7e6 | ||
|
ca721a4be6 | ||
|
72f3572a10 | ||
|
b9cd941471 | ||
|
0899b71ede | ||
|
cc8c9541fb | ||
|
cb32522394 | ||
|
1e40ec22be | ||
|
701a36cdfa | ||
|
5a7f9847d8 | ||
|
b63134e5c5 | ||
|
db314cdeda | ||
|
b08c083d91 | ||
|
bfbbe66d9e | ||
|
1c5c6563b7 | ||
|
1fa8c58da5 | ||
|
f6d1115ead | ||
|
f6d5a47bed | ||
|
c2df27bee4 | ||
|
974c15756e | ||
|
ad43fd9a35 | ||
|
72c357c638 | ||
|
e00e5ca016 | ||
|
705797f2f0 | ||
|
fcd84aa831 | ||
|
390b51597a | ||
|
95104a3592 | ||
|
3528f7b992 | ||
|
151e48009e | ||
|
92f33335e9 | ||
|
af729cff15 | ||
|
10f815fe1d | ||
|
8c4adac898 | ||
|
2a93ea8782 | ||
|
9c48f9fad5 | ||
|
348cbb4b71 | ||
|
a1602ab15f | ||
|
e63e69bb14 | ||
|
385e7e5c0d | ||
|
d112a2e49f | ||
|
e662dd6409 | ||
|
efc6bffce3 | ||
|
872181d4c3 | ||
|
628ddecf6a | ||
|
83a4a4be65 | ||
|
7edac38c73 | ||
|
a6dca44115 | ||
|
622915c473 | ||
|
a1d3c18ec0 |
17
README.md
17
README.md
@@ -112,6 +112,10 @@ This should get to the mean reward per episode about 5k. To load and visualize t
|
||||
*NOTE:* At the moment Mujoco training uses VecNormalize wrapper for the environment which is not being saved correctly; so loading the models trained on Mujoco will not work well if the environment is recreated. If necessary, you can work around that by replacing RunningMeanStd by TfRunningMeanStd in [baselines/common/vec_env/vec_normalize.py](baselines/common/vec_env/vec_normalize.py#L12). This way, mean and std of environment normalizing wrapper will be saved in tensorflow variables and included in the model file; however, training is slower that way - hence not including it by default
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## Subpackages
|
||||
|
||||
- [A2C](baselines/a2c)
|
||||
@@ -121,19 +125,10 @@ This should get to the mean reward per episode about 5k. To load and visualize t
|
||||
- [DQN](baselines/deepq)
|
||||
- [GAIL](baselines/gail)
|
||||
- [HER](baselines/her)
|
||||
- [PPO1](baselines/ppo1) (obsolete version, left here temporarily)
|
||||
- [PPO2](baselines/ppo2)
|
||||
- [PPO1](baselines/ppo1) (Multi-CPU using MPI)
|
||||
- [PPO2](baselines/ppo2) (Optimized for GPU)
|
||||
- [TRPO](baselines/trpo_mpi)
|
||||
|
||||
|
||||
|
||||
## Benchmarks
|
||||
Results of benchmarks on Mujoco (1M timesteps) and Atari (10M timesteps) are available
|
||||
[here for Mujoco](https://htmlpreview.github.com/?https://github.com/openai/baselines/blob/master/benchmarks_mujoco1M.htm)
|
||||
and
|
||||
[here for Atari](https://htmlpreview.github.com/?https://github.com/openai/baselines/blob/master/benchmarks_atari10M.htm)
|
||||
respectively. Note that these results may be not on the latest version of the code, particular commit hash with which results were obtained is specified on the benchmarks page.
|
||||
|
||||
To cite this repository in publications:
|
||||
|
||||
@misc{baselines,
|
||||
|
@@ -1,34 +1,28 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from baselines import logger
|
||||
|
||||
|
||||
class AlreadySteppingError(Exception):
|
||||
"""
|
||||
Raised when an asynchronous step is running while
|
||||
step_async() is called again.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
msg = 'already running an async step'
|
||||
Exception.__init__(self, msg)
|
||||
|
||||
|
||||
class NotSteppingError(Exception):
|
||||
"""
|
||||
Raised when an asynchronous step is not running but
|
||||
step_wait() is called.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
msg = 'not running an async step'
|
||||
Exception.__init__(self, msg)
|
||||
|
||||
|
||||
class VecEnv(ABC):
|
||||
"""
|
||||
An abstract asynchronous, vectorized environment.
|
||||
"""
|
||||
|
||||
def __init__(self, num_envs, observation_space, action_space):
|
||||
self.num_envs = num_envs
|
||||
self.observation_space = observation_space
|
||||
@@ -38,7 +32,7 @@ class VecEnv(ABC):
|
||||
def reset(self):
|
||||
"""
|
||||
Reset all the environments and return an array of
|
||||
observations, or a dict of observation arrays.
|
||||
observations, or a tuple of observation arrays.
|
||||
|
||||
If step_async is still doing work, that work will
|
||||
be cancelled and step_wait() should not be called
|
||||
@@ -64,7 +58,7 @@ class VecEnv(ABC):
|
||||
Wait for the step taken with step_async().
|
||||
|
||||
Returns (obs, rews, dones, infos):
|
||||
- obs: an array of observations, or a dict of
|
||||
- obs: an array of observations, or a tuple of
|
||||
arrays of observations.
|
||||
- rews: an array of rewards
|
||||
- dones: an array of "episode done" booleans
|
||||
@@ -80,16 +74,11 @@ class VecEnv(ABC):
|
||||
pass
|
||||
|
||||
def step(self, actions):
|
||||
"""
|
||||
Step the environments synchronously.
|
||||
|
||||
This is available for backwards compatibility.
|
||||
"""
|
||||
self.step_async(actions)
|
||||
return self.step_wait()
|
||||
|
||||
def render(self, mode='human'):
|
||||
logger.warn('Render not defined for %s' % self)
|
||||
logger.warn('Render not defined for %s'%self)
|
||||
|
||||
@property
|
||||
def unwrapped(self):
|
||||
@@ -98,19 +87,13 @@ class VecEnv(ABC):
|
||||
else:
|
||||
return self
|
||||
|
||||
|
||||
class VecEnvWrapper(VecEnv):
|
||||
"""
|
||||
An environment wrapper that applies to an entire batch
|
||||
of environments at once.
|
||||
"""
|
||||
|
||||
def __init__(self, venv, observation_space=None, action_space=None):
|
||||
self.venv = venv
|
||||
VecEnv.__init__(self,
|
||||
num_envs=venv.num_envs,
|
||||
observation_space=observation_space or venv.observation_space,
|
||||
action_space=action_space or venv.action_space)
|
||||
VecEnv.__init__(self,
|
||||
num_envs=venv.num_envs,
|
||||
observation_space=observation_space or venv.observation_space,
|
||||
action_space=action_space or venv.action_space)
|
||||
|
||||
def step_async(self, actions):
|
||||
self.venv.step_async(actions)
|
||||
@@ -129,19 +112,15 @@ class VecEnvWrapper(VecEnv):
|
||||
def render(self):
|
||||
self.venv.render()
|
||||
|
||||
|
||||
class CloudpickleWrapper(object):
|
||||
"""
|
||||
Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle)
|
||||
"""
|
||||
|
||||
def __init__(self, x):
|
||||
self.x = x
|
||||
|
||||
def __getstate__(self):
|
||||
import cloudpickle
|
||||
return cloudpickle.dumps(self.x)
|
||||
|
||||
def __setstate__(self, ob):
|
||||
import pickle
|
||||
self.x = pickle.loads(ob)
|
||||
|
@@ -1,40 +1,59 @@
|
||||
import numpy as np
|
||||
from gym import spaces
|
||||
from collections import OrderedDict
|
||||
from . import VecEnv
|
||||
from .util import copy_obs_dict, dict_to_obs, obs_space_info
|
||||
|
||||
|
||||
class DummyVecEnv(VecEnv):
|
||||
"""
|
||||
A VecEnv that wraps raw gym.Envs.
|
||||
|
||||
This can be used when an algorithm requires a VecEnv
|
||||
but you want to use a vanilla gym.Env instance.
|
||||
It is also useful for avoiding IPC overhead when you
|
||||
don't need to run environments in parallel.
|
||||
"""
|
||||
|
||||
def __init__(self, env_fns):
|
||||
self.envs = [fn() for fn in env_fns]
|
||||
env = self.envs[0]
|
||||
VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space)
|
||||
shapes, dtypes = {}, {}
|
||||
self.keys = []
|
||||
obs_space = env.observation_space
|
||||
self.keys, shapes, dtypes = obs_space_info(obs_space)
|
||||
self.buf_obs = {k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys}
|
||||
|
||||
if isinstance(obs_space, spaces.Dict):
|
||||
assert isinstance(obs_space.spaces, OrderedDict)
|
||||
subspaces = obs_space.spaces
|
||||
else:
|
||||
subspaces = {None: obs_space}
|
||||
|
||||
for key, box in subspaces.items():
|
||||
shapes[key] = box.shape
|
||||
dtypes[key] = box.dtype
|
||||
self.keys.append(key)
|
||||
|
||||
self.buf_obs = { k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys }
|
||||
self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool)
|
||||
self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32)
|
||||
self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32)
|
||||
self.buf_infos = [{} for _ in range(self.num_envs)]
|
||||
self.actions = None
|
||||
|
||||
def step_async(self, actions):
|
||||
self.actions = actions
|
||||
listify = True
|
||||
try:
|
||||
if len(actions) == self.num_envs:
|
||||
listify = False
|
||||
except TypeError:
|
||||
pass
|
||||
|
||||
if not listify:
|
||||
self.actions = actions
|
||||
else:
|
||||
assert self.num_envs == 1, "actions {} is either not a list or has a wrong size - cannot match to {} environments".format(actions, self.num_envs)
|
||||
self.actions = [actions]
|
||||
|
||||
def step_wait(self):
|
||||
for e in range(self.num_envs):
|
||||
obs, self.buf_rews[e], self.buf_dones[e], self.buf_infos[e] = self.envs[e].step(self.actions[e])
|
||||
action = self.actions[e]
|
||||
if isinstance(self.envs[e].action_space, spaces.Discrete):
|
||||
action = int(action)
|
||||
|
||||
obs, self.buf_rews[e], self.buf_dones[e], self.buf_infos[e] = self.envs[e].step(action)
|
||||
if self.buf_dones[e]:
|
||||
obs = self.envs[e].reset()
|
||||
self._save_obs(e, obs)
|
||||
return (self._obs_from_buf(), np.copy(self.buf_rews), np.copy(self.buf_dones),
|
||||
return (np.copy(self._obs_from_buf()), np.copy(self.buf_rews), np.copy(self.buf_dones),
|
||||
self.buf_infos.copy())
|
||||
|
||||
def reset(self):
|
||||
@@ -44,8 +63,7 @@ class DummyVecEnv(VecEnv):
|
||||
return self._obs_from_buf()
|
||||
|
||||
def close(self):
|
||||
for e in self.envs:
|
||||
e.close()
|
||||
return
|
||||
|
||||
def render(self, mode='human'):
|
||||
return [e.render(mode=mode) for e in self.envs]
|
||||
@@ -58,4 +76,7 @@ class DummyVecEnv(VecEnv):
|
||||
self.buf_obs[k][e] = obs[k]
|
||||
|
||||
def _obs_from_buf(self):
|
||||
return dict_to_obs(copy_obs_dict(self.buf_obs))
|
||||
if self.keys==[None]:
|
||||
return self.buf_obs[None]
|
||||
else:
|
||||
return self.buf_obs
|
||||
|
@@ -1,146 +0,0 @@
|
||||
"""
|
||||
An interface for asynchronous vectorized environments.
|
||||
"""
|
||||
|
||||
from multiprocessing import Pipe, Array, Process
|
||||
import numpy as np
|
||||
from . import VecEnv, CloudpickleWrapper
|
||||
import ctypes
|
||||
from baselines import logger
|
||||
from baselines.common.tile_images import tile_images
|
||||
|
||||
from .util import dict_to_obs, obs_space_info, obs_to_dict
|
||||
|
||||
_NP_TO_CT = {np.float32: ctypes.c_float,
|
||||
np.int32: ctypes.c_int32,
|
||||
np.int8: ctypes.c_int8,
|
||||
np.uint8: ctypes.c_char,
|
||||
np.bool: ctypes.c_bool}
|
||||
|
||||
|
||||
class ShmemVecEnv(VecEnv):
|
||||
"""
|
||||
An AsyncEnv that uses multiprocessing to run multiple
|
||||
environments in parallel.
|
||||
"""
|
||||
|
||||
def __init__(self, env_fns, spaces=None):
|
||||
"""
|
||||
If you don't specify observation_space, we'll have to create a dummy
|
||||
environment to get it.
|
||||
"""
|
||||
if spaces:
|
||||
observation_space, action_space = spaces
|
||||
else:
|
||||
logger.log('Creating dummy env object to get spaces')
|
||||
with logger.scoped_configure(format_strs=[]):
|
||||
dummy = env_fns[0]()
|
||||
observation_space, action_space = dummy.observation_space, dummy.action_space
|
||||
dummy.close()
|
||||
del dummy
|
||||
VecEnv.__init__(self, len(env_fns), observation_space, action_space)
|
||||
self.obs_keys, self.obs_shapes, self.obs_dtypes = obs_space_info(observation_space)
|
||||
self.obs_bufs = [
|
||||
{k: Array(_NP_TO_CT[self.obs_dtypes[k].type], int(np.prod(self.obs_shapes[k]))) for k in self.obs_keys}
|
||||
for _ in env_fns]
|
||||
self.parent_pipes = []
|
||||
self.procs = []
|
||||
for env_fn, obs_buf in zip(env_fns, self.obs_bufs):
|
||||
wrapped_fn = CloudpickleWrapper(env_fn)
|
||||
parent_pipe, child_pipe = Pipe()
|
||||
proc = Process(target=_subproc_worker,
|
||||
args=(child_pipe, parent_pipe, wrapped_fn, obs_buf, self.obs_shapes, self.obs_dtypes, self.obs_keys))
|
||||
proc.daemon = True
|
||||
self.procs.append(proc)
|
||||
self.parent_pipes.append(parent_pipe)
|
||||
proc.start()
|
||||
child_pipe.close()
|
||||
self.waiting_step = False
|
||||
|
||||
def reset(self):
|
||||
if self.waiting_step:
|
||||
logger.warn('Called reset() while waiting for the step to complete')
|
||||
self.step_wait()
|
||||
for pipe in self.parent_pipes:
|
||||
pipe.send(('reset', None))
|
||||
return self._decode_obses([pipe.recv() for pipe in self.parent_pipes])
|
||||
|
||||
def step_async(self, actions):
|
||||
assert len(actions) == len(self.parent_pipes)
|
||||
for pipe, act in zip(self.parent_pipes, actions):
|
||||
pipe.send(('step', act))
|
||||
|
||||
def step_wait(self):
|
||||
outs = [pipe.recv() for pipe in self.parent_pipes]
|
||||
obs, rews, dones, infos = zip(*outs)
|
||||
return self._decode_obses(obs), np.array(rews), np.array(dones), infos
|
||||
|
||||
def close(self):
|
||||
if self.waiting_step:
|
||||
self.step_wait()
|
||||
for pipe in self.parent_pipes:
|
||||
pipe.send(('close', None))
|
||||
for pipe in self.parent_pipes:
|
||||
pipe.recv()
|
||||
pipe.close()
|
||||
for proc in self.procs:
|
||||
proc.join()
|
||||
|
||||
def render(self, mode='human'):
|
||||
for pipe in self.parent_pipes:
|
||||
pipe.send(('render', None))
|
||||
imgs = [pipe.recv() for pipe in self.parent_pipes]
|
||||
bigimg = tile_images(imgs)
|
||||
if mode == 'human':
|
||||
import cv2
|
||||
cv2.imshow('vecenv', bigimg[:, :, ::-1])
|
||||
cv2.waitKey(1)
|
||||
elif mode == 'rgb_array':
|
||||
return bigimg
|
||||
else:
|
||||
raise NotImplementedError
|
||||
|
||||
def _decode_obses(self, obs):
|
||||
result = {}
|
||||
for k in self.obs_keys:
|
||||
bufs = [b[k] for b in self.obs_bufs]
|
||||
o = [np.frombuffer(b.get_obj(), dtype=self.obs_dtypes[k]).reshape(self.obs_shapes[k]) for b in bufs]
|
||||
result[k] = np.array(o)
|
||||
return dict_to_obs(result)
|
||||
|
||||
|
||||
def _subproc_worker(pipe, parent_pipe, env_fn_wrapper, obs_bufs, obs_shapes, obs_dtypes, keys):
|
||||
"""
|
||||
Control a single environment instance using IPC and
|
||||
shared memory.
|
||||
"""
|
||||
def _write_obs(maybe_dict_obs):
|
||||
flatdict = obs_to_dict(maybe_dict_obs)
|
||||
for k in keys:
|
||||
dst = obs_bufs[k].get_obj()
|
||||
dst_np = np.frombuffer(dst, dtype=obs_dtypes[k]).reshape(obs_shapes[k]) # pylint: disable=W0212
|
||||
np.copyto(dst_np, flatdict[k])
|
||||
|
||||
env = env_fn_wrapper.x()
|
||||
parent_pipe.close()
|
||||
try:
|
||||
while True:
|
||||
cmd, data = pipe.recv()
|
||||
if cmd == 'reset':
|
||||
pipe.send(_write_obs(env.reset()))
|
||||
elif cmd == 'step':
|
||||
obs, reward, done, info = env.step(data)
|
||||
if done:
|
||||
obs = env.reset()
|
||||
pipe.send((_write_obs(obs), reward, done, info))
|
||||
elif cmd == 'render':
|
||||
pipe.send(env.render(mode='rgb_array'))
|
||||
elif cmd == 'close':
|
||||
pipe.send(None)
|
||||
break
|
||||
else:
|
||||
raise RuntimeError('Got unrecognized cmd %s' % cmd)
|
||||
except KeyboardInterrupt:
|
||||
print('ShmemVecEnv worker: got KeyboardInterrupt')
|
||||
finally:
|
||||
env.close()
|
@@ -1,6 +1,6 @@
|
||||
import numpy as np
|
||||
from multiprocessing import Process, Pipe
|
||||
from . import VecEnv, CloudpickleWrapper
|
||||
from baselines.common.vec_env import VecEnv, CloudpickleWrapper
|
||||
from baselines.common.tile_images import tile_images
|
||||
|
||||
|
||||
@@ -32,7 +32,6 @@ def worker(remote, parent_remote, env_fn_wrapper):
|
||||
finally:
|
||||
env.close()
|
||||
|
||||
|
||||
class SubprocVecEnv(VecEnv):
|
||||
def __init__(self, env_fns, spaces=None):
|
||||
"""
|
||||
@@ -43,9 +42,9 @@ class SubprocVecEnv(VecEnv):
|
||||
nenvs = len(env_fns)
|
||||
self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)])
|
||||
self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn)))
|
||||
for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)]
|
||||
for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)]
|
||||
for p in self.ps:
|
||||
p.daemon = True # if the main process crashes, we should not cause things to hang
|
||||
p.daemon = True # if the main process crashes, we should not cause things to hang
|
||||
p.start()
|
||||
for remote in self.work_remotes:
|
||||
remote.close()
|
||||
@@ -79,7 +78,7 @@ class SubprocVecEnv(VecEnv):
|
||||
if self.closed:
|
||||
return
|
||||
if self.waiting:
|
||||
for remote in self.remotes:
|
||||
for remote in self.remotes:
|
||||
remote.recv()
|
||||
for remote in self.remotes:
|
||||
remote.send(('close', None))
|
||||
@@ -94,9 +93,9 @@ class SubprocVecEnv(VecEnv):
|
||||
bigimg = tile_images(imgs)
|
||||
if mode == 'human':
|
||||
import cv2
|
||||
cv2.imshow('vecenv', bigimg[:, :, ::-1])
|
||||
cv2.imshow('vecenv', bigimg[:,:,::-1])
|
||||
cv2.waitKey(1)
|
||||
elif mode == 'rgb_array':
|
||||
return bigimg
|
||||
else:
|
||||
raise NotImplementedError
|
||||
raise NotImplementedError
|
@@ -1,85 +0,0 @@
|
||||
"""
|
||||
Tests for asynchronous vectorized environments.
|
||||
"""
|
||||
|
||||
import gym
|
||||
import numpy as np
|
||||
import pytest
|
||||
from .dummy_vec_env import DummyVecEnv
|
||||
from .shmem_vec_env import ShmemVecEnv
|
||||
from .subproc_vec_env import SubprocVecEnv
|
||||
|
||||
|
||||
@pytest.mark.parametrize('klass', (ShmemVecEnv, SubprocVecEnv))
|
||||
@pytest.mark.parametrize('dtype', ('uint8', 'float32'))
|
||||
def test_vec_env(klass, dtype): # pylint: disable=R0914
|
||||
"""
|
||||
Test that a vectorized environment is equivalent to
|
||||
DummyVecEnv, since DummyVecEnv is less likely to be
|
||||
error prone.
|
||||
"""
|
||||
num_envs = 3
|
||||
num_steps = 100
|
||||
shape = (3, 8)
|
||||
|
||||
def make_fn(seed):
|
||||
"""
|
||||
Get an environment constructor with a seed.
|
||||
"""
|
||||
return lambda: _SimpleEnv(seed, shape, dtype)
|
||||
fns = [make_fn(i) for i in range(num_envs)]
|
||||
env1 = DummyVecEnv(fns)
|
||||
env2 = klass(fns)
|
||||
try:
|
||||
obs1, obs2 = env1.reset(), env2.reset()
|
||||
assert np.array(obs1).shape == np.array(obs2).shape
|
||||
assert np.allclose(obs1, obs2)
|
||||
np.random.seed(1337)
|
||||
for _ in range(num_steps):
|
||||
joint_shape = (len(fns),) + shape
|
||||
actions = np.array(np.random.randint(0, 0x100, size=joint_shape),
|
||||
dtype=dtype)
|
||||
for env in [env1, env2]:
|
||||
env.step_async(actions)
|
||||
outs1 = env1.step_wait()
|
||||
outs2 = env2.step_wait()
|
||||
for out1, out2 in zip(outs1[:3], outs2[:3]):
|
||||
assert np.array(out1).shape == np.array(out2).shape
|
||||
assert np.allclose(out1, out2)
|
||||
assert list(outs1[3]) == list(outs2[3])
|
||||
finally:
|
||||
env1.close()
|
||||
env2.close()
|
||||
|
||||
|
||||
class _SimpleEnv(gym.Env):
|
||||
"""
|
||||
An environment with a pre-determined observation space
|
||||
and RNG seed.
|
||||
"""
|
||||
|
||||
def __init__(self, seed, shape, dtype):
|
||||
np.random.seed(seed)
|
||||
self._dtype = dtype
|
||||
self._start_obs = np.array(np.random.randint(0, 0x100, size=shape),
|
||||
dtype=dtype)
|
||||
self._max_steps = seed + 1
|
||||
self._cur_obs = None
|
||||
self._cur_step = 0
|
||||
self.action_space = gym.spaces.Box(low=0, high=100, shape=shape, dtype=dtype)
|
||||
self.observation_space = self.action_space
|
||||
|
||||
def step(self, action):
|
||||
self._cur_obs += np.array(action, dtype=self._dtype)
|
||||
self._cur_step += 1
|
||||
done = self._cur_step >= self._max_steps
|
||||
reward = self._cur_step / self._max_steps
|
||||
return self._cur_obs, reward, done, {'foo': 'bar' + str(reward)}
|
||||
|
||||
def reset(self):
|
||||
self._cur_obs = self._start_obs
|
||||
self._cur_step = 0
|
||||
return self._cur_obs
|
||||
|
||||
def render(self, mode=None):
|
||||
raise NotImplementedError
|
@@ -1,59 +0,0 @@
|
||||
"""
|
||||
Helpers for dealing with vectorized environments.
|
||||
"""
|
||||
|
||||
from collections import OrderedDict
|
||||
|
||||
import gym
|
||||
import numpy as np
|
||||
|
||||
|
||||
def copy_obs_dict(obs):
|
||||
"""
|
||||
Deep-copy an observation dict.
|
||||
"""
|
||||
return {k: np.copy(v) for k, v in obs.items()}
|
||||
|
||||
|
||||
def dict_to_obs(obs_dict):
|
||||
"""
|
||||
Convert an observation dict into a raw array if the
|
||||
original observation space was not a Dict space.
|
||||
"""
|
||||
if set(obs_dict.keys()) == {None}:
|
||||
return obs_dict[None]
|
||||
return obs_dict
|
||||
|
||||
|
||||
def obs_space_info(obs_space):
|
||||
"""
|
||||
Get dict-structured information about a gym.Space.
|
||||
|
||||
Returns:
|
||||
A tuple (keys, shapes, dtypes):
|
||||
keys: a list of dict keys.
|
||||
shapes: a dict mapping keys to shapes.
|
||||
dtypes: a dict mapping keys to dtypes.
|
||||
"""
|
||||
if isinstance(obs_space, gym.spaces.Dict):
|
||||
assert isinstance(obs_space.spaces, OrderedDict)
|
||||
subspaces = obs_space.spaces
|
||||
else:
|
||||
subspaces = {None: obs_space}
|
||||
keys = []
|
||||
shapes = {}
|
||||
dtypes = {}
|
||||
for key, box in subspaces.items():
|
||||
keys.append(key)
|
||||
shapes[key] = box.shape
|
||||
dtypes[key] = box.dtype
|
||||
return keys, shapes, dtypes
|
||||
|
||||
|
||||
def obs_to_dict(obs):
|
||||
"""
|
||||
Convert an observation into a dict.
|
||||
"""
|
||||
if isinstance(obs, dict):
|
||||
return obs
|
||||
return {None: obs}
|
@@ -1,16 +1,18 @@
|
||||
from . import VecEnvWrapper
|
||||
from baselines.common.vec_env import VecEnvWrapper
|
||||
import numpy as np
|
||||
from gym import spaces
|
||||
|
||||
|
||||
class VecFrameStack(VecEnvWrapper):
|
||||
"""
|
||||
Vectorized environment base class
|
||||
"""
|
||||
def __init__(self, venv, nstack):
|
||||
self.venv = venv
|
||||
self.nstack = nstack
|
||||
wos = venv.observation_space # wrapped ob space
|
||||
wos = venv.observation_space # wrapped ob space
|
||||
low = np.repeat(wos.low, self.nstack, axis=-1)
|
||||
high = np.repeat(wos.high, self.nstack, axis=-1)
|
||||
self.stackedobs = np.zeros((venv.num_envs,) + low.shape, low.dtype)
|
||||
self.stackedobs = np.zeros((venv.num_envs,)+low.shape, low.dtype)
|
||||
observation_space = spaces.Box(low=low, high=high, dtype=venv.observation_space.dtype)
|
||||
VecEnvWrapper.__init__(self, venv, observation_space=observation_space)
|
||||
|
||||
@@ -24,6 +26,9 @@ class VecFrameStack(VecEnvWrapper):
|
||||
return self.stackedobs, rews, news, infos
|
||||
|
||||
def reset(self):
|
||||
"""
|
||||
Reset all environments
|
||||
"""
|
||||
obs = self.venv.reset()
|
||||
self.stackedobs[...] = 0
|
||||
self.stackedobs[..., -obs.shape[-1]:] = obs
|
||||
|
@@ -1,29 +0,0 @@
|
||||
from . import VecEnvWrapper
|
||||
import numpy as np
|
||||
|
||||
|
||||
class VecMonitor(VecEnvWrapper):
|
||||
def __init__(self, venv):
|
||||
VecEnvWrapper.__init__(self, venv)
|
||||
self.eprets = None
|
||||
self.eplens = None
|
||||
|
||||
def reset(self):
|
||||
obs = self.venv.reset()
|
||||
self.eprets = np.zeros(self.num_envs, 'f')
|
||||
self.eplens = np.zeros(self.num_envs, 'i')
|
||||
return obs
|
||||
|
||||
def step_wait(self):
|
||||
obs, rews, dones, infos = self.venv.step_wait()
|
||||
self.eprets += rews
|
||||
self.eplens += 1
|
||||
newinfos = []
|
||||
for (i, (done, ret, eplen, info)) in enumerate(zip(dones, self.eprets, self.eplens, infos)):
|
||||
info = info.copy()
|
||||
if done:
|
||||
info['episode'] = {'r': ret, 'l': eplen}
|
||||
self.eprets[i] = 0
|
||||
self.eplens[i] = 0
|
||||
newinfos.append(info)
|
||||
return obs, rews, dones, newinfos
|
@@ -1,18 +1,17 @@
|
||||
from . import VecEnvWrapper
|
||||
from baselines.common.vec_env import VecEnvWrapper
|
||||
from baselines.common.running_mean_std import RunningMeanStd
|
||||
import numpy as np
|
||||
|
||||
|
||||
class VecNormalize(VecEnvWrapper):
|
||||
"""
|
||||
A vectorized wrapper that normalizes the observations
|
||||
and returns from an environment.
|
||||
Vectorized environment base class
|
||||
"""
|
||||
|
||||
def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
|
||||
VecEnvWrapper.__init__(self, venv)
|
||||
self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
|
||||
self.ret_rms = RunningMeanStd(shape=()) if ret else None
|
||||
#self.ob_rms = TfRunningMeanStd(shape=self.observation_space.shape, scope='observation_running_mean_std') if ob else None
|
||||
#self.ret_rms = TfRunningMeanStd(shape=(), scope='return_running_mean_std') if ret else None
|
||||
self.clipob = clipob
|
||||
self.cliprew = cliprew
|
||||
self.ret = np.zeros(self.num_envs)
|
||||
@@ -20,6 +19,12 @@ class VecNormalize(VecEnvWrapper):
|
||||
self.epsilon = epsilon
|
||||
|
||||
def step_wait(self):
|
||||
"""
|
||||
Apply sequence of actions to sequence of environments
|
||||
actions -> (observations, rewards, news)
|
||||
|
||||
where 'news' is a boolean vector indicating whether each element is new.
|
||||
"""
|
||||
obs, rews, news, infos = self.venv.step_wait()
|
||||
self.ret = self.ret * self.gamma + rews
|
||||
obs = self._obfilt(obs)
|
||||
@@ -37,5 +42,8 @@ class VecNormalize(VecEnvWrapper):
|
||||
return obs
|
||||
|
||||
def reset(self):
|
||||
"""
|
||||
Reset all environments
|
||||
"""
|
||||
obs = self.venv.reset()
|
||||
return self._obfilt(obs)
|
||||
|
@@ -32,7 +32,7 @@ In particular notice that once `deepq.learn` finishes training it returns `act`
|
||||
|
||||
|
||||
- [baselines/deepq/experiments/custom_cartpole.py](experiments/custom_cartpole.py) - Cartpole training with more fine grained control over the internals of DQN algorithm.
|
||||
- [baselines/deepq/experiments/run_atari.py](experiments/run_atari.py) - more robust setup for training at scale.
|
||||
- [baselines/deepq/experiments/atari/train.py](experiments/atari/train.py) - more robust setup for training at scale.
|
||||
|
||||
|
||||
##### Download a pretrained Atari agent
|
||||
|
12351
benchmarks_atari10M.htm
12351
benchmarks_atari10M.htm
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user