Compare commits: tuple_pdty...fix_build
26 commits
| Author | SHA1 | Date |
|---|---|---|
| | 8d9e20fec3 | |
| | fc23c78c77 | |
| | 25f750d84f | |
| | 391811d98c | |
| | 665b888eeb | |
| | f40a477a17 | |
| | c6144bdb6a | |
| | adba88b218 | |
| | bfbc3bae14 | |
| | f703776c91 | |
| | 53797293e5 | |
| | 229a772b81 | |
| | d80b075904 | |
| | 0182fe1877 | |
| | 1fb4dfb780 | |
| | 7cadef715f | |
| | fce4370ba2 | |
| | c57528573e | |
| | 2bca7901f5 | |
| | ba2b017820 | |
| | 7c520852d9 | |
| | 1c872ca8fd | |
| | ff8d36a7a7 | |
| | 7614b02f7a | |
| | f7d5a265e1 | |
| | 21776e8f57 | |
@@ -11,7 +11,7 @@ WORKDIR $CODE_DIR/baselines
 # Clean up pycache and pyc files
 RUN rm -rf __pycache__ && \
     find . -name "*.pyc" -delete && \
-    pip install tensorflow && \
+    pip install 'tensorflow < 2' && \
     pip install -e .[test]
README.md (28 lines changed)
@@ -1,4 +1,4 @@
-**Status:** Active (under active development, breaking changes may occur)
+**Status:** Maintenance (expect bug fixes and minor updates)
 
 <img src="data/logo.jpg" width=25% align="right" /> [](https://travis-ci.org/openai/baselines)
 
@@ -39,21 +39,24 @@ To activate a virtualenv:
 More thorough tutorial on virtualenvs and options can be found [here](https://virtualenv.pypa.io/en/stable/)
 
+## Tensorflow versions
+
+The master branch supports Tensorflow from version 1.4 to 1.14. For Tensorflow 2.0 support, please use tf2 branch.
+
 ## Installation
 - Clone the repo and cd into it:
     ```bash
     git clone https://github.com/openai/baselines.git
     cd baselines
     ```
-- If you don't have TensorFlow installed already, install your favourite flavor of TensorFlow. In most cases,
+- If you don't have TensorFlow installed already, install your favourite flavor of TensorFlow. In most cases, you may use
     ```bash
-    pip install tensorflow-gpu # if you have a CUDA-compatible gpu and proper drivers
+    pip install tensorflow-gpu==1.14 # if you have a CUDA-compatible gpu and proper drivers
    ```
     or
     ```bash
-    pip install tensorflow
+    pip install tensorflow==1.14
    ```
-    should be sufficient. Refer to [TensorFlow installation guide](https://www.tensorflow.org/install/)
+    to install Tensorflow 1.14, which is the latest version of Tensorflow supported by the master branch. Refer to [TensorFlow installation guide](https://www.tensorflow.org/install/)
     for more details.
 
 - Install baselines package
@@ -98,6 +101,8 @@ python -m baselines.run --alg=deepq --env=PongNoFrameskip-v4 --num_timesteps=1e6
 ```
 
 ## Saving, loading and visualizing models
 
+### Saving and loading the model
 The algorithms serialization API is not properly unified yet; however, there is a simple method to save / restore trained models.
 `--save_path` and `--load_path` command-line option loads the tensorflow state from a given path before training, and saves it after the training, respectively.
 Let's imagine you'd like to train ppo2 on Atari Pong, save the model and then later visualize what has it learnt.
@@ -111,8 +116,17 @@ python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num_timesteps=0 --
 
 *NOTE:* Mujoco environments require normalization to work properly, so we wrap them with VecNormalize wrapper. Currently, to ensure the models are saved with normalization (so that trained models can be restored and run without further training) the normalization coefficients are saved as tensorflow variables. This can decrease the performance somewhat, so if you require high-throughput steps with Mujoco and do not need saving/restoring the models, it may make sense to use numpy normalization instead. To do that, set 'use_tf=False` in [baselines/run.py](baselines/run.py#L116).
 
-## Loading and vizualizing learning curves and other training metrics
-See [here](docs/viz/viz.ipynb) for instructions on how to load and display the training data.
+### Logging and vizualizing learning curves and other training metrics
+By default, all summary data, including progress, standard output, is saved to a unique directory in a temp folder, specified by a call to Python's [tempfile.gettempdir()](https://docs.python.org/3/library/tempfile.html#tempfile.gettempdir).
+The directory can be changed with the `--log_path` command-line option.
+```bash
+python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num_timesteps=2e7 --save_path=~/models/pong_20M_ppo2 --log_path=~/logs/Pong/
+```
+*NOTE:* Please be aware that the logger will overwrite files of the same name in an existing directory, thus it's recommended that folder names be given a unique timestamp to prevent overwritten logs.
+
+Another way the temp directory can be changed is through the use of the `$OPENAI_LOGDIR` environment variable.
+
+For examples on how to load and display the training data, see [here](docs/viz/viz.ipynb).
 
 ## Subpackages
@@ -6,7 +6,7 @@ from baselines import logger
 
 from baselines.common import set_global_seeds
 from baselines.common.policies import build_policy
-from baselines.common.tf_util import get_session, save_variables
+from baselines.common.tf_util import get_session, save_variables, load_variables
 from baselines.common.vec_env.vec_frame_stack import VecFrameStack
 
 from baselines.a2c.utils import batch_to_seq, seq_to_batch
@@ -216,7 +216,8 @@ class Model(object):
 
         self.train = train
-        self.save = functools.partial(save_variables, sess=sess, variables=params)
+        self.save = functools.partial(save_variables, sess=sess)
+        self.load = functools.partial(load_variables, sess=sess)
         self.train_model = train_model
         self.step_model = step_model
         self._step = _step
@@ -358,6 +359,9 @@ def learn(network, env, seed=None, nsteps=20, total_timesteps=int(80e6), q_coef=
                   total_timesteps=total_timesteps, lrschedule=lrschedule, c=c,
                   trust_region=trust_region, alpha=alpha, delta=delta)
 
+    if load_path is not None:
+        model.load(load_path)
+
     runner = Runner(env=env, model=model, nsteps=nsteps)
     if replay_ratio > 0:
         buffer = Buffer(env=env, nsteps=nsteps, size=buffer_size)
@@ -1,2 +1,3 @@
|
||||
# flake8: noqa F403
|
||||
from baselines.bench.benchmarks import *
|
||||
from baselines.bench.monitor import *
|
||||
|
@@ -1,5 +1,4 @@
|
||||
import re
|
||||
import os.path as osp
|
||||
import os
|
||||
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
|
@@ -1,13 +1,11 @@
 __all__ = ['Monitor', 'get_monitor_files', 'load_results']
 
-import gym
 from gym.core import Wrapper
 import time
 from glob import glob
 import csv
 import os.path as osp
 import json
-import numpy as np
 
 class Monitor(Wrapper):
     EXT = "monitor.csv"
@@ -162,27 +160,3 @@ def load_results(dir):
     df['t'] -= min(header['t_start'] for header in headers)
     df.headers = headers # HACK to preserve backwards compatibility
     return df
-
-def test_monitor():
-    env = gym.make("CartPole-v1")
-    env.seed(0)
-    mon_file = "/tmp/baselines-test-%s.monitor.csv" % uuid.uuid4()
-    menv = Monitor(env, mon_file)
-    menv.reset()
-    for _ in range(1000):
-        _, _, done, _ = menv.step(0)
-        if done:
-            menv.reset()
-
-    f = open(mon_file, 'rt')
-
-    firstline = f.readline()
-    assert firstline.startswith('#')
-    metadata = json.loads(firstline[1:])
-    assert metadata['env_id'] == "CartPole-v1"
-    assert set(metadata.keys()) == {'env_id', 'gym_version', 't_start'}, "Incorrect keys in monitor metadata"
-
-    last_logline = pandas.read_csv(f, index_col=None)
-    assert set(last_logline.keys()) == {'l', 't', 'r'}, "Incorrect keys in monitor logline"
-    f.close()
-    os.remove(mon_file)
baselines/bench/test_monitor.py (new file, 31 lines)
@@ -0,0 +1,31 @@
+from .monitor import Monitor
+import gym
+import json
+
+def test_monitor():
+    import pandas
+    import os
+    import uuid
+
+    env = gym.make("CartPole-v1")
+    env.seed(0)
+    mon_file = "/tmp/baselines-test-%s.monitor.csv" % uuid.uuid4()
+    menv = Monitor(env, mon_file)
+    menv.reset()
+    for _ in range(1000):
+        _, _, done, _ = menv.step(0)
+        if done:
+            menv.reset()
+
+    f = open(mon_file, 'rt')
+
+    firstline = f.readline()
+    assert firstline.startswith('#')
+    metadata = json.loads(firstline[1:])
+    assert metadata['env_id'] == "CartPole-v1"
+    assert set(metadata.keys()) == {'env_id', 't_start'}, "Incorrect keys in monitor metadata"
+
+    last_logline = pandas.read_csv(f, index_col=None)
+    assert set(last_logline.keys()) == {'l', 't', 'r'}, "Incorrect keys in monitor logline"
+    f.close()
+    os.remove(mon_file)
@@ -254,6 +254,13 @@ class LazyFrames(object):
         return len(self._force())
 
     def __getitem__(self, i):
         return self._force()[i]
 
+    def count(self):
+        frames = self._force()
+        return frames.shape[frames.ndim - 1]
+
+    def frame(self, i):
+        return self._force()[..., i]
+
 def make_atari(env_id, max_episode_steps=None):
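For context, a minimal usage sketch of the new `LazyFrames.count()` and `LazyFrames.frame()` helpers (assumes `gym` with the Atari extras installed; `make_atari` and `wrap_deepmind` are the existing helpers in this module):

```python
# Hedged sketch: with frame_stack=True, reset() returns a LazyFrames object,
# so count() and frame(i) can inspect the stacked frames without copying them.
from baselines.common.atari_wrappers import make_atari, wrap_deepmind

env = wrap_deepmind(make_atari('PongNoFrameskip-v4'), frame_stack=True)
obs = env.reset()            # a LazyFrames instance
print(obs.count())           # number of stacked frames, e.g. 4
print(obs.frame(0).shape)    # one frame, e.g. (84, 84)
```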
@@ -9,7 +9,7 @@ except ImportError:
     MPI = None
 
 import gym
-from gym.wrappers import FlattenDictWrapper
+from gym.wrappers import FlattenObservation, FilterObservation
 from baselines import logger
 from baselines.bench import Monitor
 from baselines.common import set_global_seeds
@@ -81,8 +81,7 @@ def make_env(env_id, env_type, mpi_rank=0, subrank=0, seed=None, reward_scale=1.
         env = gym.make(env_id, **env_kwargs)
 
     if flatten_dict_observations and isinstance(env.observation_space, gym.spaces.Dict):
-        keys = env.observation_space.spaces.keys()
-        env = gym.wrappers.FlattenDictWrapper(env, dict_keys=list(keys))
+        env = FlattenObservation(env)
 
     env.seed(seed + subrank if seed is not None else None)
     env = Monitor(env,
@@ -128,7 +127,7 @@ def make_robotics_env(env_id, seed, rank=0):
     """
     set_global_seeds(seed)
     env = gym.make(env_id)
-    env = FlattenDictWrapper(env, ['observation', 'desired_goal'])
+    env = FlattenObservation(FilterObservation(env, ['observation', 'desired_goal']))
     env = Monitor(
         env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
         info_keywords=('is_success',))
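To illustrate the wrapper swap above, a hedged sketch of the replacement pattern (assumes `gym>=0.15`, matching the new setup.py pin, and a Dict-observation environment; `FetchReach-v1` is used here only as an example and needs the mujoco-based robotics extras):

```python
# Hedged sketch: FilterObservation keeps the selected dict keys and
# FlattenObservation flattens what remains into a single Box space,
# replacing the removed FlattenDictWrapper.
import gym
from gym.wrappers import FlattenObservation, FilterObservation

env = gym.make('FetchReach-v1')                 # Dict observation space
env = FlattenObservation(FilterObservation(env, ['observation', 'desired_goal']))
print(env.observation_space)                    # a flat Box space
```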
@@ -170,6 +169,7 @@ def common_arg_parser():
     parser.add_argument('--save_path', help='Path to save trained model to', default=None, type=str)
     parser.add_argument('--save_video_interval', help='Save video every x steps (0 = disabled)', default=0, type=int)
     parser.add_argument('--save_video_length', help='Length of recorded video. Default: 200', default=200, type=int)
+    parser.add_argument('--log_path', help='Directory to save learning curve data.', default=None, type=str)
     parser.add_argument('--play', default=False, action='store_true')
     return parser
@@ -186,7 +186,7 @@ def robotics_arg_parser():
 
 def parse_unknown_args(args):
     """
-    Parse arguments not consumed by arg parser into a dicitonary
+    Parse arguments not consumed by arg parser into a dictionary
     """
     retval = {}
     preceded_by_key = False
@@ -65,7 +65,7 @@ def check_synced(localval, comm=None):
     vals = comm.gather(localval)
     if comm.rank == 0:
         assert all(val==vals[0] for val in vals[1:]),\
-            f'MpiAdamOptimizer detected that different workers have different weights: {vals}'
+            'MpiAdamOptimizer detected that different workers have different weights: {}'.format(vals)
 
 @with_mpi(timeout=5)
 def test_nonfreeze():
@@ -12,8 +12,9 @@ def mpi_mean(x, axis=0, comm=None, keepdims=False):
     localsum = np.zeros(n+1, x.dtype)
     localsum[:n] = xsum.ravel()
     localsum[n] = x.shape[axis]
-    globalsum = np.zeros_like(localsum)
-    comm.Allreduce(localsum, globalsum, op=MPI.SUM)
+    # globalsum = np.zeros_like(localsum)
+    # comm.Allreduce(localsum, globalsum, op=MPI.SUM)
+    globalsum = comm.allreduce(localsum, op=MPI.SUM)
     return globalsum[:n].reshape(xsum.shape) / globalsum[n], globalsum[n]
 
 def mpi_moments(x, axis=0, comm=None, keepdims=False):
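A hedged usage sketch of `mpi_mean` after the switch to `comm.allreduce` (assumes numpy and mpi4py are installed; with a single process it simply returns the local mean, and under `mpirun` the statistics are pooled across workers):

```python
# Hedged sketch: each worker passes its local batch; the returned mean and
# row count are aggregated over every MPI worker.
import numpy as np
from baselines.common.mpi_moments import mpi_mean

x = np.arange(12, dtype=np.float64).reshape(4, 3)   # this worker's batch
mean, count = mpi_mean(x, axis=0)
print(mean, count)   # per-column mean and total number of rows seen
```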
@@ -70,9 +70,11 @@ class ShmemVecEnv(VecEnv):
         assert len(actions) == len(self.parent_pipes)
         for pipe, act in zip(self.parent_pipes, actions):
             pipe.send(('step', act))
+        self.waiting_step = True
 
     def step_wait(self):
         outs = [pipe.recv() for pipe in self.parent_pipes]
+        self.waiting_step = False
         obs, rews, dones, infos = zip(*outs)
         return self._decode_obses(obs), np.array(rews), np.array(dones), infos
@@ -4,33 +4,36 @@ import numpy as np
 from .vec_env import VecEnv, CloudpickleWrapper, clear_mpi_env_vars
 
 
-def worker(remote, parent_remote, env_fn_wrapper):
+def worker(remote, parent_remote, env_fn_wrappers):
+    def step_env(env, action):
+        ob, reward, done, info = env.step(action)
+        if done:
+            ob = env.reset()
+        return ob, reward, done, info
+
     parent_remote.close()
-    env = env_fn_wrapper.x()
+    envs = [env_fn_wrapper() for env_fn_wrapper in env_fn_wrappers.x]
     try:
         while True:
             cmd, data = remote.recv()
             if cmd == 'step':
-                ob, reward, done, info = env.step(data)
-                if done:
-                    ob = env.reset()
-                remote.send((ob, reward, done, info))
+                remote.send([step_env(env, action) for env, action in zip(envs, data)])
             elif cmd == 'reset':
-                ob = env.reset()
-                remote.send(ob)
+                remote.send([env.reset() for env in envs])
             elif cmd == 'render':
-                remote.send(env.render(mode='rgb_array'))
+                remote.send([env.render(mode='rgb_array') for env in envs])
             elif cmd == 'close':
                 remote.close()
                 break
             elif cmd == 'get_spaces_spec':
-                remote.send((env.observation_space, env.action_space, env.spec))
+                remote.send(CloudpickleWrapper((envs[0].observation_space, envs[0].action_space, envs[0].spec)))
             else:
                 raise NotImplementedError
     except KeyboardInterrupt:
         print('SubprocVecEnv worker: got KeyboardInterrupt')
     finally:
-        env.close()
+        for env in envs:
+            env.close()
 
 
 class SubprocVecEnv(VecEnv):
@@ -38,17 +41,23 @@ class SubprocVecEnv(VecEnv):
     VecEnv that runs multiple environments in parallel in subproceses and communicates with them via pipes.
     Recommended to use when num_envs > 1 and step() can be a bottleneck.
     """
-    def __init__(self, env_fns, spaces=None, context='spawn'):
+    def __init__(self, env_fns, spaces=None, context='spawn', in_series=1):
         """
         Arguments:
 
         env_fns: iterable of callables - functions that create environments to run in subprocesses. Need to be cloud-pickleable
+        in_series: number of environments to run in series in a single process
+        (e.g. when len(env_fns) == 12 and in_series == 3, it will run 4 processes, each running 3 envs in series)
         """
         self.waiting = False
         self.closed = False
+        self.in_series = in_series
         nenvs = len(env_fns)
+        assert nenvs % in_series == 0, "Number of envs must be divisible by number of envs to run in series"
+        self.nremotes = nenvs // in_series
+        env_fns = np.array_split(env_fns, self.nremotes)
         ctx = mp.get_context(context)
-        self.remotes, self.work_remotes = zip(*[ctx.Pipe() for _ in range(nenvs)])
+        self.remotes, self.work_remotes = zip(*[ctx.Pipe() for _ in range(self.nremotes)])
         self.ps = [ctx.Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn)))
                    for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)]
         for p in self.ps:
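A hedged usage sketch of the new `in_series` option (assumes `gym` is installed and the snippet is run as a script, since the 'spawn' start method needs picklable environment constructors): 12 constructors are packed into 4 worker processes, each stepping 3 environments in series.

```python
# Hedged sketch: fewer worker processes, each running several envs back to back.
import gym
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

def make_env():
    return gym.make('CartPole-v1')

if __name__ == '__main__':
    venv = SubprocVecEnv([make_env for _ in range(12)], in_series=3)
    obs = venv.reset()                                   # batched obs, shape (12, 4)
    actions = [venv.action_space.sample() for _ in range(venv.num_envs)]
    obs, rews, dones, infos = venv.step(actions)
    venv.close()
```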
@@ -59,12 +68,13 @@ class SubprocVecEnv(VecEnv):
             remote.close()
 
         self.remotes[0].send(('get_spaces_spec', None))
-        observation_space, action_space, self.spec = self.remotes[0].recv()
+        observation_space, action_space, self.spec = self.remotes[0].recv().x
         self.viewer = None
-        VecEnv.__init__(self, len(env_fns), observation_space, action_space)
+        VecEnv.__init__(self, nenvs, observation_space, action_space)
 
     def step_async(self, actions):
         self._assert_not_closed()
+        actions = np.array_split(actions, self.nremotes)
         for remote, action in zip(self.remotes, actions):
             remote.send(('step', action))
         self.waiting = True
@@ -72,6 +82,7 @@ class SubprocVecEnv(VecEnv):
     def step_wait(self):
         self._assert_not_closed()
         results = [remote.recv() for remote in self.remotes]
+        results = _flatten_list(results)
         self.waiting = False
         obs, rews, dones, infos = zip(*results)
         return _flatten_obs(obs), np.stack(rews), np.stack(dones), infos
@@ -80,7 +91,9 @@ class SubprocVecEnv(VecEnv):
         self._assert_not_closed()
         for remote in self.remotes:
             remote.send(('reset', None))
-        return _flatten_obs([remote.recv() for remote in self.remotes])
+        obs = [remote.recv() for remote in self.remotes]
+        obs = _flatten_list(obs)
+        return _flatten_obs(obs)
 
     def close_extras(self):
         self.closed = True
@@ -97,6 +110,7 @@ class SubprocVecEnv(VecEnv):
         for pipe in self.remotes:
             pipe.send(('render', None))
         imgs = [pipe.recv() for pipe in self.remotes]
+        imgs = _flatten_list(imgs)
         return imgs
 
     def _assert_not_closed(self):
@@ -115,3 +129,10 @@ def _flatten_obs(obs):
         return {k: np.stack([o[k] for o in obs]) for k in keys}
     else:
         return np.stack(obs)
+
+def _flatten_list(l):
+    assert isinstance(l, (list, tuple))
+    assert len(l) > 0
+    assert all([len(l_) > 0 for l_ in l])
+
+    return [l__ for l_ in l for l__ in l_]
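For intuition, a tiny hedged sketch of what the `_flatten_list` helper added above does (it is a private helper of this module):

```python
# Hedged sketch: _flatten_list concatenates one level of nesting, e.g. the
# per-process result lists coming back from the worker pipes.
from baselines.common.vec_env.subproc_vec_env import _flatten_list

print(_flatten_list([[1, 2, 3], [4, 5], [6]]))   # [1, 2, 3, 4, 5, 6]
```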
@@ -67,6 +67,50 @@ def test_vec_env(klass, dtype): # pylint: disable=R0914
     assert_venvs_equal(env1, env2, num_steps=num_steps)
 
 
+@pytest.mark.parametrize('dtype', ('uint8', 'float32'))
+@pytest.mark.parametrize('num_envs_in_series', (3, 4, 6))
+def test_sync_sampling(dtype, num_envs_in_series):
+    """
+    Test that a SubprocVecEnv running with envs in series
+    outputs the same as DummyVecEnv.
+    """
+    num_envs = 12
+    num_steps = 100
+    shape = (3, 8)
+
+    def make_fn(seed):
+        """
+        Get an environment constructor with a seed.
+        """
+        return lambda: SimpleEnv(seed, shape, dtype)
+    fns = [make_fn(i) for i in range(num_envs)]
+    env1 = DummyVecEnv(fns)
+    env2 = SubprocVecEnv(fns, in_series=num_envs_in_series)
+    assert_venvs_equal(env1, env2, num_steps=num_steps)
+
+
+@pytest.mark.parametrize('dtype', ('uint8', 'float32'))
+@pytest.mark.parametrize('num_envs_in_series', (3, 4, 6))
+def test_sync_sampling_sanity(dtype, num_envs_in_series):
+    """
+    Test that a SubprocVecEnv running with envs in series
+    outputs the same as SubprocVecEnv without running in series.
+    """
+    num_envs = 12
+    num_steps = 100
+    shape = (3, 8)
+
+    def make_fn(seed):
+        """
+        Get an environment constructor with a seed.
+        """
+        return lambda: SimpleEnv(seed, shape, dtype)
+    fns = [make_fn(i) for i in range(num_envs)]
+    env1 = SubprocVecEnv(fns)
+    env2 = SubprocVecEnv(fns, in_series=num_envs_in_series)
+    assert_venvs_equal(env1, env2, num_steps=num_steps)
+
+
 class SimpleEnv(gym.Env):
     """
     An environment with a pre-determined observation space
@@ -38,6 +38,9 @@ def obs_space_info(obs_space):
     if isinstance(obs_space, gym.spaces.Dict):
         assert isinstance(obs_space.spaces, OrderedDict)
         subspaces = obs_space.spaces
+    elif isinstance(obs_space, gym.spaces.Tuple):
+        assert isinstance(obs_space.spaces, tuple)
+        subspaces = {i: obs_space.spaces[i] for i in range(len(obs_space.spaces))}
     else:
         subspaces = {None: obs_space}
     keys = []
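An illustrative, hedged sketch of the new Tuple branch (assumes `gym` is installed; the printed values are indicative):

```python
# Hedged sketch: a Tuple observation space is split into one keyed subspace
# per component, with integer keys 0..len-1.
import gym
from baselines.common.vec_env.util import obs_space_info

space = gym.spaces.Tuple((gym.spaces.Discrete(4),
                          gym.spaces.Box(low=-1.0, high=1.0, shape=(3,))))
keys, shapes, dtypes = obs_space_info(space)
print(keys)     # [0, 1]
print(shapes)   # {0: (), 1: (3,)}
```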
@@ -378,11 +378,6 @@ class DDPG(object):
             self.param_noise_stddev: self.param_noise.current_stddev,
         })
 
-        if MPI is not None:
-            mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
-        else:
-            mean_distance = distance
-
         if MPI is not None:
             mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
         else:
@@ -13,7 +13,7 @@ The functions in this file can are used to create the following functions:
     stochastic: bool
         if set to False all the actions are always deterministic (default False)
     update_eps_ph: float
-        update epsilon a new value, if negative not update happens
+        update epsilon a new value, if negative no update happens
         (default: no update)
 
     Returns
@@ -142,9 +142,8 @@ def learn(env,
         final value of random action probability
     train_freq: int
         update the model every `train_freq` steps.
-        set to None to disable printing
     batch_size: int
-        size of a batched sampled from replay buffer for training
+        size of a batch sampled from replay buffer for training
     print_freq: int
         how often to print out training progress
         set to None to disable printing
@@ -2,101 +2,6 @@ import tensorflow as tf
 import tensorflow.contrib.layers as layers
 
 
-def _mlp(hiddens, input_, num_actions, scope, reuse=False, layer_norm=False):
-    with tf.variable_scope(scope, reuse=reuse):
-        out = input_
-        for hidden in hiddens:
-            out = layers.fully_connected(out, num_outputs=hidden, activation_fn=None)
-            if layer_norm:
-                out = layers.layer_norm(out, center=True, scale=True)
-            out = tf.nn.relu(out)
-        q_out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None)
-        return q_out
-
-
-def mlp(hiddens=[], layer_norm=False):
-    """This model takes as input an observation and returns values of all actions.
-
-    Parameters
-    ----------
-    hiddens: [int]
-        list of sizes of hidden layers
-    layer_norm: bool
-        if true applies layer normalization for every layer
-        as described in https://arxiv.org/abs/1607.06450
-
-    Returns
-    -------
-    q_func: function
-        q_function for DQN algorithm.
-    """
-    return lambda *args, **kwargs: _mlp(hiddens, layer_norm=layer_norm, *args, **kwargs)
-
-
-def _cnn_to_mlp(convs, hiddens, dueling, input_, num_actions, scope, reuse=False, layer_norm=False):
-    with tf.variable_scope(scope, reuse=reuse):
-        out = input_
-        with tf.variable_scope("convnet"):
-            for num_outputs, kernel_size, stride in convs:
-                out = layers.convolution2d(out,
-                                           num_outputs=num_outputs,
-                                           kernel_size=kernel_size,
-                                           stride=stride,
-                                           activation_fn=tf.nn.relu)
-        conv_out = layers.flatten(out)
-        with tf.variable_scope("action_value"):
-            action_out = conv_out
-            for hidden in hiddens:
-                action_out = layers.fully_connected(action_out, num_outputs=hidden, activation_fn=None)
-                if layer_norm:
-                    action_out = layers.layer_norm(action_out, center=True, scale=True)
-                action_out = tf.nn.relu(action_out)
-            action_scores = layers.fully_connected(action_out, num_outputs=num_actions, activation_fn=None)
-
-        if dueling:
-            with tf.variable_scope("state_value"):
-                state_out = conv_out
-                for hidden in hiddens:
-                    state_out = layers.fully_connected(state_out, num_outputs=hidden, activation_fn=None)
-                    if layer_norm:
-                        state_out = layers.layer_norm(state_out, center=True, scale=True)
-                    state_out = tf.nn.relu(state_out)
-                state_score = layers.fully_connected(state_out, num_outputs=1, activation_fn=None)
-            action_scores_mean = tf.reduce_mean(action_scores, 1)
-            action_scores_centered = action_scores - tf.expand_dims(action_scores_mean, 1)
-            q_out = state_score + action_scores_centered
-        else:
-            q_out = action_scores
-        return q_out
-
-
-def cnn_to_mlp(convs, hiddens, dueling=False, layer_norm=False):
-    """This model takes as input an observation and returns values of all actions.
-
-    Parameters
-    ----------
-    convs: [(int, int, int)]
-        list of convolutional layers in form of
-        (num_outputs, kernel_size, stride)
-    hiddens: [int]
-        list of sizes of hidden layers
-    dueling: bool
-        if true double the output MLP to compute a baseline
-        for action scores
-    layer_norm: bool
-        if true applies layer normalization for every layer
-        as described in https://arxiv.org/abs/1607.06450
-
-    Returns
-    -------
-    q_func: function
-        q_function for DQN algorithm.
-    """
-
-    return lambda *args, **kwargs: _cnn_to_mlp(convs, hiddens, dueling, layer_norm=layer_norm, *args, **kwargs)
-
-
 def build_q_func(network, hiddens=[256], dueling=True, layer_norm=False, **network_kwargs):
     if isinstance(network, str):
         from baselines.common.models import get_network_builder
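Since the `mlp`/`cnn_to_mlp` helpers above are removed, a hedged sketch of obtaining a q-function through `build_q_func` instead (assumes a TF1.x install, as pinned elsewhere in this change; the network name 'mlp' and the extra kwargs are existing options of `baselines.common.models`):

```python
# Hedged sketch: build a DQN q-function from a registered network name
# rather than the removed mlp()/cnn_to_mlp() constructors.
from baselines.deepq.models import build_q_func

q_func = build_q_func('mlp', hiddens=[256], dueling=True,
                      num_layers=2, num_hidden=64)
# deepq.learn / build_train later call q_func(observation_placeholder,
# num_actions, scope, reuse) to produce the Q-value tensor.
```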
@@ -23,7 +23,7 @@ from baselines.gail.dataset.mujoco_dset import Mujoco_Dset
 
 def argsparser():
     parser = argparse.ArgumentParser("Tensorflow Implementation of Behavior Cloning")
-    parser.add_argument('--env_id', help='environment ID', default='Hopper-v1')
+    parser.add_argument('--env_id', help='environment ID', default='Hopper-v2')
     parser.add_argument('--seed', help='RNG seed', type=int, default=0)
     parser.add_argument('--expert_path', type=str, default='data/deterministic.trpo.Hopper.0.00.npz')
     parser.add_argument('--checkpoint_dir', help='the directory to save model', default='checkpoint')
@@ -73,7 +73,7 @@ def learn(env, policy_func, dataset, optim_batch_size=128, max_iters=1e4,
         savedir_fname = tempfile.TemporaryDirectory().name
     else:
         savedir_fname = osp.join(ckpt_dir, task_name)
-    U.save_state(savedir_fname, var_list=pi.get_variables())
+    U.save_variables(savedir_fname, variables=pi.get_variables())
     return savedir_fname
@@ -77,7 +77,7 @@ class Mujoco_Dset(object):
         self.log_info()
 
     def log_info(self):
-        logger.log("Total trajectorues: %d" % self.num_traj)
+        logger.log("Total trajectories: %d" % self.num_traj)
         logger.log("Total transitions: %d" % self.num_transition)
         logger.log("Average returns: %f" % self.avg_ret)
         logger.log("Std for returns: %f" % self.std_ret)
@@ -165,7 +165,7 @@ def runner(env, policy_func, load_model_path, timesteps_per_batch, number_trajs,
     U.initialize()
     # Prepare for rollouts
     # ----------------------------------------
-    U.load_state(load_model_path)
+    U.load_variables(load_model_path)
 
     obs_list = []
     acs_list = []
@@ -15,8 +15,7 @@ class RolloutWorker:
     """Rollout worker generates experience by interacting with one or many environments.
 
     Args:
-        make_env (function): a factory function that creates a new instance of the environment
-            when called
+        venv: vectorized gym environments.
         policy (object): the policy that is used to act
         dims (dict of ints): the dimensions for observations (o), goals (g), and actions (u)
         logger (object): the logger that is used by the rollout worker
@@ -379,7 +379,8 @@ def configure(dir=None, format_strs=None, comm=None, log_suffix=''):
         dir = osp.join(tempfile.gettempdir(),
             datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"))
     assert isinstance(dir, str)
-    os.makedirs(dir, exist_ok=True)
+    dir = os.path.expanduser(dir)
+    os.makedirs(os.path.expanduser(dir), exist_ok=True)
 
     rank = get_rank_without_mpi_import()
     if rank > 0:
@@ -394,7 +395,8 @@ def configure(dir=None, format_strs=None, comm=None, log_suffix=''):
     output_formats = [make_output_format(f, dir, log_suffix) for f in format_strs]
 
     Logger.CURRENT = Logger(dir=dir, output_formats=output_formats, comm=comm)
-    log('Logging to %s'%dir)
+    if output_formats:
+        log('Logging to %s'%dir)
 
 def _configure_default_logger():
     configure()
@@ -32,7 +32,7 @@ except ImportError:
 _game_envs = defaultdict(set)
 for env in gym.envs.registry.all():
     # TODO: solve this with regexes
-    env_type = env._entry_point.split(':')[0].split('.')[-1]
+    env_type = env.entry_point.split(':')[0].split('.')[-1]
     _game_envs[env_type].add(env.id)
 
 # reading benchmark names directly from retro requires
@@ -126,7 +126,7 @@ def get_env_type(args):
 
     # Re-parse the gym registry, since we could have new envs since last time.
     for env in gym.envs.registry.all():
-        env_type = env._entry_point.split(':')[0].split('.')[-1]
+        env_type = env.entry_point.split(':')[0].split('.')[-1]
         _game_envs[env_type].add(env.id)  # This is a set so add is idempotent
 
     if env_id in _game_envs.keys():
@@ -192,6 +192,12 @@ def parse_cmdline_kwargs(args):
     return {k: parse(v) for k,v in parse_unknown_args(args).items()}
 
 
+def configure_logger(log_path, **kwargs):
+    if log_path is not None:
+        logger.configure(log_path)
+    else:
+        logger.configure(**kwargs)
+
 def main(args):
     # configure logger, disable logging in child MPI processes (with rank > 0)
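A small hedged sketch of what the new helper wraps (the directory path is only an example; `baselines.logger.configure` accepts a directory and optional format strings):

```python
# Hedged sketch: rank-0 processes log to a directory, child MPI processes
# pass format_strs=[] so they stay quiet.
from baselines import logger

logger.configure('/tmp/baselines-demo-logs')   # log to an explicit directory
logger.log('hello from the baselines logger')
logger.configure(format_strs=[])               # reconfigure with no outputs
```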
@@ -202,10 +208,10 @@ def main(args):
 
     if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
         rank = 0
-        logger.configure()
+        configure_logger(args.log_path)
     else:
-        logger.configure(format_strs=[])
         rank = MPI.COMM_WORLD.Get_rank()
+        configure_logger(args.log_path, format_strs=[])
 
     model, env = train(args, extra_args)
@@ -220,7 +226,7 @@ def main(args):
         state = model.initial_state if hasattr(model, 'initial_state') else None
         dones = np.zeros((1,))
 
-        episode_rew = 0
+        episode_rew = np.zeros(env.num_envs) if isinstance(env, VecEnv) else np.zeros(1)
         while True:
             if state is not None:
                 actions, _, state, _ = model.step(obs,S=state, M=dones)
@@ -228,13 +234,13 @@ def main(args):
                 actions, _, _, _ = model.step(obs)
 
             obs, rew, done, _ = env.step(actions)
-            episode_rew += rew[0] if isinstance(env, VecEnv) else rew
+            episode_rew += rew
             env.render()
-            done = done.any() if isinstance(done, np.ndarray) else done
-            if done:
-                print('episode_rew={}'.format(episode_rew))
-                episode_rew = 0
-                obs = env.reset()
+            done_any = done.any() if isinstance(done, np.ndarray) else done
+            if done_any:
+                for i in np.nonzero(done)[0]:
+                    print('episode_rew={}'.format(episode_rew[i]))
+                    episode_rew[i] = 0
 
         env.close()
setup.py (4 lines changed)
@@ -31,7 +31,7 @@ setup(name='baselines',
       packages=[package for package in find_packages()
                 if package.startswith('baselines')],
       install_requires=[
-          'gym>=0.10.0, <1.0.0',
+          'gym>=0.15.4, <0.16.0',
           'scipy',
           'tqdm',
           'joblib',
@@ -44,7 +44,7 @@ setup(name='baselines',
       author='OpenAI',
       url='https://github.com/openai/baselines',
       author_email='gym@openai.com',
-      version='0.1.5')
+      version='0.1.6')
 
 
 # ensure there is some tensorflow build with version above 1.4