Compare commits


1 commit

Author: Jonathan Raiman
SHA1: 401a89e515
Message: add tuple pdtype
Date: 2019-05-23 15:43:48 -07:00
28 changed files with 316 additions and 193 deletions

View File

@@ -11,7 +11,7 @@ WORKDIR $CODE_DIR/baselines
# Clean up pycache and pyc files
RUN rm -rf __pycache__ && \
find . -name "*.pyc" -delete && \
pip install 'tensorflow < 2' && \
pip install tensorflow && \
pip install -e .[test]

View File

@@ -1,4 +1,4 @@
**Status:** Maintenance (expect bug fixes and minor updates)
**Status:** Active (under active development, breaking changes may occur)
<img src="data/logo.jpg" width=25% align="right" /> [![Build status](https://travis-ci.org/openai/baselines.svg?branch=master)](https://travis-ci.org/openai/baselines)
@@ -39,24 +39,21 @@ To activate a virtualenv:
A more thorough tutorial on virtualenvs and options can be found [here](https://virtualenv.pypa.io/en/stable/)
## Tensorflow versions
The master branch supports Tensorflow from version 1.4 to 1.14. For Tensorflow 2.0 support, please use tf2 branch.
## Installation
- Clone the repo and cd into it:
```bash
git clone https://github.com/openai/baselines.git
cd baselines
```
- If you don't have TensorFlow installed already, install your favourite flavor of TensorFlow. In most cases, you may use
- If you don't have TensorFlow installed already, install your favourite flavor of TensorFlow. In most cases,
```bash
pip install tensorflow-gpu==1.14 # if you have a CUDA-compatible gpu and proper drivers
pip install tensorflow-gpu # if you have a CUDA-compatible gpu and proper drivers
```
or
```bash
pip install tensorflow==1.14
pip install tensorflow
```
to install Tensorflow 1.14, which is the latest version of Tensorflow supported by the master branch. Refer to [TensorFlow installation guide](https://www.tensorflow.org/install/)
should be sufficient. Refer to [TensorFlow installation guide](https://www.tensorflow.org/install/)
for more details.
- Install baselines package
@@ -101,8 +98,6 @@ python -m baselines.run --alg=deepq --env=PongNoFrameskip-v4 --num_timesteps=1e6
```
## Saving, loading and visualizing models
### Saving and loading the model
The algorithms' serialization API is not properly unified yet; however, there is a simple method to save and restore trained models.
The `--load_path` and `--save_path` command-line options load the tensorflow state from a given path before training and save it after training, respectively.
Let's imagine you'd like to train ppo2 on Atari Pong, save the model, and then later visualize what it has learned.
@@ -116,17 +111,8 @@ python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num_timesteps=0 --
*NOTE:* Mujoco environments require normalization to work properly, so we wrap them with the VecNormalize wrapper. Currently, to ensure the models are saved with normalization (so that trained models can be restored and run without further training) the normalization coefficients are saved as tensorflow variables. This can decrease performance somewhat, so if you require high-throughput steps with Mujoco and do not need to save/restore models, it may make sense to use numpy normalization instead. To do that, set `use_tf=False` in [baselines/run.py](baselines/run.py#L116).
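Concretely, the save-then-replay pair of commands looks like this (paths are illustrative):
```bash
# train PPO2 on Atari Pong for 20M timesteps and save the final weights
python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num_timesteps=2e7 --save_path=~/models/pong_20M_ppo2
# reload the weights without further training (num_timesteps=0) and watch the agent play
python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num_timesteps=0 --load_path=~/models/pong_20M_ppo2 --play
```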
### Logging and visualizing learning curves and other training metrics
By default, all summary data, including progress and standard output, is saved to a unique directory in a temp folder, specified by a call to Python's [tempfile.gettempdir()](https://docs.python.org/3/library/tempfile.html#tempfile.gettempdir).
The directory can be changed with the `--log_path` command-line option.
```bash
python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num_timesteps=2e7 --save_path=~/models/pong_20M_ppo2 --log_path=~/logs/Pong/
```
*NOTE:* Please be aware that the logger will overwrite files of the same name in an existing directory; it is therefore recommended to give folder names a unique timestamp so that logs are not overwritten.
Another way the temp directory can be changed is through the use of the `$OPENAI_LOGDIR` environment variable.
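A minimal sketch of the environment-variable route (POSIX shell assumed; `OPENAI_LOG_FORMAT` optionally selects the output formats):
```bash
export OPENAI_LOGDIR=$HOME/logs/Pong
export OPENAI_LOG_FORMAT=stdout,csv,tensorboard
python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num_timesteps=2e7
```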
For examples on how to load and display the training data, see [here](docs/viz/viz.ipynb).
## Loading and visualizing learning curves and other training metrics
See [here](docs/viz/viz.ipynb) for instructions on how to load and display the training data.
## Subpackages

View File

@@ -6,7 +6,7 @@ from baselines import logger
from baselines.common import set_global_seeds
from baselines.common.policies import build_policy
from baselines.common.tf_util import get_session, save_variables, load_variables
from baselines.common.tf_util import get_session, save_variables
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
from baselines.a2c.utils import batch_to_seq, seq_to_batch
@@ -216,8 +216,7 @@ class Model(object):
self.train = train
self.save = functools.partial(save_variables, sess=sess)
self.load = functools.partial(load_variables, sess=sess)
self.save = functools.partial(save_variables, sess=sess, variables=params)
self.train_model = train_model
self.step_model = step_model
self._step = _step
@@ -359,9 +358,6 @@ def learn(network, env, seed=None, nsteps=20, total_timesteps=int(80e6), q_coef=
total_timesteps=total_timesteps, lrschedule=lrschedule, c=c,
trust_region=trust_region, alpha=alpha, delta=delta)
if load_path is not None:
model.load(load_path)
runner = Runner(env=env, model=model, nsteps=nsteps)
if replay_ratio > 0:
buffer = Buffer(env=env, nsteps=nsteps, size=buffer_size)

View File

@@ -1,3 +1,2 @@
# flake8: noqa F403
from baselines.bench.benchmarks import *
from baselines.bench.monitor import *

View File

@@ -1,4 +1,5 @@
import re
import os.path as osp
import os
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

View File

@@ -1,11 +1,13 @@
__all__ = ['Monitor', 'get_monitor_files', 'load_results']
import gym
from gym.core import Wrapper
import time
from glob import glob
import csv
import os.path as osp
import json
import numpy as np
class Monitor(Wrapper):
EXT = "monitor.csv"
@@ -77,7 +79,6 @@ class Monitor(Wrapper):
self.total_steps += 1
def close(self):
super(Monitor, self).close()
if self.f is not None:
self.f.close()
@@ -161,3 +162,27 @@ def load_results(dir):
df['t'] -= min(header['t_start'] for header in headers)
df.headers = headers # HACK to preserve backwards compatibility
return df
def test_monitor():
import pandas
import os
import uuid
env = gym.make("CartPole-v1")
env.seed(0)
mon_file = "/tmp/baselines-test-%s.monitor.csv" % uuid.uuid4()
menv = Monitor(env, mon_file)
menv.reset()
for _ in range(1000):
_, _, done, _ = menv.step(0)
if done:
menv.reset()
f = open(mon_file, 'rt')
firstline = f.readline()
assert firstline.startswith('#')
metadata = json.loads(firstline[1:])
assert metadata['env_id'] == "CartPole-v1"
assert set(metadata.keys()) == {'env_id', 'gym_version', 't_start'}, "Incorrect keys in monitor metadata"
last_logline = pandas.read_csv(f, index_col=None)
assert set(last_logline.keys()) == {'l', 't', 'r'}, "Incorrect keys in monitor logline"
f.close()
os.remove(mon_file)

View File

@@ -1,31 +0,0 @@
from .monitor import Monitor
import gym
import json
def test_monitor():
import pandas
import os
import uuid
env = gym.make("CartPole-v1")
env.seed(0)
mon_file = "/tmp/baselines-test-%s.monitor.csv" % uuid.uuid4()
menv = Monitor(env, mon_file)
menv.reset()
for _ in range(1000):
_, _, done, _ = menv.step(0)
if done:
menv.reset()
f = open(mon_file, 'rt')
firstline = f.readline()
assert firstline.startswith('#')
metadata = json.loads(firstline[1:])
assert metadata['env_id'] == "CartPole-v1"
assert set(metadata.keys()) == {'env_id', 't_start'}, "Incorrect keys in monitor metadata"
last_logline = pandas.read_csv(f, index_col=None)
assert set(last_logline.keys()) == {'l', 't', 'r'}, "Incorrect keys in monitor logline"
f.close()
os.remove(mon_file)

View File

@@ -254,13 +254,6 @@ class LazyFrames(object):
return len(self._force())
def __getitem__(self, i):
return self._force()[i]
def count(self):
frames = self._force()
return frames.shape[frames.ndim - 1]
def frame(self, i):
return self._force()[..., i]
def make_atari(env_id, max_episode_steps=None):

View File

@@ -9,7 +9,7 @@ except ImportError:
MPI = None
import gym
from gym.wrappers import FlattenObservation, FilterObservation
from gym.wrappers import FlattenDictWrapper
from baselines import logger
from baselines.bench import Monitor
from baselines.common import set_global_seeds
@@ -81,7 +81,8 @@ def make_env(env_id, env_type, mpi_rank=0, subrank=0, seed=None, reward_scale=1.
env = gym.make(env_id, **env_kwargs)
if flatten_dict_observations and isinstance(env.observation_space, gym.spaces.Dict):
env = FlattenObservation(env)
keys = env.observation_space.spaces.keys()
env = gym.wrappers.FlattenDictWrapper(env, dict_keys=list(keys))
env.seed(seed + subrank if seed is not None else None)
env = Monitor(env,
@@ -127,7 +128,7 @@ def make_robotics_env(env_id, seed, rank=0):
"""
set_global_seeds(seed)
env = gym.make(env_id)
env = FlattenObservation(FilterObservation(env, ['observation', 'desired_goal']))
env = FlattenDictWrapper(env, ['observation', 'desired_goal'])
env = Monitor(
env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
info_keywords=('is_success',))
@@ -169,7 +170,6 @@ def common_arg_parser():
parser.add_argument('--save_path', help='Path to save trained model to', default=None, type=str)
parser.add_argument('--save_video_interval', help='Save video every x steps (0 = disabled)', default=0, type=int)
parser.add_argument('--save_video_length', help='Length of recorded video. Default: 200', default=200, type=int)
parser.add_argument('--log_path', help='Directory to save learning curve data.', default=None, type=str)
parser.add_argument('--play', default=False, action='store_true')
return parser
@@ -186,7 +186,7 @@ def robotics_arg_parser():
def parse_unknown_args(args):
"""
Parse arguments not consumed by arg parser into a dictionary
Parse arguments not consumed by arg parser into a dicitonary
"""
retval = {}
preceded_by_key = False
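On a hypothetical input the intended behavior looks like this (values stay strings; both `--key value` and `--key=value` forms are assumed to be accepted):
```python
parse_unknown_args(['--lr', '3e-4', '--gamma=0.99'])
# -> {'lr': '3e-4', 'gamma': '0.99'}
```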

View File

@@ -275,6 +275,133 @@ class BernoulliPd(Pd):
def fromflat(cls, flat):
return cls(flat)
def _np_cast(x, dtype):
"""Numpy cast, equivalent to tf.cast"""
return x.astype(dtype)
def decode_tuple_sample(pdtypes, x):
"""
Cast and convert a sample from its dense concatenated state back to constituent parts.
Arguments
---------
:param pdtypes: list<PdType>, a TuplePdType's child PdTypes.
:param x: np.ndarray or tf.Tensor.
Shape is [..., sum(pdtype.sample_shape for pdtype in pdtypes)]
:return output, list<np.ndarray> or list<tf.Tensor>, the split and correctly cast
policy samples.
"""
if isinstance(x, np.ndarray):
cast_fn = _np_cast
numpy_casting = True
else:
cast_fn = tf.cast
numpy_casting = False
so_far = 0
xs = []
for pdtype in pdtypes:
sample_size = pdtype.sample_shape()[0] if len(pdtype.sample_shape()) > 0 else 1
if len(pdtype.sample_shape()) == 0:
slided_x = x[..., so_far]
else:
slided_x = x[..., so_far:so_far + sample_size]
desired_dtype = pdtype.sample_dtype()
if numpy_casting:
desired_dtype = desired_dtype.as_numpy_dtype
if desired_dtype != x.dtype:
slided_x = cast_fn(slided_x, desired_dtype)
xs.append(slided_x)
so_far += sample_size
return xs
class TuplePd(Pd):
def __init__(self, sample_dtype, pdtypes, logits):
self.pdtypes = pdtypes
self.sample_dtype = sample_dtype
self.pds = []
so_far = 0
for pdtype in self.pdtypes:
param_shape = pdtype.param_shape()[0]
self.pds.append(pdtype.pdfromflat(logits[..., so_far:so_far + param_shape]))
so_far += param_shape
def flatparam(self):
return tf.concat([pd.flatparam() for pd in self.pds], axis=-1)
def mode(self):
return self.tuple_sample_concat([pd.mode() for pd in self.pds])
def tuple_sample_concat(self, samples):
out = []
for sample, pdtype in zip(samples, self.pdtypes):
if len(pdtype.sample_shape()) == 0:
sample = tf.expand_dims(sample, axis=-1)
if sample.dtype != self.sample_dtype:
sample = tf.cast(sample, self.sample_dtype)
out.append(sample)
return tf.concat(out, axis=-1)
def sample(self):
return self.tuple_sample_concat([pd.sample() for pd in self.pds])
def neglogp(self, x):
return tf.add_n([pd.neglogp(xi) for pd, xi in zip(self.pds, decode_tuple_sample(self.pdtypes, x))])
def entropy(self):
return tf.add_n([pd.entropy() for pd in self.pds])
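Because the constituent distributions are treated as independent, the joint density factorizes, so the negative log-probability of a concatenated sample is the sum of the per-component terms, `-log p(x) = -sum_i log p_i(x_i)`; independence likewise makes entropies additive, which is why both `neglogp` and `entropy` reduce with `tf.add_n`.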
def _dtype_promotion(old, new):
"""
Find the highest precision common ground between two tensorflow datatypes.
if old is None, it is ignored.
"""
if old is None or (new.is_floating and old.is_integer):
return new
if old.is_floating and new.is_integer:
return old
if (old.is_floating and new.is_floating) or (old.is_integer and new.is_integer):
# take the largest type (e.g. float64 over float32)
return old if old.size > new.size else new
raise ValueError("No idea how to promote {} and {}.".format(old, new))
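A few concrete promotions under these rules (a sketch; sizes come from `tf.DType.size`):
```python
_dtype_promotion(None, tf.int32)          # -> tf.int32   (old is None, take new)
_dtype_promotion(tf.int32, tf.float32)    # -> tf.float32 (float wins over int)
_dtype_promotion(tf.float32, tf.int64)    # -> tf.float32 (float wins over int)
_dtype_promotion(tf.float32, tf.float64)  # -> tf.float64 (larger of the two floats)
```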
class TuplePdType(PdType):
def __init__(self, space):
self.internal_pdtypes = [make_pdtype(space) for space in space.spaces]
def decode_sample(self, x):
return decode_tuple_sample(self.internal_pdtypes, x)
def pdclass(self):
return TuplePd
def pdfromflat(self, flat):
return TuplePd(self.sample_dtype(), self.internal_pdtypes, flat)
def param_shape(self):
return [sum([pdtype.param_shape()[0]
for pdtype in self.internal_pdtypes])]
def sample_shape(self):
return [sum([pdtype.sample_shape()[0] if len(pdtype.sample_shape()) > 0 else 1
for pdtype in self.internal_pdtypes])]
def sample_dtype(self):
dtype = None
for pdtype in self.internal_pdtypes:
dtype = _dtype_promotion(dtype, pdtype.sample_dtype())
return dtype
def make_pdtype(ac_space):
from gym import spaces
if isinstance(ac_space, spaces.Box):
@@ -286,9 +413,12 @@ def make_pdtype(ac_space):
return MultiCategoricalPdType(ac_space.nvec)
elif isinstance(ac_space, spaces.MultiBinary):
return BernoulliPdType(ac_space.n)
elif isinstance(ac_space, spaces.Tuple):
return TuplePdType(ac_space)
else:
raise NotImplementedError
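A minimal end-to-end sketch of the new tuple path (assumes TensorFlow 1.x and gym; the shapes follow the `param_shape`/`sample_shape` definitions above):
```python
import numpy as np
from gym import spaces
from baselines.common.distributions import make_pdtype

space = spaces.Tuple((spaces.Discrete(4),
                      spaces.Box(low=-1.0, high=1.0, shape=(2,), dtype=np.float32)))
pdtype = make_pdtype(space)   # -> TuplePdType
pdtype.param_shape()          # [8]: 4 categorical logits + 2*2 gaussian mean/logstd params
pdtype.sample_shape()         # [3]: the scalar discrete sample is counted as length 1
pdtype.sample_dtype()         # float32: promotion of the int and float component dtypes

# a dense sample splits back into per-space pieces, cast to each component's dtype
x = np.array([[2.0, 0.5, -0.25]], dtype=np.float32)
parts = pdtype.decode_sample(x)  # [int array of shape (1,), float32 array of shape (1, 2)]
```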
def shape_el(v, i):
maybe = v.get_shape()[i]
if maybe is not None:

View File

@@ -65,7 +65,7 @@ def check_synced(localval, comm=None):
vals = comm.gather(localval)
if comm.rank == 0:
assert all(val==vals[0] for val in vals[1:]),\
'MpiAdamOptimizer detected that different workers have different weights: {}'.format(vals)
f'MpiAdamOptimizer detected that different workers have different weights: {vals}'
@with_mpi(timeout=5)
def test_nonfreeze():

View File

@@ -12,9 +12,8 @@ def mpi_mean(x, axis=0, comm=None, keepdims=False):
localsum = np.zeros(n+1, x.dtype)
localsum[:n] = xsum.ravel()
localsum[n] = x.shape[axis]
# globalsum = np.zeros_like(localsum)
# comm.Allreduce(localsum, globalsum, op=MPI.SUM)
globalsum = comm.allreduce(localsum, op=MPI.SUM)
globalsum = np.zeros_like(localsum)
comm.Allreduce(localsum, globalsum, op=MPI.SUM)
return globalsum[:n].reshape(xsum.shape) / globalsum[n], globalsum[n]
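The trick is to pack the local sum and the local element count into a single buffer so that one `Allreduce` yields both the global numerator and denominator; a standalone sketch with mpi4py (file name hypothetical):
```python
# run under e.g. `mpiexec -n 4 python mean_demo.py`
from mpi4py import MPI
import numpy as np

comm = MPI.COMM_WORLD
x = np.random.randn(3, 5)                        # each rank holds different rows
xsum = x.sum(axis=0)                             # local sum along the reduced axis
localsum = np.concatenate([xsum.ravel(), [x.shape[0]]])
globalsum = np.zeros_like(localsum)
comm.Allreduce(localsum, globalsum, op=MPI.SUM)  # elementwise sum across ranks
mean = globalsum[:-1].reshape(xsum.shape) / globalsum[-1]
count = globalsum[-1]                            # total row count across all ranks
```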
def mpi_moments(x, axis=0, comm=None, keepdims=False):

View File

@@ -70,11 +70,9 @@ class ShmemVecEnv(VecEnv):
assert len(actions) == len(self.parent_pipes)
for pipe, act in zip(self.parent_pipes, actions):
pipe.send(('step', act))
self.waiting_step = True
def step_wait(self):
outs = [pipe.recv() for pipe in self.parent_pipes]
self.waiting_step = False
obs, rews, dones, infos = zip(*outs)
return self._decode_obses(obs), np.array(rews), np.array(dones), infos

View File

@@ -4,36 +4,33 @@ import numpy as np
from .vec_env import VecEnv, CloudpickleWrapper, clear_mpi_env_vars
def worker(remote, parent_remote, env_fn_wrappers):
def step_env(env, action):
ob, reward, done, info = env.step(action)
if done:
ob = env.reset()
return ob, reward, done, info
def worker(remote, parent_remote, env_fn_wrapper):
parent_remote.close()
envs = [env_fn_wrapper() for env_fn_wrapper in env_fn_wrappers.x]
env = env_fn_wrapper.x()
try:
while True:
cmd, data = remote.recv()
if cmd == 'step':
remote.send([step_env(env, action) for env, action in zip(envs, data)])
ob, reward, done, info = env.step(data)
if done:
ob = env.reset()
remote.send((ob, reward, done, info))
elif cmd == 'reset':
remote.send([env.reset() for env in envs])
ob = env.reset()
remote.send(ob)
elif cmd == 'render':
remote.send([env.render(mode='rgb_array') for env in envs])
remote.send(env.render(mode='rgb_array'))
elif cmd == 'close':
remote.close()
break
elif cmd == 'get_spaces_spec':
remote.send(CloudpickleWrapper((envs[0].observation_space, envs[0].action_space, envs[0].spec)))
remote.send((env.observation_space, env.action_space, env.spec))
else:
raise NotImplementedError
except KeyboardInterrupt:
print('SubprocVecEnv worker: got KeyboardInterrupt')
finally:
for env in envs:
env.close()
env.close()
class SubprocVecEnv(VecEnv):
@@ -41,23 +38,17 @@ class SubprocVecEnv(VecEnv):
VecEnv that runs multiple environments in parallel in subprocesses and communicates with them via pipes.
Recommended to use when num_envs > 1 and step() can be a bottleneck.
"""
def __init__(self, env_fns, spaces=None, context='spawn', in_series=1):
def __init__(self, env_fns, spaces=None, context='spawn'):
"""
Arguments:
env_fns: iterable of callables - functions that create environments to run in subprocesses. Need to be cloud-pickleable
in_series: number of environments to run in series in a single process
(e.g. when len(env_fns) == 12 and in_series == 3, it will run 4 processes, each running 3 envs in series)
"""
self.waiting = False
self.closed = False
self.in_series = in_series
nenvs = len(env_fns)
assert nenvs % in_series == 0, "Number of envs must be divisible by number of envs to run in series"
self.nremotes = nenvs // in_series
env_fns = np.array_split(env_fns, self.nremotes)
ctx = mp.get_context(context)
self.remotes, self.work_remotes = zip(*[ctx.Pipe() for _ in range(self.nremotes)])
self.remotes, self.work_remotes = zip(*[ctx.Pipe() for _ in range(nenvs)])
self.ps = [ctx.Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn)))
for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)]
for p in self.ps:
@@ -68,13 +59,12 @@ class SubprocVecEnv(VecEnv):
remote.close()
self.remotes[0].send(('get_spaces_spec', None))
observation_space, action_space, self.spec = self.remotes[0].recv().x
observation_space, action_space, self.spec = self.remotes[0].recv()
self.viewer = None
VecEnv.__init__(self, nenvs, observation_space, action_space)
VecEnv.__init__(self, len(env_fns), observation_space, action_space)
def step_async(self, actions):
self._assert_not_closed()
actions = np.array_split(actions, self.nremotes)
for remote, action in zip(self.remotes, actions):
remote.send(('step', action))
self.waiting = True
@@ -82,7 +72,6 @@ class SubprocVecEnv(VecEnv):
def step_wait(self):
self._assert_not_closed()
results = [remote.recv() for remote in self.remotes]
results = _flatten_list(results)
self.waiting = False
obs, rews, dones, infos = zip(*results)
return _flatten_obs(obs), np.stack(rews), np.stack(dones), infos
@@ -91,9 +80,7 @@ class SubprocVecEnv(VecEnv):
self._assert_not_closed()
for remote in self.remotes:
remote.send(('reset', None))
obs = [remote.recv() for remote in self.remotes]
obs = _flatten_list(obs)
return _flatten_obs(obs)
return _flatten_obs([remote.recv() for remote in self.remotes])
def close_extras(self):
self.closed = True
@@ -110,7 +97,6 @@ class SubprocVecEnv(VecEnv):
for pipe in self.remotes:
pipe.send(('render', None))
imgs = [pipe.recv() for pipe in self.remotes]
imgs = _flatten_list(imgs)
return imgs
def _assert_not_closed(self):
@@ -129,10 +115,3 @@ def _flatten_obs(obs):
return {k: np.stack([o[k] for o in obs]) for k in keys}
else:
return np.stack(obs)
def _flatten_list(l):
assert isinstance(l, (list, tuple))
assert len(l) > 0
assert all([len(l_) > 0 for l_ in l])
return [l__ for l_ in l for l__ in l_]
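For reference, the series-batching arithmetic that `in_series` relies on is chunk-then-flatten; a self-contained sketch of the bookkeeping:
```python
import numpy as np

num_envs, in_series = 12, 3
nremotes = num_envs // in_series                 # 4 worker processes, 3 envs each
actions = list(range(num_envs))
per_remote = np.array_split(actions, nremotes)   # one chunk of actions per process

# each worker returns one result per env it runs in series...
results = [[('obs', int(a)) for a in chunk] for chunk in per_remote]
# ...and flattening restores the flat per-env order (what _flatten_list does)
flat = [r for rs in results for r in rs]
assert len(flat) == num_envs
```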

View File

@@ -67,50 +67,6 @@ def test_vec_env(klass, dtype): # pylint: disable=R0914
assert_venvs_equal(env1, env2, num_steps=num_steps)
@pytest.mark.parametrize('dtype', ('uint8', 'float32'))
@pytest.mark.parametrize('num_envs_in_series', (3, 4, 6))
def test_sync_sampling(dtype, num_envs_in_series):
"""
Test that a SubprocVecEnv running with envs in series
outputs the same as DummyVecEnv.
"""
num_envs = 12
num_steps = 100
shape = (3, 8)
def make_fn(seed):
"""
Get an environment constructor with a seed.
"""
return lambda: SimpleEnv(seed, shape, dtype)
fns = [make_fn(i) for i in range(num_envs)]
env1 = DummyVecEnv(fns)
env2 = SubprocVecEnv(fns, in_series=num_envs_in_series)
assert_venvs_equal(env1, env2, num_steps=num_steps)
@pytest.mark.parametrize('dtype', ('uint8', 'float32'))
@pytest.mark.parametrize('num_envs_in_series', (3, 4, 6))
def test_sync_sampling_sanity(dtype, num_envs_in_series):
"""
Test that a SubprocVecEnv running with envs in series
outputs the same as SubprocVecEnv without running in series.
"""
num_envs = 12
num_steps = 100
shape = (3, 8)
def make_fn(seed):
"""
Get an environment constructor with a seed.
"""
return lambda: SimpleEnv(seed, shape, dtype)
fns = [make_fn(i) for i in range(num_envs)]
env1 = SubprocVecEnv(fns)
env2 = SubprocVecEnv(fns, in_series=num_envs_in_series)
assert_venvs_equal(env1, env2, num_steps=num_steps)
class SimpleEnv(gym.Env):
"""
An environment with a pre-determined observation space

View File

@@ -38,9 +38,6 @@ def obs_space_info(obs_space):
if isinstance(obs_space, gym.spaces.Dict):
assert isinstance(obs_space.spaces, OrderedDict)
subspaces = obs_space.spaces
elif isinstance(obs_space, gym.spaces.Tuple):
assert isinstance(obs_space.spaces, tuple)
subspaces = {i: obs_space.spaces[i] for i in range(len(obs_space.spaces))}
else:
subspaces = {None: obs_space}
keys = []

View File

@@ -378,6 +378,11 @@ class DDPG(object):
self.param_noise_stddev: self.param_noise.current_stddev,
})
if MPI is not None:
mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
else:
mean_distance = distance
if MPI is not None:
mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
else:

View File

@@ -13,7 +13,7 @@ The functions in this file are used to create the following functions:
stochastic: bool
if set to False all the actions are always deterministic (default False)
update_eps_ph: float
update epsilon a new value, if negative no update happens
update epsilon a new value, if negative not update happens
(default: no update)
Returns

View File

@@ -142,8 +142,9 @@ def learn(env,
final value of random action probability
train_freq: int
update the model every `train_freq` steps.
set to None to disable printing
batch_size: int
size of a batch sampled from replay buffer for training
size of a batched sampled from replay buffer for training
print_freq: int
how often to print out training progress
set to None to disable printing

View File

@@ -2,6 +2,101 @@ import tensorflow as tf
import tensorflow.contrib.layers as layers
def _mlp(hiddens, input_, num_actions, scope, reuse=False, layer_norm=False):
with tf.variable_scope(scope, reuse=reuse):
out = input_
for hidden in hiddens:
out = layers.fully_connected(out, num_outputs=hidden, activation_fn=None)
if layer_norm:
out = layers.layer_norm(out, center=True, scale=True)
out = tf.nn.relu(out)
q_out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None)
return q_out
def mlp(hiddens=[], layer_norm=False):
"""This model takes as input an observation and returns values of all actions.
Parameters
----------
hiddens: [int]
list of sizes of hidden layers
layer_norm: bool
if true applies layer normalization for every layer
as described in https://arxiv.org/abs/1607.06450
Returns
-------
q_func: function
q_function for DQN algorithm.
"""
return lambda *args, **kwargs: _mlp(hiddens, layer_norm=layer_norm, *args, **kwargs)
def _cnn_to_mlp(convs, hiddens, dueling, input_, num_actions, scope, reuse=False, layer_norm=False):
with tf.variable_scope(scope, reuse=reuse):
out = input_
with tf.variable_scope("convnet"):
for num_outputs, kernel_size, stride in convs:
out = layers.convolution2d(out,
num_outputs=num_outputs,
kernel_size=kernel_size,
stride=stride,
activation_fn=tf.nn.relu)
conv_out = layers.flatten(out)
with tf.variable_scope("action_value"):
action_out = conv_out
for hidden in hiddens:
action_out = layers.fully_connected(action_out, num_outputs=hidden, activation_fn=None)
if layer_norm:
action_out = layers.layer_norm(action_out, center=True, scale=True)
action_out = tf.nn.relu(action_out)
action_scores = layers.fully_connected(action_out, num_outputs=num_actions, activation_fn=None)
if dueling:
with tf.variable_scope("state_value"):
state_out = conv_out
for hidden in hiddens:
state_out = layers.fully_connected(state_out, num_outputs=hidden, activation_fn=None)
if layer_norm:
state_out = layers.layer_norm(state_out, center=True, scale=True)
state_out = tf.nn.relu(state_out)
state_score = layers.fully_connected(state_out, num_outputs=1, activation_fn=None)
action_scores_mean = tf.reduce_mean(action_scores, 1)
action_scores_centered = action_scores - tf.expand_dims(action_scores_mean, 1)
q_out = state_score + action_scores_centered
else:
q_out = action_scores
return q_out
def cnn_to_mlp(convs, hiddens, dueling=False, layer_norm=False):
"""This model takes as input an observation and returns values of all actions.
Parameters
----------
convs: [(int, int, int)]
list of convolutional layers in form of
(num_outputs, kernel_size, stride)
hiddens: [int]
list of sizes of hidden layers
dueling: bool
if true double the output MLP to compute a baseline
for action scores
layer_norm: bool
if true applies layer normalization for every layer
as described in https://arxiv.org/abs/1607.06450
Returns
-------
q_func: function
q_function for DQN algorithm.
"""
return lambda *args, **kwargs: _cnn_to_mlp(convs, hiddens, dueling, layer_norm=layer_norm, *args, **kwargs)
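A typical instantiation (the conv stack below is the common Atari configuration; illustrative rather than prescriptive):
```python
# dueling CNN-to-MLP Q-function template for an 84x84 Atari-style input
q_func = cnn_to_mlp(
    convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],  # (num_outputs, kernel_size, stride)
    hiddens=[256],
    dueling=True,
)
# the returned builder is later called as q_func(input_placeholder, num_actions, scope)
```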
def build_q_func(network, hiddens=[256], dueling=True, layer_norm=False, **network_kwargs):
if isinstance(network, str):
from baselines.common.models import get_network_builder

View File

@@ -23,7 +23,7 @@ from baselines.gail.dataset.mujoco_dset import Mujoco_Dset
def argsparser():
parser = argparse.ArgumentParser("Tensorflow Implementation of Behavior Cloning")
parser.add_argument('--env_id', help='environment ID', default='Hopper-v2')
parser.add_argument('--env_id', help='environment ID', default='Hopper-v1')
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
parser.add_argument('--expert_path', type=str, default='data/deterministic.trpo.Hopper.0.00.npz')
parser.add_argument('--checkpoint_dir', help='the directory to save model', default='checkpoint')
@@ -73,7 +73,7 @@ def learn(env, policy_func, dataset, optim_batch_size=128, max_iters=1e4,
savedir_fname = tempfile.TemporaryDirectory().name
else:
savedir_fname = osp.join(ckpt_dir, task_name)
U.save_variables(savedir_fname, variables=pi.get_variables())
U.save_state(savedir_fname, var_list=pi.get_variables())
return savedir_fname

View File

@@ -77,7 +77,7 @@ class Mujoco_Dset(object):
self.log_info()
def log_info(self):
logger.log("Total trajectories: %d" % self.num_traj)
logger.log("Total trajectorues: %d" % self.num_traj)
logger.log("Total transitions: %d" % self.num_transition)
logger.log("Average returns: %f" % self.avg_ret)
logger.log("Std for returns: %f" % self.std_ret)

View File

@@ -165,7 +165,7 @@ def runner(env, policy_func, load_model_path, timesteps_per_batch, number_trajs,
U.initialize()
# Prepare for rollouts
# ----------------------------------------
U.load_variables(load_model_path)
U.load_state(load_model_path)
obs_list = []
acs_list = []

View File

@@ -15,7 +15,8 @@ class RolloutWorker:
"""Rollout worker generates experience by interacting with one or many environments.
Args:
venv: vectorized gym environments.
make_env (function): a factory function that creates a new instance of the environment
when called
policy (object): the policy that is used to act
dims (dict of ints): the dimensions for observations (o), goals (g), and actions (u)
logger (object): the logger that is used by the rollout worker

View File

@@ -379,8 +379,7 @@ def configure(dir=None, format_strs=None, comm=None, log_suffix=''):
dir = osp.join(tempfile.gettempdir(),
datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"))
assert isinstance(dir, str)
dir = os.path.expanduser(dir)
os.makedirs(os.path.expanduser(dir), exist_ok=True)
os.makedirs(dir, exist_ok=True)
rank = get_rank_without_mpi_import()
if rank > 0:
@@ -395,8 +394,7 @@ def configure(dir=None, format_strs=None, comm=None, log_suffix=''):
output_formats = [make_output_format(f, dir, log_suffix) for f in format_strs]
Logger.CURRENT = Logger(dir=dir, output_formats=output_formats, comm=comm)
if output_formats:
log('Logging to %s'%dir)
log('Logging to %s'%dir)
def _configure_default_logger():
configure()

View File

@@ -32,7 +32,7 @@ except ImportError:
_game_envs = defaultdict(set)
for env in gym.envs.registry.all():
# TODO: solve this with regexes
env_type = env.entry_point.split(':')[0].split('.')[-1]
env_type = env._entry_point.split(':')[0].split('.')[-1]
_game_envs[env_type].add(env.id)
# reading benchmark names directly from retro requires
@@ -126,7 +126,7 @@ def get_env_type(args):
# Re-parse the gym registry, since we could have new envs since last time.
for env in gym.envs.registry.all():
env_type = env.entry_point.split(':')[0].split('.')[-1]
env_type = env._entry_point.split(':')[0].split('.')[-1]
_game_envs[env_type].add(env.id) # This is a set so add is idempotent
if env_id in _game_envs.keys():
@@ -192,12 +192,6 @@ def parse_cmdline_kwargs(args):
return {k: parse(v) for k,v in parse_unknown_args(args).items()}
def configure_logger(log_path, **kwargs):
if log_path is not None:
logger.configure(log_path)
else:
logger.configure(**kwargs)
def main(args):
# configure logger, disable logging in child MPI processes (with rank > 0)
@@ -208,10 +202,10 @@ def main(args):
if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
rank = 0
configure_logger(args.log_path)
logger.configure()
else:
logger.configure(format_strs=[])
rank = MPI.COMM_WORLD.Get_rank()
configure_logger(args.log_path, format_strs=[])
model, env = train(args, extra_args)
@@ -226,7 +220,7 @@ def main(args):
state = model.initial_state if hasattr(model, 'initial_state') else None
dones = np.zeros((1,))
episode_rew = np.zeros(env.num_envs) if isinstance(env, VecEnv) else np.zeros(1)
episode_rew = 0
while True:
if state is not None:
actions, _, state, _ = model.step(obs,S=state, M=dones)
@@ -234,13 +228,13 @@ def main(args):
actions, _, _, _ = model.step(obs)
obs, rew, done, _ = env.step(actions)
episode_rew += rew
episode_rew += rew[0] if isinstance(env, VecEnv) else rew
env.render()
done_any = done.any() if isinstance(done, np.ndarray) else done
if done_any:
for i in np.nonzero(done)[0]:
print('episode_rew={}'.format(episode_rew[i]))
episode_rew[i] = 0
done = done.any() if isinstance(done, np.ndarray) else done
if done:
print('episode_rew={}'.format(episode_rew))
episode_rew = 0
obs = env.reset()
env.close()
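Put together, the vectorized reward bookkeeping above reads as follows (a sketch assuming a `VecEnv` `env` and a trained `model`):
```python
episode_rew = np.zeros(env.num_envs)
obs = env.reset()
while True:
    actions, _, _, _ = model.step(obs)
    obs, rew, done, _ = env.step(actions)
    episode_rew += rew                    # rew is a vector, one entry per env
    env.render()
    for i in np.nonzero(done)[0]:         # envs that just finished an episode
        print('episode_rew={}'.format(episode_rew[i]))
        episode_rew[i] = 0                # reset only the finished envs
```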

View File

@@ -4,3 +4,4 @@ exclude =
.git,
__pycache__,
baselines/ppo1,
baselines/bench,

View File

@@ -31,7 +31,7 @@ setup(name='baselines',
packages=[package for package in find_packages()
if package.startswith('baselines')],
install_requires=[
'gym>=0.15.4, <0.16.0',
'gym>=0.10.0, <1.0.0',
'scipy',
'tqdm',
'joblib',
@@ -44,7 +44,7 @@ setup(name='baselines',
author='OpenAI',
url='https://github.com/openai/baselines',
author_email='gym@openai.com',
version='0.1.6')
version='0.1.5')
# ensure there is some tensorflow build with version above 1.4