Compare commits


21 Commits

Author  SHA1  Message  Date
Peter Zhokhov  c7a0c2781a  autopep8 and import fix  2018-10-23 13:42:44 -07:00
Peter Zhokhov  06c2fd2a3c  typo in registry.py  2018-10-23 11:16:20 -07:00
Peter Zhokhov  a52dcae856  added comments on registry usage, fixed typos in deepq and trpo_mpi registration  2018-10-23 11:14:48 -07:00
Peter Zhokhov  a8c2e643dc  import error in run.py  2018-10-23 10:11:48 -07:00
Peter Zhokhov  5ca31a7c25  merged latest master  2018-10-23 10:07:51 -07:00
Peter Zhokhov  35dcb6fd74  merged internal  2018-10-22 19:22:46 -07:00
Peter Zhokhov  c1c7c469a1  fix syntax  2018-10-22 19:19:54 -07:00
Peter Zhokhov  b4869bd271  use algorithm registry - staging for internal benchmarks  2018-10-22 19:13:10 -07:00
Peter Zhokhov  29cfb4a69c  Merge branch 'internal' of github.com:openai/baselines into peterz_learn_registration  2018-10-22 19:08:27 -07:00
Peter Zhokhov  bd7c479e04  merge master  2018-10-22 19:07:46 -07:00
Peter Zhokhov  3ddf69c4b5  defaults are handled through registry  2018-10-22 18:10:10 -07:00
Peter Zhokhov  bfdc552521  moving things around  2018-10-22 17:45:55 -07:00
Peter Zhokhov  bcb4d4f795  moved imports back to run  2018-10-22 17:15:24 -07:00
Peter Zhokhov  0c9b236475  using registry of algorithms  2018-10-22 17:01:49 -07:00
Peter Zhokhov  01884bb0eb  wrap retro envs correctly for other (non-deepq) algorithms  2018-10-22 14:21:26 -07:00
Peter Zhokhov  ade2d61be7  Merge branch 'master' of github.com:openai/games into peterz_track_baselines_branch  2018-10-19 17:27:57 -07:00
Peter Zhokhov  f6ef52a9df  Merge branch 'master' of github.com:openai/baselines into internal  2018-10-19 09:52:23 -07:00
Peter Zhokhov  8964d5ad45  flake8 and numpy.random.random_integers deprecation warning  2018-10-16 14:58:23 -07:00
Peter Zhokhov  8624bc629c  eval_done[d]==True -> eval_done[d]  2018-10-15 18:31:55 -07:00
Peter Zhokhov  7b33af0395  B -> nenvs for consistency with other algos, small cleanups  2018-10-15 18:29:48 -07:00
Peter Zhokhov  4bca9158a1  sync internal changes. Make ddpg work with vecenvs  2018-10-15 17:40:24 -07:00
34 changed files with 212 additions and 1630 deletions


@@ -11,4 +11,4 @@ install:
script:
- flake8 . --show-source --statistics
- docker run baselines-test pytest -v --forked .
- docker run baselines-test pytest -v .


@@ -1,9 +1,16 @@
FROM python:3.6
RUN apt-get -y update && apt-get -y install ffmpeg
# RUN apt-get -y update && apt-get -y install git wget python-dev python3-dev libopenmpi-dev python-pip zlib1g-dev cmake python-opencv
FROM ubuntu:16.04
RUN apt-get -y update && apt-get -y install git wget python-dev python3-dev libopenmpi-dev python-pip zlib1g-dev cmake python-opencv
ENV CODE_DIR /root/code
ENV VENV /root/venv
RUN \
pip install virtualenv && \
virtualenv $VENV --python=python3 && \
. $VENV/bin/activate && \
pip install --upgrade pip
ENV PATH=$VENV/bin:$PATH
COPY . $CODE_DIR/baselines
WORKDIR $CODE_DIR/baselines


@@ -109,9 +109,17 @@ python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num_timesteps=0 --
*NOTE:* At the moment, Mujoco training uses the VecNormalize wrapper for the environment, which is not being saved correctly; loading models trained on Mujoco will therefore not work well if the environment is recreated. If necessary, you can work around that by replacing RunningMeanStd with TfRunningMeanStd in [baselines/common/vec_env/vec_normalize.py](baselines/common/vec_env/vec_normalize.py#L12). This way, the mean and std of the environment-normalizing wrapper will be saved in tensorflow variables and included in the model file; however, training is slower that way, hence it is not included by default.
## Loading and visualizing learning curves and other training metrics
See [here](docs/viz/viz.ipynb) for instructions on how to load and display the training data.
## Using baselines with TensorBoard
Baselines logger can save data in the TensorBoard format. To do so, set environment variables `OPENAI_LOG_FORMAT` and `OPENAI_LOGDIR`:
```bash
export OPENAI_LOG_FORMAT='stdout,log,csv,tensorboard' # formats are comma-separated, but for tensorboard you only really need the last one
export OPENAI_LOGDIR=path/to/tensorboard/data
```
And you can now start TensorBoard with:
```bash
tensorboard --logdir=$OPENAI_LOGDIR
```
## Subpackages
- [A2C](baselines/a2c)
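The TensorBoard formats above can also be set programmatically instead of through environment variables; a minimal sketch, assuming the `logger.configure(dir=..., format_strs=...)` signature of this vintage of baselines:

```python
# Minimal sketch: configure the baselines logger in code rather than via
# OPENAI_LOG_FORMAT / OPENAI_LOGDIR. The configure keyword names (dir,
# format_strs) are assumed from this era of the library.
from baselines import logger

logger.configure(dir='/tmp/openai-logdir',
                 format_strs=['stdout', 'log', 'csv', 'tensorboard'])
logger.logkv('example_metric', 1.0)  # record a key/value pair
logger.dumpkvs()                     # flush to all configured formats
```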


@@ -0,0 +1,12 @@
# explicitly import sub-packages to register algorithms
import baselines.a2c.a2c
import baselines.acer.acer
import baselines.acktr.acktr
import baselines.deepq.deepq
import baselines.ddpg.ddpg
import baselines.ppo2.ppo2
# not really sure why flake8 complains only about trpo_mpi here...
import baselines.trpo_mpi.trpo_mpi # noqa: F401
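Registration happens as a decorator side effect, so nothing is registered until the module defining the learn function is actually imported; that is what the explicit imports above are for. A minimal sketch of the pattern (`register` mirrors baselines/registry.py further down in this diff; `my_algo` is a hypothetical module, not part of this change):

```python
# Minimal sketch of decorator-based registration by import side effect.
registry = {}

def register(name, supports_vecenv=True, defaults={}):
    def _thunk(learn_fn):
        registry[name] = dict(fn=learn_fn, supports_vecenv=supports_vecenv,
                              defaults=defaults)
        return learn_fn
    return _thunk

@register('my_algo')  # runs only when this module is imported
def learn(env, network):
    return None

# This is why baselines/__init__.py must import every algorithm sub-package
# explicitly before the registry is consulted.
assert 'my_algo' in registry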


@@ -2,13 +2,12 @@ import time
import functools
import tensorflow as tf
from baselines import logger
from baselines import logger, registry
from baselines.common import set_global_seeds, explained_variance
from baselines.common import tf_util
from baselines.common.policies import build_policy
from baselines.a2c.utils import Scheduler, find_trainable_variables
from baselines.a2c.runner import Runner
@@ -114,6 +113,7 @@ class Model(object):
tf.global_variables_initializer().run(session=sess)
@registry.register('a2c')
def learn(
network,
env,


@@ -2,7 +2,7 @@ import time
import functools
import numpy as np
import tensorflow as tf
from baselines import logger
from baselines import logger, registry
from baselines.common import set_global_seeds
from baselines.common.policies import build_policy
@@ -16,6 +16,7 @@ from baselines.a2c.utils import EpisodeStats
from baselines.a2c.utils import get_by_index, check_shape, avg_norm, gradient_add, q_explained_variance
from baselines.acer.buffer import Buffer
from baselines.acer.runner import Runner
from baselines.acer.defaults import defaults
# remove last step
def strip(var, nenvs, nsteps, flat = False):
@@ -270,7 +271,7 @@ class Acer():
logger.record_tabular(name, float(val))
logger.dump_tabular()
@registry.register('acer', defaults=defaults)
def learn(network, env, seed=None, nsteps=20, total_timesteps=int(80e6), q_coef=0.5, ent_coef=0.01,
max_grad_norm=10, lr=7e-4, lrschedule='linear', rprop_epsilon=1e-5, rprop_alpha=0.99, gamma=0.99,
log_interval=100, buffer_size=50000, replay_ratio=4, replay_start=10000, c=10.0,


@@ -1,4 +1,3 @@
def atari():
return dict(
lrschedule='constant'
)
defaults = {
'atari': dict(lrschedule='constant')
}
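The old per-env-type functions (`atari()` and friends) become a plain module-level `defaults` dict keyed by environment type, which `registry.register` attaches to the registry entry. A runnable sketch of the convention; the lookup at the end mirrors the `get_learn_function_defaults` rewrite shown later in this diff:

```python
# Sketch: defaults map env types to keyword overrides for learn().
defaults = {
    'atari': dict(lrschedule='constant'),
}

# Registration stores the dict alongside the learn function, roughly:
registry_entry = dict(fn=None, supports_vecenv=True, defaults=defaults)

# run.py then reduces per-env-type defaults to a dict lookup with a fallback:
print(registry_entry['defaults'].get('atari', {}))   # {'lrschedule': 'constant'}
print(registry_entry['defaults'].get('mujoco', {}))  # {}
```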


@@ -2,7 +2,7 @@ import os.path as osp
import time
import functools
import tensorflow as tf
from baselines import logger
from baselines import logger, registry
from baselines.common import set_global_seeds, explained_variance
from baselines.common.policies import build_policy
@@ -11,6 +11,7 @@ from baselines.common.tf_util import get_session, save_variables, load_variables
from baselines.a2c.runner import Runner
from baselines.a2c.utils import Scheduler, find_trainable_variables
from baselines.acktr import kfac
from baselines.acktr.defaults import defaults
class Model(object):
@@ -21,16 +22,16 @@ class Model(object):
self.sess = sess = get_session()
nbatch = nenvs * nsteps
with tf.variable_scope('acktr_model', reuse=tf.AUTO_REUSE):
self.model = step_model = policy(nenvs, 1, sess=sess)
self.model2 = train_model = policy(nenvs*nsteps, nsteps, sess=sess)
A = train_model.pdtype.sample_placeholder([None])
A = tf.placeholder(ac_space.dtype, [nbatch,] + list(ac_space.shape))
ADV = tf.placeholder(tf.float32, [nbatch])
R = tf.placeholder(tf.float32, [nbatch])
PG_LR = tf.placeholder(tf.float32, [])
VF_LR = tf.placeholder(tf.float32, [])
with tf.variable_scope('acktr_model', reuse=tf.AUTO_REUSE):
self.model = step_model = policy(nenvs, 1, sess=sess)
self.model2 = train_model = policy(nenvs*nsteps, nsteps, sess=sess)
neglogpac = train_model.pd.neglogp(A)
self.logits = train_model.pi
@@ -90,6 +91,7 @@ class Model(object):
self.initial_state = step_model.initial_state
tf.global_variables_initializer().run(session=sess)
@registry.register('acktr', defaults=defaults)
def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20,
ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
kfac_clip=0.001, save_interval=None, lrschedule='linear', load_path=None, is_async=True, **network_kwargs):


@@ -1,5 +1,6 @@
def mujoco():
return dict(
defaults = {
'mujoco' : dict(
nsteps=2500,
value_network='copy'
)
}


@@ -16,11 +16,13 @@ from baselines.common import set_global_seeds
from baselines.common.atari_wrappers import make_atari, wrap_deepmind
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
from baselines.common import retro_wrappers
def make_vec_env(env_id, env_type, num_env, seed, wrapper_kwargs=None, start_index=0, reward_scale=1.0, gamestate=None):
def make_vec_env(env_id, env_type, num_env, seed, wrapper_kwargs=None, start_index=0, reward_scale=1.0, gamestate=None, frame_stack_size=1):
"""
Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo.
Create a wrapped, monitored SubprocVecEnv
"""
if wrapper_kwargs is None: wrapper_kwargs = {}
mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
@@ -38,9 +40,15 @@ def make_vec_env(env_id, env_type, num_env, seed, wrapper_kwargs=None, start_ind
set_global_seeds(seed)
if num_env > 1:
return SubprocVecEnv([make_thunk(i + start_index) for i in range(num_env)])
venv = SubprocVecEnv([make_thunk(i + start_index) for i in range(num_env)])
else:
return DummyVecEnv([make_thunk(start_index)])
venv = DummyVecEnv([make_thunk(start_index)])
if frame_stack_size > 1:
venv = VecFrameStack(venv, frame_stack_size)
return venv
def make_env(env_id, env_type, subrank=0, seed=None, reward_scale=1.0, gamestate=None, wrapper_kwargs={}):
@@ -60,14 +68,12 @@ def make_env(env_id, env_type, subrank=0, seed=None, reward_scale=1.0, gamestate
allow_early_resets=True)
if env_type == 'atari':
env = wrap_deepmind(env, **wrapper_kwargs)
elif env_type == 'retro':
env = retro_wrappers.wrap_deepmind_retro(env, **wrapper_kwargs)
return wrap_deepmind(env, **wrapper_kwargs)
elif reward_scale != 1:
return retro_wrappers.RewardScaler(env, reward_scale)
else:
return env
if reward_scale != 1:
env = retro_wrappers.RewardScaler(env, reward_scale)
return env
def make_mujoco_env(env_id, seed, reward_scale=1.0):
@@ -131,8 +137,6 @@ def common_arg_parser():
parser.add_argument('--num_env', help='Number of environment copies being run in parallel. When not specified, set to number of cpus for Atari, and to 1 for Mujoco', default=None, type=int)
parser.add_argument('--reward_scale', help='Reward scale factor. Default: 1.0', default=1.0, type=float)
parser.add_argument('--save_path', help='Path to save trained model to', default=None, type=str)
parser.add_argument('--save_video_interval', help='Save video every x steps (0 = disabled)', default=0, type=int)
parser.add_argument('--save_video_length', help='Length of recorded video. Default: 200', default=200, type=int)
parser.add_argument('--play', default=False, action='store_true')
return parser
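Frame stacking now lives inside `make_vec_env` via `VecFrameStack` instead of being applied by every caller. A hypothetical usage sketch (env id and the expected shape assume the default deepmind Atari wrappers):

```python
# Hypothetical usage of the extended make_vec_env: frame stacking is applied
# inside the helper rather than by each call site.
from baselines.common.cmd_util import make_vec_env

venv = make_vec_env('PongNoFrameskip-v4', 'atari', num_env=8, seed=0,
                    frame_stack_size=4)
obs = venv.reset()
print(obs.shape)  # with the default deepmind wrappers this should be (8, 84, 84, 4)
```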


@@ -39,7 +39,7 @@ class PdType(object):
raise NotImplementedError
def pdfromflat(self, flat):
return self.pdclass()(flat)
def pdfromlatent(self, latent_vector, init_scale, init_bias):
def pdfromlatent(self, latent_vector):
raise NotImplementedError
def param_shape(self):
raise NotImplementedError
@@ -80,11 +80,6 @@ class MultiCategoricalPdType(PdType):
return MultiCategoricalPd
def pdfromflat(self, flat):
return MultiCategoricalPd(self.ncats, flat)
def pdfromlatent(self, latent, init_scale=1.0, init_bias=0.0):
pdparam = fc(latent, 'pi', self.ncats.sum(), init_scale=init_scale, init_bias=init_bias)
return self.pdfromflat(pdparam), pdparam
def param_shape(self):
return [sum(self.ncats)]
def sample_shape(self):


@@ -1,6 +1,5 @@
import numpy as np
import tensorflow as tf
from gym.spaces import Discrete, Box, MultiDiscrete
from gym.spaces import Discrete, Box
def observation_placeholder(ob_space, batch_size=None, name='Ob'):
'''
@@ -21,14 +20,10 @@ def observation_placeholder(ob_space, batch_size=None, name='Ob'):
tensorflow placeholder tensor
'''
assert isinstance(ob_space, Discrete) or isinstance(ob_space, Box) or isinstance(ob_space, MultiDiscrete), \
assert isinstance(ob_space, Discrete) or isinstance(ob_space, Box), \
'Can only deal with Discrete and Box observation spaces for now'
dtype = ob_space.dtype
if dtype == np.int8:
dtype = np.uint8
return tf.placeholder(shape=(batch_size,) + ob_space.shape, dtype=dtype, name=name)
return tf.placeholder(shape=(batch_size,) + ob_space.shape, dtype=ob_space.dtype, name=name)
def observation_input(ob_space, batch_size=None, name='Ob'):
@@ -53,12 +48,9 @@ def encode_observation(ob_space, placeholder):
'''
if isinstance(ob_space, Discrete):
return tf.to_float(tf.one_hot(placeholder, ob_space.n))
elif isinstance(ob_space, Box):
return tf.to_float(placeholder)
elif isinstance(ob_space, MultiDiscrete):
placeholder = tf.cast(placeholder, tf.int32)
one_hots = [tf.to_float(tf.one_hot(placeholder[..., i], ob_space.nvec[i])) for i in range(placeholder.shape[-1])]
return tf.concat(one_hots, axis=-1)
else:
raise NotImplementedError


@@ -1,11 +1,7 @@
from mpi4py import MPI
import baselines.common.tf_util as U
import tensorflow as tf
import numpy as np
try:
from mpi4py import MPI
except ImportError:
MPI = None
class MpiAdam(object):
def __init__(self, var_list, *, beta1=0.9, beta2=0.999, epsilon=1e-08, scale_grad_by_procs=True, comm=None):
@@ -20,19 +16,16 @@ class MpiAdam(object):
self.t = 0
self.setfromflat = U.SetFromFlat(var_list)
self.getflat = U.GetFlat(var_list)
self.comm = MPI.COMM_WORLD if comm is None and MPI is not None else comm
self.comm = MPI.COMM_WORLD if comm is None else comm
def update(self, localg, stepsize):
if self.t % 100 == 0:
self.check_synced()
localg = localg.astype('float32')
if self.comm is not None:
globalg = np.zeros_like(localg)
self.comm.Allreduce(localg, globalg, op=MPI.SUM)
if self.scale_grad_by_procs:
globalg /= self.comm.Get_size()
else:
globalg = np.copy(localg)
globalg = np.zeros_like(localg)
self.comm.Allreduce(localg, globalg, op=MPI.SUM)
if self.scale_grad_by_procs:
globalg /= self.comm.Get_size()
self.t += 1
a = stepsize * np.sqrt(1 - self.beta2**self.t)/(1 - self.beta1**self.t)
@@ -42,15 +35,11 @@ class MpiAdam(object):
self.setfromflat(self.getflat() + step)
def sync(self):
if self.comm is None:
return
theta = self.getflat()
self.comm.Bcast(theta, root=0)
self.setfromflat(theta)
def check_synced(self):
if self.comm is None:
return
if self.comm.Get_rank() == 0: # this is root
theta = self.getflat()
self.comm.Bcast(theta, root=0)
@@ -74,30 +63,17 @@ def test_MpiAdam():
do_update = U.function([], loss, updates=[update_op])
tf.get_default_session().run(tf.global_variables_initializer())
losslist_ref = []
for i in range(10):
l = do_update()
print(i, l)
losslist_ref.append(l)
print(i,do_update())
tf.set_random_seed(0)
tf.get_default_session().run(tf.global_variables_initializer())
var_list = [a,b]
lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)])
lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)], updates=[update_op])
adam = MpiAdam(var_list)
losslist_test = []
for i in range(10):
l,g = lossandgrad()
adam.update(g, stepsize)
print(i,l)
losslist_test.append(l)
np.testing.assert_allclose(np.array(losslist_ref), np.array(losslist_test), atol=1e-4)
if __name__ == '__main__':
test_MpiAdam()


@@ -1,8 +1,4 @@
try:
from mpi4py import MPI
except ImportError:
MPI = None
from mpi4py import MPI
import tensorflow as tf, baselines.common.tf_util as U, numpy as np
class RunningMeanStd(object):
@@ -43,8 +39,7 @@ class RunningMeanStd(object):
n = int(np.prod(self.shape))
totalvec = np.zeros(n*2+1, 'float64')
addvec = np.concatenate([x.sum(axis=0).ravel(), np.square(x).sum(axis=0).ravel(), np.array([len(x)],dtype='float64')])
if MPI is not None:
MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM)
MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM)
self.incfiltparams(totalvec[0:n].reshape(self.shape), totalvec[n:2*n].reshape(self.shape), totalvec[2*n])
@U.in_session


@@ -1,401 +0,0 @@
import matplotlib.pyplot as plt
import os.path as osp
import json
import os
import numpy as np
import pandas
from collections import defaultdict, namedtuple
from baselines.bench import monitor
from baselines.logger import read_json, read_csv
def smooth(y, radius, mode='two_sided', valid_only=False):
'''
Smooth signal y, where radius determines the size of the window
mode='two_sided':
average over the window [max(index - radius, 0), min(index + radius, len(y)-1)]
mode='causal':
average over the window [max(index - radius, 0), index]
valid_only: put nan in entries where the full-sized window is not available
'''
assert mode in ('two_sided', 'causal')
if len(y) < 2*radius+1:
return np.ones_like(y) * y.mean()
elif mode == 'two_sided':
convkernel = np.ones(2 * radius+1)
out = np.convolve(y, convkernel,mode='same') / np.convolve(np.ones_like(y), convkernel, mode='same')
if valid_only:
out[:radius] = out[-radius:] = np.nan
elif mode == 'causal':
convkernel = np.ones(radius)
out = np.convolve(y, convkernel,mode='full') / np.convolve(np.ones_like(y), convkernel, mode='full')
out = out[:-radius+1]
if valid_only:
out[:radius] = np.nan
return out
def one_sided_ema(xolds, yolds, low=None, high=None, n=512, decay_steps=1., low_counts_threshold=1e-8):
'''
perform one-sided (causal) EMA (exponential moving average)
smoothing and resampling to an even grid with n points.
Does not do extrapolation, so we assume
xolds[0] <= low && high <= xolds[-1]
Arguments:
xolds: array or list - x values of data. Needs to be sorted in ascending order
yolds: array or list - y values of data. Has to have the same length as xolds
low: float - min value of the new x grid. By default equals to xolds[0]
high: float - max value of the new x grid. By default equals to xolds[-1]
n: int - number of points in new x grid
decay_steps: float - EMA decay factor, expressed in new x grid steps.
low_counts_threshold: float or int
- y values with counts less than this value will be set to NaN
Returns:
tuple (xs, ys, count_ys) where
xs - array with new x grid
ys - array of EMA of y at each point of the new x grid
count_ys - array of EMA of y counts at each point of the new x grid
'''
low = xolds[0] if low is None else low
high = xolds[-1] if high is None else high
assert xolds[0] <= low, 'low = {} < xolds[0] = {} - extrapolation not permitted!'.format(low, xolds[0])
assert xolds[-1] >= high, 'high = {} > xolds[-1] = {} - extrapolation not permitted!'.format(high, xolds[-1])
assert len(xolds) == len(yolds), 'length of xolds ({}) and yolds ({}) do not match!'.format(len(xolds), len(yolds))
xolds = xolds.astype('float64')
yolds = yolds.astype('float64')
luoi = 0 # last unused old index
sum_y = 0.
count_y = 0.
xnews = np.linspace(low, high, n)
decay_period = (high - low) / (n - 1) * decay_steps
interstep_decay = np.exp(- 1. / decay_steps)
sum_ys = np.zeros_like(xnews)
count_ys = np.zeros_like(xnews)
for i in range(n):
xnew = xnews[i]
sum_y *= interstep_decay
count_y *= interstep_decay
while True:
xold = xolds[luoi]
if xold <= xnew:
decay = np.exp(- (xnew - xold) / decay_period)
sum_y += decay * yolds[luoi]
count_y += decay
luoi += 1
else:
break
if luoi >= len(xolds):
break
sum_ys[i] = sum_y
count_ys[i] = count_y
ys = sum_ys / count_ys
ys[count_ys < low_counts_threshold] = np.nan
return xnews, ys, count_ys
def symmetric_ema(xolds, yolds, low=None, high=None, n=512, decay_steps=1., low_counts_threshold=1e-8):
'''
perform symmetric EMA (exponential moving average)
smoothing and resampling to an even grid with n points.
Does not do extrapolation, so we assume
xolds[0] <= low && high <= xolds[-1]
Arguments:
xolds: array or list - x values of data. Needs to be sorted in ascending order
yolds: array or list - y values of data. Has to have the same length as xolds
low: float - min value of the new x grid. By default equals to xolds[0]
high: float - max value of the new x grid. By default equals to xolds[-1]
n: int - number of points in new x grid
decay_steps: float - EMA decay factor, expressed in new x grid steps.
low_counts_threshold: float or int
- y values with counts less than this value will be set to NaN
Returns:
tuple (xs, ys, count_ys) where
xs - array with new x grid
ys - array of EMA of y at each point of the new x grid
count_ys - array of EMA of y counts at each point of the new x grid
'''
xs, ys1, count_ys1 = one_sided_ema(xolds, yolds, low, high, n, decay_steps, low_counts_threshold=0)
_, ys2, count_ys2 = one_sided_ema(-xolds[::-1], yolds[::-1], -high, -low, n, decay_steps, low_counts_threshold=0)
ys2 = ys2[::-1]
count_ys2 = count_ys2[::-1]
count_ys = count_ys1 + count_ys2
ys = (ys1 * count_ys1 + ys2 * count_ys2) / count_ys
ys[count_ys < low_counts_threshold] = np.nan
return xs, ys, count_ys
Result = namedtuple('Result', 'monitor progress dirname metadata')
Result.__new__.__defaults__ = (None,) * len(Result._fields)
def load_results(root_dir_or_dirs, enable_progress=True, enable_monitor=True, verbose=False):
'''
load summaries of runs from a list of directories (including subdirectories)
Arguments:
enable_progress: bool - if True, will attempt to load data from progress.csv files (data saved by logger). Default: True
enable_monitor: bool - if True, will attempt to load data from monitor.csv files (data saved by Monitor environment wrapper). Default: True
verbose: bool - if True, will print out list of directories from which the data is loaded. Default: False
Returns:
List of Result objects with the following fields:
- dirname - path to the directory data was loaded from
- metadata - run metadata (such as command-line arguments and anything else in metadata.json file
- monitor - if enable_monitor is True, this field contains pandas dataframe with loaded monitor.csv file (or aggregate of all *.monitor.csv files in the directory)
- progress - if enable_progress is True, this field contains pandas dataframe with loaded progress.csv file
'''
if isinstance(root_dir_or_dirs, str):
rootdirs = [osp.expanduser(root_dir_or_dirs)]
else:
rootdirs = [osp.expanduser(d) for d in root_dir_or_dirs]
allresults = []
for rootdir in rootdirs:
assert osp.exists(rootdir), "%s doesn't exist"%rootdir
for dirname, dirs, files in os.walk(rootdir):
if '-proc' in dirname:
files[:] = []
continue
if set(['metadata.json', 'monitor.json', 'monitor.csv', 'progress.json', 'progress.csv']).intersection(files):
# used to be uncommented, which means do not go deeper than current directory if any of the data files
# are found
# dirs[:] = []
result = {'dirname' : dirname}
if "metadata.json" in files:
with open(osp.join(dirname, "metadata.json"), "r") as fh:
result['metadata'] = json.load(fh)
progjson = osp.join(dirname, "progress.json")
progcsv = osp.join(dirname, "progress.csv")
if enable_progress:
if osp.exists(progjson):
result['progress'] = pandas.DataFrame(read_json(progjson))
elif osp.exists(progcsv):
try:
result['progress'] = read_csv(progcsv)
except pandas.errors.EmptyDataError:
print('skipping progress file in ', dirname, 'empty data')
else:
if verbose: print('skipping %s: no progress file'%dirname)
if enable_monitor:
try:
result['monitor'] = pandas.DataFrame(monitor.load_results(dirname))
except monitor.LoadMonitorResultsError:
print('skipping %s: no monitor files'%dirname)
except Exception as e:
print('exception loading monitor file in %s: %s'%(dirname, e))
if result.get('monitor') is not None or result.get('progress') is not None:
allresults.append(Result(**result))
if verbose:
print('successfully loaded %s'%dirname)
if verbose: print('loaded %i results'%len(allresults))
return allresults
COLORS = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink',
'brown', 'orange', 'teal', 'lightblue', 'lime', 'lavender', 'turquoise',
'darkgreen', 'tan', 'salmon', 'gold', 'darkred', 'darkblue']
def default_xy_fn(r):
x = np.cumsum(r.monitor.l)
y = smooth(r.monitor.r, radius=10)
return x,y
def default_split_fn(r):
import re
# match name between slash and -<digits> at the end of the string
# (slash in the beginning or -<digits> in the end or either may be missing)
match = re.search(r'[^/-]+(?=(-\d+)?\Z)', r.dirname)
if match:
return match.group(0)
def plot_results(
allresults, *,
xy_fn=default_xy_fn,
split_fn=default_split_fn,
group_fn=default_split_fn,
average_group=False,
shaded_std=True,
shaded_err=True,
figsize=None,
legend_outside=False,
resample=0,
smooth_step=1.0,
):
'''
Plot multiple Results objects
xy_fn: function Result -> x,y - function that converts results objects into tuple of x and y values.
By default, x is cumsum of episode lengths, and y is episode rewards
split_fn: function Result -> hashable - function that converts results objects into keys to split curves into sub-panels by.
That is, the results r for which split_fn(r) is different will be put on different sub-panels.
By default, the portion of r.dirname between last / and -<digits> is returned. The sub-panels are
stacked vertically in the figure.
group_fn: function Result -> hashable - function that converts results objects into keys to group curves by.
That is, the results r for which group_fn(r) is the same will be put into the same group.
Curves in the same group have the same color (if average_group is False), or averaged over
(if average_group is True). The default value is the same as default value for split_fn
average_group: bool - if True, will average the curves in the same group and plot the mean. Enables resampling
(if resample = 0, will use 512 steps)
shaded_std: bool - if True (default), the shaded region corresponding to standard deviation of the group of curves will be
shown (only applicable if average_group = True)
shaded_err: bool - if True (default), the shaded region corresponding to error in mean estimate of the group of curves
(that is, standard deviation divided by square root of number of curves) will be
shown (only applicable if average_group = True)
figsize: tuple or None - size of the resulting figure (including sub-panels). By default, width is 6 and height is 6 times number of
sub-panels.
legend_outside: bool - if True, will place the legend outside of the sub-panels.
resample: int - if not zero, size of the uniform grid in x direction to resample onto. Resampling is performed via symmetric
EMA smoothing (see the docstring for symmetric_ema).
Default is zero (no resampling). Note that if average_group is True, resampling is necessary; in that case, default
value is 512.
smooth_step: float - when resampling (i.e. when resample > 0 or average_group is True), use this EMA decay parameter (in units of the new grid step).
See docstrings for decay_steps in symmetric_ema or one_sided_ema functions.
'''
if split_fn is None: split_fn = lambda _ : ''
if group_fn is None: group_fn = lambda _ : ''
sk2r = defaultdict(list) # splitkey2results
for result in allresults:
splitkey = split_fn(result)
sk2r[splitkey].append(result)
assert len(sk2r) > 0
assert isinstance(resample, int), "0: don't resample. <integer>: that many samples"
nrows = len(sk2r)
ncols = 1
figsize = figsize or (6, 6 * nrows)
f, axarr = plt.subplots(nrows, ncols, sharex=False, squeeze=False, figsize=figsize)
groups = list(set(group_fn(result) for result in allresults))
default_samples = 512
if average_group:
resample = resample or default_samples
for (isplit, sk) in enumerate(sorted(sk2r.keys())):
g2l = {}
g2c = defaultdict(int)
sresults = sk2r[sk]
gresults = defaultdict(list)
ax = axarr[isplit][0]
for result in sresults:
group = group_fn(result)
g2c[group] += 1
x, y = xy_fn(result)
if x is None: x = np.arange(len(y))
x, y = map(np.asarray, (x, y))
if average_group:
gresults[group].append((x,y))
else:
if resample:
x, y, counts = symmetric_ema(x, y, x[0], x[-1], resample, decay_steps=smooth_step)
l, = ax.plot(x, y, color=COLORS[groups.index(group) % len(COLORS)])
g2l[group] = l
if average_group:
for group in sorted(groups):
xys = gresults[group]
if not any(xys):
continue
color = COLORS[groups.index(group) % len(COLORS)]
origxs = [xy[0] for xy in xys]
minxlen = min(map(len, origxs))
def allequal(qs):
return all((q==qs[0]).all() for q in qs[1:])
if resample:
low = max(x[0] for x in origxs)
high = min(x[-1] for x in origxs)
usex = np.linspace(low, high, resample)
ys = []
for (x, y) in xys:
ys.append(symmetric_ema(x, y, low, high, resample, decay_steps=smooth_step)[1])
else:
assert allequal([x[:minxlen] for x in origxs]),\
'If you want to average unevenly sampled data, set resample=<number of samples you want>'
usex = origxs[0]
ys = [xy[1][:minxlen] for xy in xys]
ymean = np.mean(ys, axis=0)
ystd = np.std(ys, axis=0)
ystderr = ystd / np.sqrt(len(ys))
l, = axarr[isplit][0].plot(usex, ymean, color=color)
g2l[group] = l
if shaded_err:
ax.fill_between(usex, ymean - ystderr, ymean + ystderr, color=color, alpha=.4)
if shaded_std:
ax.fill_between(usex, ymean - ystd, ymean + ystd, color=color, alpha=.2)
# https://matplotlib.org/users/legend_guide.html
plt.tight_layout()
if any(g2l.keys()):
ax.legend(
g2l.values(),
['%s (%i)'%(g, g2c[g]) for g in g2l] if average_group else g2l.keys(),
loc=2 if legend_outside else None,
bbox_to_anchor=(1,1) if legend_outside else None)
ax.set_title(sk)
return f, axarr
def regression_analysis(df):
xcols = list(df.columns.copy())
xcols.remove('score')
ycols = ['score']
import statsmodels.api as sm
mod = sm.OLS(df[ycols], sm.add_constant(df[xcols]), hasconst=False)
res = mod.fit()
print(res.summary())
def test_smooth():
norig = 100
nup = 300
ndown = 30
xs = np.cumsum(np.random.rand(norig) * 10 / norig)
yclean = np.sin(xs)
ys = yclean + .1 * np.random.randn(yclean.size)
xup, yup, _ = symmetric_ema(xs, ys, xs.min(), xs.max(), nup, decay_steps=nup/ndown)
xdown, ydown, _ = symmetric_ema(xs, ys, xs.min(), xs.max(), ndown, decay_steps=ndown/ndown)
xsame, ysame, _ = symmetric_ema(xs, ys, xs.min(), xs.max(), norig, decay_steps=norig/ndown)
plt.plot(xs, ys, label='orig', marker='x')
plt.plot(xup, yup, label='up', marker='x')
plt.plot(xdown, ydown, label='down', marker='x')
plt.plot(xsame, ysame, label='same', marker='x')
plt.plot(xs, yclean, label='clean', marker='x')
plt.legend()
plt.show()


@@ -1,7 +1,7 @@
import numpy as np
from abc import abstractmethod
from gym import Env
from gym.spaces import MultiDiscrete, Discrete, Box
from gym.spaces import Discrete, Box
class IdentityEnv(Env):
@@ -53,19 +53,6 @@ class DiscreteIdentityEnv(IdentityEnv):
def _get_reward(self, actions):
return 1 if self.state == actions else 0
class MultiDiscreteIdentityEnv(IdentityEnv):
def __init__(
self,
dims,
episode_len=None,
):
self.action_space = MultiDiscrete(dims)
super().__init__(episode_len=episode_len)
def _get_reward(self, actions):
return 1 if all(self.state == actions) else 0
class BoxIdentityEnv(IdentityEnv):
def __init__(


@@ -1,5 +1,5 @@
import pytest
from baselines.common.tests.envs.identity_env import DiscreteIdentityEnv, BoxIdentityEnv, MultiDiscreteIdentityEnv
from baselines.common.tests.envs.identity_env import DiscreteIdentityEnv, BoxIdentityEnv
from baselines.run import get_learn_function
from baselines.common.tests.util import simple_test
@@ -21,7 +21,6 @@ learn_kwargs = {
algos_disc = ['a2c', 'acktr', 'deepq', 'ppo2', 'trpo_mpi']
algos_multidisc = ['a2c', 'acktr', 'ppo2', 'trpo_mpi']
algos_cont = ['a2c', 'acktr', 'ddpg', 'ppo2', 'trpo_mpi']
@pytest.mark.slow
@@ -39,21 +38,6 @@ def test_discrete_identity(alg):
env_fn = lambda: DiscreteIdentityEnv(10, episode_len=100)
simple_test(env_fn, learn_fn, 0.9)
@pytest.mark.slow
@pytest.mark.parametrize("alg", algos_multidisc)
def test_multidiscrete_identity(alg):
'''
Test if the algorithm (with an mlp policy)
can learn an identity transformation (i.e. return observation as an action)
'''
kwargs = learn_kwargs[alg]
kwargs.update(common_kwargs)
learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs)
env_fn = lambda: MultiDiscreteIdentityEnv((3,3), episode_len=100)
simple_test(env_fn, learn_fn, 0.9)
@pytest.mark.slow
@pytest.mark.parametrize("alg", algos_cont)
def test_continuous_identity(alg):
@@ -71,5 +55,5 @@ def test_continuous_identity(alg):
simple_test(env_fn, learn_fn, -0.1)
if __name__ == '__main__':
test_multidiscrete_identity('acktr')
test_continuous_identity('ddpg')


@@ -32,11 +32,6 @@ class VecEnv(ABC):
"""
closed = False
viewer = None
metadata = {
'render.modes': ['human', 'rgb_array']
}
def __init__(self, num_envs, observation_space, action_space):
self.num_envs = num_envs
self.observation_space = observation_space


@@ -20,8 +20,8 @@ class DummyVecEnv(VecEnv):
env = self.envs[0]
VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space)
obs_space = env.observation_space
self.keys, shapes, dtypes = obs_space_info(obs_space)
self.keys, shapes, dtypes = obs_space_info(obs_space)
self.buf_obs = { k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys }
self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool)
self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32)
@@ -76,6 +76,6 @@ class DummyVecEnv(VecEnv):
def render(self, mode='human'):
if self.num_envs == 1:
return self.envs[0].render(mode=mode)
self.envs[0].render(mode=mode)
else:
return super().render(mode=mode)
super().render(mode=mode)


@@ -1,49 +0,0 @@
"""
Tests for asynchronous vectorized environments.
"""
import gym
import pytest
import os
import glob
import tempfile
from .dummy_vec_env import DummyVecEnv
from .shmem_vec_env import ShmemVecEnv
from .subproc_vec_env import SubprocVecEnv
from .vec_video_recorder import VecVideoRecorder
@pytest.mark.parametrize('klass', (DummyVecEnv, ShmemVecEnv, SubprocVecEnv))
@pytest.mark.parametrize('num_envs', (1, 4))
@pytest.mark.parametrize('video_length', (10, 100))
@pytest.mark.parametrize('video_interval', (1, 50))
def test_video_recorder(klass, num_envs, video_length, video_interval):
"""
Wrap an existing VecEnv with VecVideoRecorder,
take (video_interval + video_length + 1) steps,
then check that the recorded video files are present
"""
def make_fn():
env = gym.make('PongNoFrameskip-v4')
return env
fns = [make_fn for _ in range(num_envs)]
env = klass(fns)
with tempfile.TemporaryDirectory() as video_path:
env = VecVideoRecorder(env, video_path, record_video_trigger=lambda x: x % video_interval == 0, video_length=video_length)
env.reset()
for _ in range(video_interval + video_length + 1):
env.step([0] * num_envs)
env.close()
recorded_video = glob.glob(os.path.join(video_path, "*.mp4"))
# first and second step
assert len(recorded_video) == 2
# Files are not empty
assert all(os.stat(p).st_size != 0 for p in recorded_video)


@@ -1,89 +0,0 @@
import os
from baselines import logger
from baselines.common.vec_env import VecEnvWrapper
from gym.wrappers.monitoring import video_recorder
class VecVideoRecorder(VecEnvWrapper):
"""
Wrap VecEnv to record rendered image as mp4 video.
"""
def __init__(self, venv, directory, record_video_trigger, video_length=200):
"""
# Arguments
venv: VecEnv to wrap
directory: Where to save videos
record_video_trigger:
Function that defines when to start recording.
The function takes the current step number,
and returns whether we should start recording or not.
video_length: Length of recorded video
"""
VecEnvWrapper.__init__(self, venv)
self.record_video_trigger = record_video_trigger
self.video_recorder = None
self.directory = os.path.abspath(directory)
if not os.path.exists(self.directory): os.mkdir(self.directory)
self.file_prefix = "vecenv"
self.file_infix = '{}'.format(os.getpid())
self.step_id = 0
self.video_length = video_length
self.recording = False
self.recorded_frames = 0
def reset(self):
obs = self.venv.reset()
self.start_video_recorder()
return obs
def start_video_recorder(self):
self.close_video_recorder()
base_path = os.path.join(self.directory, '{}.video.{}.video{:06}'.format(self.file_prefix, self.file_infix, self.step_id))
self.video_recorder = video_recorder.VideoRecorder(
env=self.venv,
base_path=base_path,
metadata={'step_id': self.step_id}
)
self.video_recorder.capture_frame()
self.recorded_frames = 1
self.recording = True
def _video_enabled(self):
return self.record_video_trigger(self.step_id)
def step_wait(self):
obs, rews, dones, infos = self.venv.step_wait()
self.step_id += 1
if self.recording:
self.video_recorder.capture_frame()
self.recorded_frames += 1
if self.recorded_frames > self.video_length:
logger.info("Saving video to ", self.video_recorder.path)
self.close_video_recorder()
elif self._video_enabled():
self.start_video_recorder()
return obs, rews, dones, infos
def close_video_recorder(self):
if self.recording:
self.video_recorder.close()
self.recording = False
self.recorded_frames = 0
def close(self):
VecEnvWrapper.close(self)
self.close_video_recorder()
def __del__(self):
self.close()


@@ -7,17 +7,14 @@ from baselines.ddpg.ddpg_learner import DDPG
from baselines.ddpg.models import Actor, Critic
from baselines.ddpg.memory import Memory
from baselines.ddpg.noise import AdaptiveParamNoiseSpec, NormalActionNoise, OrnsteinUhlenbeckActionNoise
from baselines.common import set_global_seeds
import baselines.common.tf_util as U
from baselines import logger
from baselines import logger, registry
import numpy as np
from mpi4py import MPI
try:
from mpi4py import MPI
except ImportError:
MPI = None
@registry.register('ddpg')
def learn(network, env,
seed=None,
total_timesteps=None,
@@ -44,7 +41,6 @@ def learn(network, env,
param_noise_adaption_interval=50,
**network_kwargs):
set_global_seeds(seed)
if total_timesteps is not None:
assert nb_epochs is None
@@ -52,11 +48,7 @@ def learn(network, env,
else:
nb_epochs = 500
if MPI is not None:
rank = MPI.COMM_WORLD.Get_rank()
else:
rank = 0
rank = MPI.COMM_WORLD.Get_rank()
nb_actions = env.action_space.shape[-1]
assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions.
@@ -66,6 +58,7 @@ def learn(network, env,
action_noise = None
param_noise = None
nb_actions = env.action_space.shape[-1]
if noise_type is not None:
for current_noise_type in noise_type.split(','):
current_noise_type = current_noise_type.strip()
@@ -206,11 +199,7 @@ def learn(network, env,
eval_episode_rewards_history.append(eval_episode_reward[d])
eval_episode_reward[d] = 0.0
if MPI is not None:
mpi_size = MPI.COMM_WORLD.Get_size()
else:
mpi_size = 1
mpi_size = MPI.COMM_WORLD.Get_size()
# Log stats.
# XXX shouldn't call np.mean on variable length lists
duration = time.time() - start_time
@@ -244,10 +233,7 @@ def learn(network, env,
else:
raise ValueError('expected scalar, got %s'%x)
combined_stats_sums = np.array([ np.array(x).flatten()[0] for x in combined_stats.values()])
if MPI is not None:
combined_stats_sums = MPI.COMM_WORLD.allreduce(combined_stats_sums)
combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([ np.array(x).flatten()[0] for x in combined_stats.values()]))
combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)}
# Total statistics.


@@ -9,10 +9,7 @@ from baselines import logger
from baselines.common.mpi_adam import MpiAdam
import baselines.common.tf_util as U
from baselines.common.mpi_running_mean_std import RunningMeanStd
try:
from mpi4py import MPI
except ImportError:
MPI = None
from mpi4py import MPI
def normalize(x, stats):
if stats is None:
@@ -271,7 +268,7 @@ class DDPG(object):
if self.action_noise is not None and apply_noise:
noise = self.action_noise()
assert noise.shape == action[0].shape
assert noise.shape == action.shape
action += noise
action = np.clip(action, self.action_range[0], self.action_range[1])
@@ -361,11 +358,6 @@ class DDPG(object):
return stats
def adapt_param_noise(self):
try:
from mpi4py import MPI
except ImportError:
MPI = None
if self.param_noise is None:
return 0.
@@ -379,16 +371,7 @@ class DDPG(object):
self.param_noise_stddev: self.param_noise.current_stddev,
})
if MPI is not None:
mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
else:
mean_distance = distance
if MPI is not None:
mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
else:
mean_distance = distance
mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
self.param_noise.adapt(mean_distance)
return mean_distance


@@ -8,7 +8,7 @@ import numpy as np
import baselines.common.tf_util as U
from baselines.common.tf_util import load_variables, save_variables
from baselines import logger
from baselines import logger, registry
from baselines.common.schedules import LinearSchedule
from baselines.common import set_global_seeds
@@ -18,6 +18,7 @@ from baselines.deepq.utils import ObservationInput
from baselines.common.tf_util import get_session
from baselines.deepq.models import build_q_func
from baselines.deepq.defaults import defaults
class ActWrapper(object):
@@ -92,6 +93,7 @@ def load_act(path):
return ActWrapper.load_act(path)
@registry.register('deepq', supports_vecenv=False, defaults=defaults)
def learn(env,
network,
seed=None,
@@ -169,8 +171,6 @@ def learn(env,
to 1.0. If set to None equals to total_timesteps.
prioritized_replay_eps: float
epsilon to add to the TD errors when updating priorities.
param_noise: bool
whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
callback: (locals, globals) -> None
function called at every steps with state of the algorithm.
If callback returns true training stops.


@@ -16,6 +16,8 @@ def atari():
dueling=True
)
def retro():
return atari()
defaults = {
'atari': atari(),
'retro': atari()
}


@@ -18,11 +18,11 @@ class TfInput(object):
"""Return the tf variable(s) representing the possibly postprocessed value
of placeholder(s).
"""
raise NotImplementedError
raise NotImplemented()
def make_feed_dict(data):
"""Given data input it to the placeholder(s)."""
raise NotImplementedError
raise NotImplemented()
class PlaceholderTfInput(TfInput):


@@ -1,5 +1,5 @@
def mujoco():
return dict(
defaults = {
'mujoco': dict(
nsteps=2048,
nminibatches=32,
lam=0.95,
@@ -10,16 +10,13 @@ def mujoco():
lr=lambda f: 3e-4 * f,
cliprange=0.2,
value_network='copy'
)
),
def atari():
return dict(
'atari': dict(
nsteps=128, nminibatches=4,
lam=0.95, gamma=0.99, noptepochs=4, log_interval=1,
ent_coef=.01,
lr=lambda f : f * 2.5e-4,
cliprange=lambda f : f * 0.1,
)
def retro():
return atari()
}


@@ -4,21 +4,18 @@ import functools
import numpy as np
import os.path as osp
import tensorflow as tf
from baselines import logger
from baselines import logger, registry
from collections import deque
from baselines.common import explained_variance, set_global_seeds
from baselines.common.policies import build_policy
from baselines.common.runners import AbstractEnvRunner
from baselines.common.tf_util import get_session, save_variables, load_variables
from baselines.common.mpi_adam_optimizer import MpiAdamOptimizer
try:
from baselines.common.mpi_adam_optimizer import MpiAdamOptimizer
from mpi4py import MPI
from baselines.common.mpi_util import sync_from_root
except ImportError:
MPI = None
from mpi4py import MPI
from baselines.common.tf_util import initialize
from baselines.common.mpi_util import sync_from_root
from baselines.ppo2.defaults import defaults
class Model(object):
"""
@@ -97,10 +94,7 @@ class Model(object):
# 1. Get the model parameters
params = tf.trainable_variables('ppo2_model')
# 2. Build our trainer
if MPI is not None:
trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5)
else:
trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5)
# 3. Calculate the gradients
grads_and_var = trainer.compute_gradients(loss, params)
grads, var = zip(*grads_and_var)
@@ -143,12 +137,10 @@ class Model(object):
self.save = functools.partial(save_variables, sess=sess)
self.load = functools.partial(load_variables, sess=sess)
if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
if MPI.COMM_WORLD.Get_rank() == 0:
initialize()
global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")
if MPI is not None:
sync_from_root(sess, global_variables) #pylint: disable=E1101
sync_from_root(sess, global_variables) #pylint: disable=E1101
class Runner(AbstractEnvRunner):
"""
@@ -227,6 +219,7 @@ def constfn(val):
return val
return f
@registry.register('ppo2', defaults=defaults)
def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4,
vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95,
log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
@@ -401,9 +394,9 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2
logger.logkv('time_elapsed', tnow - tfirststart)
for (lossval, lossname) in zip(lossvals, model.loss_names):
logger.logkv(lossname, lossval)
if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
if MPI.COMM_WORLD.Get_rank() == 0:
logger.dumpkvs()
if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and (MPI is None or MPI.COMM_WORLD.Get_rank() == 0):
if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and MPI.COMM_WORLD.Get_rank() == 0:
checkdir = osp.join(logger.get_dir(), 'checkpoints')
os.makedirs(checkdir, exist_ok=True)
savepath = osp.join(checkdir, '%.5i'%update)

baselines/registry.py (new file, 39 lines)

@@ -0,0 +1,39 @@
# Registry of algorithms that keeps track of each algorithm's supported environments
# and fine-grained defaults for different kinds of environments (atari, retro, mujoco, etc.)
#
# Example usage:
#
# from baselines import registry
#
# @registry.register('fancy_algorithm', supports_vecenv=False)
# def learn(env, network):
# return
#
# for algo_name, algo_entry in registry.registry.items():
# if not algo_entry['supports_vecenv']:
# print(f'{algo_name} does not support vecenvs')
# # should print "fancy_algorithm does not support vecenvs" (among other ones)
from baselines import logger
registry = {}
def register(name, supports_vecenv=True, defaults={}):
def get_fn_entrypoint(fn):
import inspect
return '.'.join([inspect.getmodule(fn).__name__, fn.__name__])
def _thunk(learn_fn):
old_entry = registry.get(name)
if old_entry is not None:
logger.warn('Re-registering learn function {} (old entrypoint {}, new entrypoint {}) '.format(
name, get_fn_entrypoint(old_entry['fn']), get_fn_entrypoint(learn_fn)))
registry[name] = dict(
fn = learn_fn,
supports_vecenv=supports_vecenv,
defaults=defaults,
)
return learn_fn
return _thunk
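A runnable sketch of the registry round trip, based directly on the module above:

```python
# Register a toy learn function, then read its entry back out of the registry.
from baselines import registry

@registry.register('fancy_algorithm', supports_vecenv=False,
                   defaults={'atari': dict(lr=1e-4)})
def learn(env, network):
    return None

entry = registry.registry['fancy_algorithm']
print(entry['supports_vecenv'])            # False
print(entry['defaults'].get('atari', {}))  # {'lr': 0.0001}
```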


@@ -3,17 +3,12 @@ import multiprocessing
import os.path as osp
import gym
from collections import defaultdict
import tensorflow as tf
import numpy as np
from baselines.common.vec_env.vec_video_recorder import VecVideoRecorder
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
from baselines.common.cmd_util import common_arg_parser, parse_unknown_args, make_vec_env, make_env
from baselines.common.tf_util import get_session
from baselines import logger
from importlib import import_module
from baselines.common.vec_env.vec_normalize import VecNormalize
from baselines.common.cmd_util import common_arg_parser, parse_unknown_args, make_vec_env, make_env
from baselines import logger
from baselines.registry import registry
try:
from mpi4py import MPI
@@ -63,8 +58,6 @@ def train(args, extra_args):
alg_kwargs.update(extra_args)
env = build_env(args)
if args.save_video_interval != 0:
env = VecVideoRecorder(env, osp.join(logger.Logger.CURRENT.dir, "videos"), record_video_trigger=lambda x: x % args.save_video_interval == 0, video_length=args.save_video_length)
if args.network:
alg_kwargs['network'] = args.network
@@ -92,28 +85,20 @@ def build_env(args):
seed = args.seed
env_type, env_id = get_env_type(args.env)
assert alg in registry, 'Unknown algorithm {}'.format(alg)
if env_type in {'atari', 'retro'}:
if alg == 'deepq':
env = make_env(env_id, env_type, seed=seed, wrapper_kwargs={'frame_stack': True})
elif alg == 'trpo_mpi':
env = make_env(env_id, env_type, seed=seed)
else:
frame_stack_size = 4
env = make_vec_env(env_id, env_type, nenv, seed, gamestate=args.gamestate, reward_scale=args.reward_scale)
env = VecFrameStack(env, frame_stack_size)
frame_stack_size = 4
else:
config = tf.ConfigProto(allow_soft_placement=True,
intra_op_parallelism_threads=1,
inter_op_parallelism_threads=1)
config.gpu_options.allow_growth = True
get_session(config=config)
frame_stack_size = 1
env = make_vec_env(env_id, env_type, args.num_env or 1, seed, reward_scale=args.reward_scale)
if registry[alg]['supports_vecenv']:
env = make_vec_env(env_id, env_type, nenv, seed, gamestate=args.gamestate, reward_scale=args.reward_scale, frame_stack_size=frame_stack_size)
else:
env = make_env(env_id, env_type, seed=seed, wrapper_kwargs={'frame_stack': frame_stack_size > 1})
if env_type == 'mujoco':
env = VecNormalize(env)
if env_type == 'mujoco' and registry[alg]['supports_vecenv']:
env = VecNormalize(env)
return env
@@ -134,35 +119,32 @@ def get_env_type(env_id):
def get_default_network(env_type):
if env_type in {'atari', 'retro'}:
if env_type == 'atari':
return 'cnn'
else:
return 'mlp'
def get_alg_module(alg, submodule=None):
submodule = submodule or alg
try:
# first try to import the alg module from baselines
alg_module = import_module('.'.join(['baselines', alg, submodule]))
except ImportError:
# then from rl_algs
alg_module = import_module('.'.join(['rl_' + 'algs', alg, submodule]))
import inspect
entry = registry.get(alg)
assert entry is not None, 'Unregistered algorithm {}'.format(alg)
module = inspect.getmodule(entry['fn']).__name__
if submodule is not None:
module = '.'.join([module, submodule])
return module
return alg_module
def get_learn_function(alg):
return get_alg_module(alg).learn
entry = registry.get(alg)
assert entry is not None, 'Unregistered algorithm {}'.format(alg)
return entry['fn']
def get_learn_function_defaults(alg, env_type):
try:
alg_defaults = get_alg_module(alg, 'defaults')
kwargs = getattr(alg_defaults, env_type)()
except (ImportError, AttributeError):
kwargs = {}
return kwargs
entry = registry.get(alg)
assert entry is not None, 'Unregistered algorithm {}'.format(alg)
return entry['defaults'].get(env_type, {})
def parse_cmdline_kwargs(args):
@@ -196,6 +178,7 @@ def main():
rank = MPI.COMM_WORLD.Get_rank()
model, env = train(args, extra_args)
env.close()
if args.save_path is not None and rank == 0:
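Condensed, the registry-driven `build_env` and lookup flow from this file reduces to the sketch below; names are taken from the diff, and the concrete values for `alg`, `env_type`, `env_id`, `nenv`, and `seed` are illustrative stand-ins for what run.py parses from the command line:

```python
# Condensed sketch of the new build_env flow. Importing from baselines runs
# __init__.py, which performs the algorithm registrations shown earlier.
from baselines.registry import registry
from baselines.common.cmd_util import make_vec_env, make_env

alg, env_type, env_id, nenv, seed = 'ppo2', 'atari', 'PongNoFrameskip-v4', 8, 0

entry = registry.get(alg)
assert entry is not None, 'Unregistered algorithm {}'.format(alg)

frame_stack_size = 4 if env_type in {'atari', 'retro'} else 1
if entry['supports_vecenv']:
    env = make_vec_env(env_id, env_type, nenv, seed,
                       frame_stack_size=frame_stack_size)
else:
    env = make_env(env_id, env_type, seed=seed,
                   wrapper_kwargs={'frame_stack': frame_stack_size > 1})

# Per-env-type hyperparameter defaults come straight from the registry entry:
alg_kwargs = entry['defaults'].get(env_type, {})
```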


@@ -28,3 +28,8 @@ def mujoco():
vf_stepsize=1e-3,
normalize_observations=True,
)
defaults = {
'atari': atari(),
'mujoco': mujoco(),
}


@@ -1,9 +1,10 @@
from baselines.common import explained_variance, zipsame, dataset
from baselines import logger
from baselines import logger, registry
import baselines.common.tf_util as U
import tensorflow as tf, numpy as np
import time
from baselines.common import colorize
from mpi4py import MPI
from collections import deque
from baselines.common import set_global_seeds
from baselines.common.mpi_adam import MpiAdam
@@ -12,10 +13,7 @@ from baselines.common.input import observation_placeholder
from baselines.common.policies import build_policy
from contextlib import contextmanager
try:
from mpi4py import MPI
except ImportError:
MPI = None
from baselines.trpo_mpi.defaults import defaults
def traj_segment_generator(pi, env, horizon, stochastic):
# Initialize state variables
@@ -86,6 +84,7 @@ def add_vtarg_and_adv(seg, gamma, lam):
gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
seg["tdlamret"] = seg["adv"] + seg["vpred"]
@registry.register('trpo_mpi', supports_vecenv=False, defaults=defaults)
def learn(*,
network,
env,
@@ -150,12 +149,9 @@ def learn(*,
'''
if MPI is not None:
nworkers = MPI.COMM_WORLD.Get_size()
rank = MPI.COMM_WORLD.Get_rank()
else:
nworkers = 1
rank = 0
nworkers = MPI.COMM_WORLD.Get_size()
rank = MPI.COMM_WORLD.Get_rank()
cpus_per_worker = 1
U.get_session(config=tf.ConfigProto(
@@ -244,13 +240,9 @@ def learn(*,
def allmean(x):
assert isinstance(x, np.ndarray)
if MPI is not None:
out = np.empty_like(x)
MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
out /= nworkers
else:
out = np.copy(x)
out = np.empty_like(x)
MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
out /= nworkers
return out
U.initialize()
@@ -258,9 +250,7 @@ def learn(*,
pi.load(load_path)
th_init = get_flat()
if MPI is not None:
MPI.COMM_WORLD.Bcast(th_init, root=0)
MPI.COMM_WORLD.Bcast(th_init, root=0)
set_from_flat(th_init)
vfadam.sync()
print("Init param sum", th_init.sum(), flush=True)
@@ -366,11 +356,7 @@ def learn(*,
logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))
lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values
if MPI is not None:
listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples
else:
listoflrpairs = [lrlocal]
listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples
lens, rews = map(flatten_lists, zip(*listoflrpairs))
lenbuffer.extend(lens)
rewbuffer.extend(rews)

File diff suppressed because one or more lines are too long


@@ -11,14 +11,10 @@ extras = {
'test': [
'filelock',
'pytest',
'pytest-forked',
'atari-py'
],
'bullet': [
'pybullet',
],
'mpi': [
'mpi4py'
]
}
@@ -38,6 +34,7 @@ setup(name='baselines',
'joblib',
'dill',
'progressbar2',
'mpi4py',
'cloudpickle',
'click',
'opencv-python'
@@ -60,4 +57,4 @@ for tf_pkg_name in ['tensorflow', 'tensorflow-gpu']:
pass
assert tf_pkg is not None, 'TensorFlow needed, of version above 1.4'
from distutils.version import StrictVersion
assert StrictVersion(re.sub(r'-?rc\d+$', '', tf_pkg.version)) >= StrictVersion('1.4.0')
assert StrictVersion(re.sub(r'-rc\d+$', '', tf_pkg.version)) >= StrictVersion('1.4.0')