From 376fd88bb820bdcbe7d192928cb50ec8c54f9842 Mon Sep 17 00:00:00 2001 From: Christopher Hesse Date: Wed, 3 Apr 2019 16:28:29 -0700 Subject: [PATCH 01/14] fix vec monitor infos --- baselines/common/vec_env/vec_monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/baselines/common/vec_env/vec_monitor.py b/baselines/common/vec_env/vec_monitor.py index a7b1ce4..efaafc9 100644 --- a/baselines/common/vec_env/vec_monitor.py +++ b/baselines/common/vec_env/vec_monitor.py @@ -33,7 +33,7 @@ class VecMonitor(VecEnvWrapper): self.eprets += rews self.eplens += 1 - newinfos = infos[:] + newinfos = list(infos[:]) for i in range(len(dones)): if dones[i]: info = infos[i].copy() From 5082e5d34b29e21c171cb372ffdbbe9d9e621232 Mon Sep 17 00:00:00 2001 From: Karl Cobbe Date: Thu, 4 Apr 2019 13:52:00 -0700 Subject: [PATCH 02/14] Workbench (#303) * begin workbench * cleanup * begin procgen config integration * arg tweaks * more args * parameter saving * begin procgen enjoy * tweaks * more workbench * more args sync/restore * cleanup * merge in master * rework args priority * more workbench * more loggign * impala cnn * impala lstm * tweak * tweaks * rl19 time logging * misc fixes * faster pipeline * update local.py * sess and log config tweaks * num processes * logging tweaks * difficulty reward wrapper * logging fixes * gin tweaks * tweak * fix * task id * param loading * more variable loading * entrypoint * tweak * ksync * restore lstm * begin rl19 support * tweak * rl19 rnn * more rl19 integration * fix * cleanup * restore rl19 rnn * cleanup * cleanup * wrappers.get_log_info * cleanup * cleanup * directory cleanup * logging, num_experiments * fixes * cleanup * gin fixes * fix local max gpu * resid nx * num machines and download params * rename * cleanup * create workbench * more reorg * fix * more logging wrappers * lint fix * restore train procgen * restore train procgen * pylint fix * better wrapping * config sweep * args sweep * test workers * mpi_weight * train test comm and high difficulty fix * enjoy show returns * removing gin, procgen_parser * removing gin * procgen args * config fixes * cleanup * cleanup * procgen args fix * fix * rcall syncing * lint * rename mpi_weight * use username for sync * fixes * microbatch fix --- baselines/common/models.py | 52 +++++++++++++++++++++++--- baselines/common/mpi_adam_optimizer.py | 13 +++++-- baselines/ppo2/microbatched_model.py | 5 ++- baselines/ppo2/model.py | 4 +- baselines/ppo2/ppo2.py | 28 +++++++++----- 5 files changed, 79 insertions(+), 23 deletions(-) diff --git a/baselines/common/models.py b/baselines/common/models.py index 0003079..0798916 100644 --- a/baselines/common/models.py +++ b/baselines/common/models.py @@ -3,7 +3,6 @@ import tensorflow as tf from baselines.a2c import utils from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch from baselines.common.mpi_running_mean_std import RunningMeanStd -import tensorflow.contrib.layers as layers mapping = {} @@ -26,6 +25,42 @@ def nature_cnn(unscaled_images, **conv_kwargs): h3 = conv_to_fc(h3) return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))) +def build_impala_cnn(unscaled_images, depths=[16,32,32], **conv_kwargs): + """ + Model used in the paper "IMPALA: Scalable Distributed Deep-RL with + Importance Weighted Actor-Learner Architectures" https://arxiv.org/abs/1802.01561 + """ + def conv_layer(out, depth): + return tf.layers.conv2d(out, depth, 3, padding='same') + + def residual_block(inputs): + depth = inputs.get_shape()[-1].value + + out = 
tf.nn.relu(inputs) + + out = conv_layer(out, depth) + out = tf.nn.relu(out) + out = conv_layer(out, depth) + return out + inputs + + def conv_sequence(inputs, depth): + out = conv_layer(inputs, depth) + out = tf.layers.max_pooling2d(out, pool_size=3, strides=2, padding='same') + out = residual_block(out) + out = residual_block(out) + return out + + out = tf.cast(unscaled_images, tf.float32) / 255. + + for depth in depths: + out = conv_sequence(out, depth) + + out = tf.layers.flatten(out) + out = tf.nn.relu(out) + out = tf.layers.dense(out, 256, activation=tf.nn.relu) + + return out + @register("mlp") def mlp(num_layers=2, num_hidden=64, activation=tf.tanh, layer_norm=False): @@ -65,6 +100,11 @@ def cnn(**conv_kwargs): return nature_cnn(X, **conv_kwargs) return network_fn +@register("impala_cnn") +def impala_cnn(**conv_kwargs): + def network_fn(X): + return build_impala_cnn(X) + return network_fn @register("cnn_small") def cnn_small(**conv_kwargs): @@ -79,7 +119,6 @@ def cnn_small(**conv_kwargs): return h return network_fn - @register("lstm") def lstm(nlstm=128, layer_norm=False): """ @@ -136,12 +175,12 @@ def lstm(nlstm=128, layer_norm=False): @register("cnn_lstm") -def cnn_lstm(nlstm=128, layer_norm=False, **conv_kwargs): +def cnn_lstm(nlstm=128, layer_norm=False, conv_fn=nature_cnn, **conv_kwargs): def network_fn(X, nenv=1): nbatch = X.shape[0] nsteps = nbatch // nenv - h = nature_cnn(X, **conv_kwargs) + h = conv_fn(X, **conv_kwargs) M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) S = tf.placeholder(tf.float32, [nenv, 2*nlstm]) #states @@ -161,6 +200,9 @@ def cnn_lstm(nlstm=128, layer_norm=False, **conv_kwargs): return network_fn +@register("impala_cnn_lstm") +def impala_cnn_lstm(): + return cnn_lstm(nlstm=256, conv_fn=build_impala_cnn) @register("cnn_lnlstm") def cnn_lnlstm(nlstm=128, **conv_kwargs): @@ -187,7 +229,7 @@ def conv_only(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], **conv_kwargs): out = tf.cast(X, tf.float32) / 255. 
with tf.variable_scope("convnet"): for num_outputs, kernel_size, stride in convs: - out = layers.convolution2d(out, + out = tf.contrib.layers.convolution2d(out, num_outputs=num_outputs, kernel_size=kernel_size, stride=stride, diff --git a/baselines/common/mpi_adam_optimizer.py b/baselines/common/mpi_adam_optimizer.py index db7f7a2..dcbcd74 100644 --- a/baselines/common/mpi_adam_optimizer.py +++ b/baselines/common/mpi_adam_optimizer.py @@ -9,22 +9,27 @@ except ImportError: class MpiAdamOptimizer(tf.train.AdamOptimizer): """Adam optimizer that averages gradients across mpi processes.""" - def __init__(self, comm, **kwargs): + def __init__(self, comm, mpi_rank_weight=1, **kwargs): self.comm = comm + self.mpi_rank_weight = mpi_rank_weight tf.train.AdamOptimizer.__init__(self, **kwargs) def compute_gradients(self, loss, var_list, **kwargs): grads_and_vars = tf.train.AdamOptimizer.compute_gradients(self, loss, var_list, **kwargs) grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None] - flat_grad = tf.concat([tf.reshape(g, (-1,)) for g, v in grads_and_vars], axis=0) + flat_grad = tf.concat([tf.reshape(g, (-1,)) for g, v in grads_and_vars], axis=0) * self.mpi_rank_weight shapes = [v.shape.as_list() for g, v in grads_and_vars] sizes = [int(np.prod(s)) for s in shapes] - num_tasks = self.comm.Get_size() + + total_weight = np.zeros(1, np.float32) + self.comm.Allreduce(np.array([self.mpi_rank_weight], dtype=np.float32), total_weight, op=MPI.SUM) + total_weight = total_weight[0] + buf = np.zeros(sum(sizes), np.float32) countholder = [0] # Counts how many times _collect_grads has been called stat = tf.reduce_sum(grads_and_vars[0][1]) # sum of first variable def _collect_grads(flat_grad, np_stat): self.comm.Allreduce(flat_grad, buf, op=MPI.SUM) - np.divide(buf, float(num_tasks), out=buf) + np.divide(buf, float(total_weight), out=buf) if countholder[0] % 100 == 0: check_synced(np_stat, self.comm) countholder[0] += 1 diff --git a/baselines/ppo2/microbatched_model.py b/baselines/ppo2/microbatched_model.py index 6735ed4..8d8b688 100644 --- a/baselines/ppo2/microbatched_model.py +++ b/baselines/ppo2/microbatched_model.py @@ -8,7 +8,7 @@ class MicrobatchedModel(Model): on the entire minibatch causes some overflow """ def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, - nsteps, ent_coef, vf_coef, max_grad_norm, microbatch_size): + nsteps, ent_coef, vf_coef, max_grad_norm, mpi_rank_weight, microbatch_size): self.nmicrobatches = nbatch_train // microbatch_size self.microbatch_size = microbatch_size @@ -23,7 +23,8 @@ class MicrobatchedModel(Model): nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, - max_grad_norm=max_grad_norm) + max_grad_norm=max_grad_norm, + mpi_rank_weight=mpi_rank_weight) self.grads_ph = [tf.placeholder(dtype=g.dtype, shape=g.shape) for g in self.grads] grads_ph_and_vars = list(zip(self.grads_ph, self.var)) diff --git a/baselines/ppo2/model.py b/baselines/ppo2/model.py index 2326b46..9370d5c 100644 --- a/baselines/ppo2/model.py +++ b/baselines/ppo2/model.py @@ -25,7 +25,7 @@ class Model(object): - Save load the model """ def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, - nsteps, ent_coef, vf_coef, max_grad_norm, microbatch_size=None): + nsteps, ent_coef, vf_coef, max_grad_norm, mpi_rank_weight=1, microbatch_size=None): self.sess = sess = get_session() with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): @@ -92,7 +92,7 @@ class Model(object): params = tf.trainable_variables('ppo2_model') # 2. 
Build our trainer if MPI is not None: - self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) + self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, mpi_rank_weight=mpi_rank_weight, epsilon=1e-5) else: self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) # 3. Calculate the gradients diff --git a/baselines/ppo2/ppo2.py b/baselines/ppo2/ppo2.py index 7f3d204..09bc933 100644 --- a/baselines/ppo2/ppo2.py +++ b/baselines/ppo2/ppo2.py @@ -21,7 +21,7 @@ def constfn(val): def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, - save_interval=0, load_path=None, model_fn=None, **network_kwargs): + save_interval=0, load_path=None, model_fn=None, update_fn=None, init_fn=None, mpi_rank_weight=1, **network_kwargs): ''' Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347) @@ -105,7 +105,7 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2 model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, - max_grad_norm=max_grad_norm) + max_grad_norm=max_grad_norm, mpi_rank_weight=mpi_rank_weight) if load_path is not None: model.load(load_path) @@ -118,6 +118,9 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2 if eval_env is not None: eval_epinfobuf = deque(maxlen=100) + if init_fn is not None: + init_fn() + # Start total timer tfirststart = time.perf_counter() @@ -176,31 +179,36 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2 tnow = time.perf_counter() # Calculate the fps (frame per second) fps = int(nbatch / (tnow - tstart)) + + if update_fn is not None: + update_fn(update) + if update % log_interval == 0 or update == 1: # Calculates if value function is a good predicator of the returns (ev > 1) # or if it's just worse than predicting nothing (ev =< 0) ev = explained_variance(values, returns) - logger.logkv("serial_timesteps", update*nsteps) - logger.logkv("nupdates", update) - logger.logkv("total_timesteps", update*nbatch) + logger.logkv("misc/serial_timesteps", update*nsteps) + logger.logkv("misc/nupdates", update) + logger.logkv("misc/total_timesteps", update*nbatch) logger.logkv("fps", fps) - logger.logkv("explained_variance", float(ev)) + logger.logkv("misc/explained_variance", float(ev)) logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) if eval_env is not None: logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf]) ) logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf]) ) - logger.logkv('time_elapsed', tnow - tfirststart) + logger.logkv('misc/time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model.loss_names): - logger.logkv(lossname, lossval) - if MPI is None or MPI.COMM_WORLD.Get_rank() == 0: - logger.dumpkvs() + logger.logkv('loss/' + lossname, lossval) + + logger.dumpkvs() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and (MPI is None or MPI.COMM_WORLD.Get_rank() == 0): checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i'%update) print('Saving to', savepath) 
model.save(savepath) + return model # Avoid division error when calculate the mean (in our case if epinfo is empty returns np.nan, not return an error) def safemean(xs): From 07cbf1e26a30fb0452f06b79db6a30a8f19a67fd Mon Sep 17 00:00:00 2001 From: John Schulman Date: Tue, 16 Apr 2019 09:20:09 -0700 Subject: [PATCH 03/14] Grad clipping in MpiAdamOptimizer, transformer changes (#304) * transformer mnist experiments * version that only builds one model * work on inverted mnist * Add grad clipping to MpiAdamOptimizer * various * transformer changes, loading * get rid of soft labels * transformer baseline * minor * experiments involving all possible training sets * vary training * minor * get ready for fine-tuning expers * lint * minor --- baselines/common/mpi_adam_optimizer.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/baselines/common/mpi_adam_optimizer.py b/baselines/common/mpi_adam_optimizer.py index dcbcd74..3d7cee5 100644 --- a/baselines/common/mpi_adam_optimizer.py +++ b/baselines/common/mpi_adam_optimizer.py @@ -2,6 +2,7 @@ import numpy as np import tensorflow as tf from baselines.common import tf_util as U from baselines.common.tests.test_with_mpi import with_mpi +from baselines import logger try: from mpi4py import MPI except ImportError: @@ -9,8 +10,9 @@ except ImportError: class MpiAdamOptimizer(tf.train.AdamOptimizer): """Adam optimizer that averages gradients across mpi processes.""" - def __init__(self, comm, mpi_rank_weight=1, **kwargs): + def __init__(self, comm, grad_clip=None, mpi_rank_weight=1, **kwargs): self.comm = comm + self.grad_clip = grad_clip self.mpi_rank_weight = mpi_rank_weight tf.train.AdamOptimizer.__init__(self, **kwargs) def compute_gradients(self, loss, var_list, **kwargs): @@ -28,6 +30,12 @@ class MpiAdamOptimizer(tf.train.AdamOptimizer): countholder = [0] # Counts how many times _collect_grads has been called stat = tf.reduce_sum(grads_and_vars[0][1]) # sum of first variable def _collect_grads(flat_grad, np_stat): + if self.grad_clip is not None: + gradnorm = np.linalg.norm(flat_grad) + if gradnorm > 1: + flat_grad /= gradnorm + logger.logkv_mean('gradnorm', gradnorm) + logger.logkv_mean('gradclipfrac', float(gradnorm > 1)) self.comm.Allreduce(flat_grad, buf, op=MPI.SUM) np.divide(buf, float(total_weight), out=buf) if countholder[0] % 100 == 0: @@ -56,8 +64,8 @@ def check_synced(localval, comm=None): comm = comm or MPI.COMM_WORLD vals = comm.gather(localval) if comm.rank == 0: - assert all(val==vals[0] for val in vals[1:]) - + assert all(val==vals[0] for val in vals[1:]),\ + f'MpiAdamOptimizer detected that different workers have different weights: {vals}' @with_mpi(timeout=5) def test_nonfreeze(): From b83a66527de2f327555d267f63ad14ced5589ec1 Mon Sep 17 00:00:00 2001 From: John Schulman Date: Wed, 17 Apr 2019 10:13:12 -0700 Subject: [PATCH 04/14] Add jrl19 as backend for workbench (#324) enable jrl in workbench minor logger changes --- baselines/logger.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/baselines/logger.py b/baselines/logger.py index a0e75ab..4d9ffe6 100644 --- a/baselines/logger.py +++ b/baselines/logger.py @@ -38,8 +38,8 @@ class HumanOutputFormat(KVWriter, SeqWriter): # Create strings for printing key2str = {} for (key, val) in sorted(kvs.items()): - if isinstance(val, float): - valstr = '%-8.3g' % (val,) + if hasattr(val, '__float__'): + valstr = '%-8.3g' % val else: valstr = str(val) key2str[self._truncate(key)] = self._truncate(valstr) @@ -361,6 +361,15 @@ 
class Logger(object): if isinstance(fmt, SeqWriter): fmt.writeseq(map(str, args)) +def get_rank_without_mpi_import(): + # check environment variables here instead of importing mpi4py + # to avoid calling MPI_Init() when this module is imported + for varname in ['PMI_RANK', 'OMPI_COMM_WORLD_RANK']: + if varname in os.environ: + return int(os.environ[varname]) + return 0 + + def configure(dir=None, format_strs=None, comm=None, log_suffix=''): """ If comm is provided, average all numerical stats across that comm @@ -373,12 +382,7 @@ def configure(dir=None, format_strs=None, comm=None, log_suffix=''): assert isinstance(dir, str) os.makedirs(dir, exist_ok=True) - rank = 0 - # check environment variables here instead of importing mpi4py - # to avoid calling MPI_Init() when this module is imported - for varname in ['PMI_RANK', 'OMPI_COMM_WORLD_RANK']: - if varname in os.environ: - rank = int(os.environ[varname]) + rank = get_rank_without_mpi_import() if rank > 0: log_suffix = log_suffix + "-rank%03i" % rank From a93dde3b2b974917d41cf4e589d6a2b0aba01b64 Mon Sep 17 00:00:00 2001 From: pzhokhov Date: Wed, 17 Apr 2019 15:17:27 -0700 Subject: [PATCH 05/14] extra functionality in baselines.common.plot_util (#310) * get plot_util from mt_experiments branch * add labels * unit tests for plot_util --- baselines/common/plot_util.py | 42 ++++++++++++++++++++---- baselines/common/tests/test_plot_util.py | 17 ++++++++++ baselines/common/tests/util.py | 13 ++++++++ baselines/ddpg/test_smoke.py | 8 ++--- 4 files changed, 68 insertions(+), 12 deletions(-) create mode 100644 baselines/common/tests/test_plot_util.py diff --git a/baselines/common/plot_util.py b/baselines/common/plot_util.py index 26b1613..e15c508 100644 --- a/baselines/common/plot_util.py +++ b/baselines/common/plot_util.py @@ -248,7 +248,10 @@ def plot_results( figsize=None, legend_outside=False, resample=0, - smooth_step=1.0 + smooth_step=1.0, + tiling='vertical', + xlabel=None, + ylabel=None ): ''' Plot multiple Results objects @@ -300,9 +303,23 @@ def plot_results( sk2r[splitkey].append(result) assert len(sk2r) > 0 assert isinstance(resample, int), "0: don't resample. 
: that many samples" - nrows = len(sk2r) - ncols = 1 - figsize = figsize or (6, 6 * nrows) + if tiling == 'vertical' or tiling is None: + nrows = len(sk2r) + ncols = 1 + elif tiling == 'horizontal': + ncols = len(sk2r) + nrows = 1 + elif tiling == 'symmetric': + import math + N = len(sk2r) + largest_divisor = 1 + for i in range(1, int(math.sqrt(N))+1): + if N % i == 0: + largest_divisor = i + ncols = largest_divisor + nrows = N // ncols + figsize = figsize or (6 * ncols, 6 * nrows) + f, axarr = plt.subplots(nrows, ncols, sharex=False, squeeze=False, figsize=figsize) groups = list(set(group_fn(result) for result in allresults)) @@ -316,7 +333,9 @@ def plot_results( g2c = defaultdict(int) sresults = sk2r[sk] gresults = defaultdict(list) - ax = axarr[isplit][0] + idx_row = isplit // ncols + idx_col = isplit % ncols + ax = axarr[idx_row][idx_col] for result in sresults: group = group_fn(result) g2c[group] += 1 @@ -355,7 +374,7 @@ def plot_results( ymean = np.mean(ys, axis=0) ystd = np.std(ys, axis=0) ystderr = ystd / np.sqrt(len(ys)) - l, = axarr[isplit][0].plot(usex, ymean, color=color) + l, = axarr[idx_row][idx_col].plot(usex, ymean, color=color) g2l[group] = l if shaded_err: ax.fill_between(usex, ymean - ystderr, ymean + ystderr, color=color, alpha=.4) @@ -372,6 +391,17 @@ def plot_results( loc=2 if legend_outside else None, bbox_to_anchor=(1,1) if legend_outside else None) ax.set_title(sk) + # add xlabels, but only to the bottom row + if xlabel is not None: + for ax in axarr[-1]: + plt.sca(ax) + plt.xlabel(xlabel) + # add ylabels, but only to left column + if ylabel is not None: + for ax in axarr[:,0]: + plt.sca(ax) + plt.ylabel(ylabel) + return f, axarr def regression_analysis(df): diff --git a/baselines/common/tests/test_plot_util.py b/baselines/common/tests/test_plot_util.py new file mode 100644 index 0000000..be33308 --- /dev/null +++ b/baselines/common/tests/test_plot_util.py @@ -0,0 +1,17 @@ +# smoke tests of plot_util +from baselines.common import plot_util as pu +from baselines.common.tests.util import smoketest + + +def test_plot_util(): + nruns = 4 + logdirs = [smoketest('--alg=ppo2 --env=CartPole-v0 --num_timesteps=10000') for _ in range(nruns)] + data = pu.load_results(logdirs) + assert len(data) == 4 + + _, axes = pu.plot_results(data[:1]); assert len(axes) == 1 + _, axes = pu.plot_results(data, tiling='vertical'); assert axes.shape==(4,1) + _, axes = pu.plot_results(data, tiling='horizontal'); assert axes.shape==(1,4) + _, axes = pu.plot_results(data, tiling='symmetric'); assert axes.shape==(2,2) + _, axes = pu.plot_results(data, split_fn=lambda _: ''); assert len(axes) == 1 + diff --git a/baselines/common/tests/util.py b/baselines/common/tests/util.py index 441e3f7..b3d31fe 100644 --- a/baselines/common/tests/util.py +++ b/baselines/common/tests/util.py @@ -77,3 +77,16 @@ def rollout(env, model, n_trials): observations.append(episode_obs) return observations, actions, rewards + +def smoketest(argstr, **kwargs): + import tempfile + import subprocess + import os + argstr = 'python -m baselines.run ' + argstr + for key, value in kwargs: + argstr += ' --{}={}'.format(key, value) + tempdir = tempfile.mkdtemp() + env = os.environ.copy() + env['OPENAI_LOGDIR'] = tempdir + subprocess.run(argstr.split(' '), env=env) + return tempdir diff --git a/baselines/ddpg/test_smoke.py b/baselines/ddpg/test_smoke.py index a9fdc05..bd7eba6 100644 --- a/baselines/ddpg/test_smoke.py +++ b/baselines/ddpg/test_smoke.py @@ -1,10 +1,6 @@ -from multiprocessing import Process -import baselines.run - 
+from baselines.common.tests.util import smoketest def _run(argstr): - p = Process(target=baselines.run.main, args=('--alg=ddpg --env=Pendulum-v0 --num_timesteps=0 ' + argstr).split(' ')) - p.start() - p.join() + smoketest('--alg=ddpg --env=Pendulum-v0 --num_timesteps=0 ' + argstr) def test_popart(): _run('--normalize_returns=True --popart=True') From 967fc8c37f8d3ad39db223c002fc0a502686d214 Mon Sep 17 00:00:00 2001 From: John Schulman Date: Sat, 20 Apr 2019 10:08:09 -0700 Subject: [PATCH 06/14] Fixed sequence env minor (#333) minor changes to FixedSequenceEnv to allow full score --- baselines/common/tests/envs/fixed_sequence_env.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/baselines/common/tests/envs/fixed_sequence_env.py b/baselines/common/tests/envs/fixed_sequence_env.py index f5460d5..b3fe396 100644 --- a/baselines/common/tests/envs/fixed_sequence_env.py +++ b/baselines/common/tests/envs/fixed_sequence_env.py @@ -9,18 +9,16 @@ class FixedSequenceEnv(Env): n_actions=10, episode_len=100 ): - self.np_random = np.random.RandomState() - self.sequence = None - self.action_space = Discrete(n_actions) self.observation_space = Discrete(1) - + self.np_random = np.random.RandomState(0) self.episode_len = episode_len + self.sequence = [self.np_random.randint(0, self.action_space.n) + for _ in range(self.episode_len)] self.time = 0 + def reset(self): - if self.sequence is None: - self.sequence = [self.np_random.randint(0, self.action_space.n-1) for _ in range(self.episode_len)] self.time = 0 return 0 @@ -29,7 +27,6 @@ class FixedSequenceEnv(Env): self._choose_next_state() done = False if self.episode_len and self.time >= self.episode_len: - rew = 0 done = True return 0, rew, done, {} From bc4eef60531b317bc6e00ae2c5a8e2792318191c Mon Sep 17 00:00:00 2001 From: Christopher Hesse Date: Sat, 20 Apr 2019 15:06:18 -0700 Subject: [PATCH 07/14] fix tests (#335) --- baselines/logger.py | 1 - 1 file changed, 1 deletion(-) diff --git a/baselines/logger.py b/baselines/logger.py index 4d9ffe6..36b0c98 100644 --- a/baselines/logger.py +++ b/baselines/logger.py @@ -92,7 +92,6 @@ class JSONOutputFormat(KVWriter): def writekvs(self, kvs): for k, v in sorted(kvs.items()): if hasattr(v, 'dtype'): - v = v.tolist() kvs[k] = float(v) self.file.write(json.dumps(kvs) + '\n') self.file.flush() From ddcab1606d779c7e66c742b0d1e6dccc278305a6 Mon Sep 17 00:00:00 2001 From: Karl Cobbe Date: Mon, 22 Apr 2019 13:12:45 -0700 Subject: [PATCH 08/14] Procgen Benchmark Updates (#328) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * directory cleanup * logging, num_experiments * fixes * cleanup * gin fixes * fix local max gpu * resid nx * tweak * num machines and download params * rename * cleanup * create workbench * more reorg * fix * more logging wrappers * lint fix * restore train procgen * restore train procgen * pylint fix * better wrapping * whackamole walls * config sweep * tweak * args sweep * tweak * test workers * mpi_weight * train test comm and high difficulty fix * enjoy show returns * better joint training * tweak * Add —update to args and add gin-config to requirements.txt * add username to download_file * removing gin, procgen_parser * removing gin * procgen args * config fixes * cleanup * cleanup * procgen args fix * fix * rcall syncing * lint * rename mpi_weight * begin composable game * more composable game * tweak * background alpha * use username for sync * fixes * microbatch fix * lure composable game * merge * proc trans update * proc 
trans update (#307) * finetuning experiment * Change is_local to use `use_rcall` and fix error of `enjoy.py` with multiple ends * graphing help * add --local * change args_dict['env_name'] to ENV_NAME * finetune experiments * tweak * tweak * reorg wrappers, remove is_local * workdir/local fixes * move finetune experiments * default dir and graphing * more graphing * fix * pooled syncing * tweaks * dir fix * tweak * wrapper mpi fix * wind and turrets * composability cleanup * radius cleanup * composable reorg * laser gates * composable tweaks * soft walls * tweak * begin swamp * more swamp * more swamp * fix * hidden mines * use maze layout * tweak * laser gate tweaks * tweaks * tweaks * lure/propel updates * composable midnight * composable coinmaze * composability difficulty * tweak * add step to save_params * composable offsets * composable boxpush * composable combiner * tweak * tweak * always choose correct number of mechanics * fix * rcall local fix * add steps when dump and save parmas * loading rank 1,2,3.. error fix * add experiments.py * fix loading latest weight with no -rest * support more complex run_id and add more examples * fix typo * move post_run_id into experiments.py * add hp_search example * error fix * joint experiments in progress * joint hp finished * typo * error fix * edit experiments * Save experiments set up in code and save weights per step (#319) * add step to save_params * add steps when dump and save parmas * loading rank 1,2,3.. error fix * add experiments.py * fix loading latest weight with no -rest * support more complex run_id and add more examples * fix typo * move post_run_id into experiments.py * add hp_search example * error fix * joint experiments in progress * joint hp finished * typo * error fix * edit experiments * tweaks * graph exp WIP * depth tweaks * move save_all * fix * restore_dir name * restore depth * choose max mechanics * use override mode * tweak frogger * lstm default * fix * patience is composable * hunter is composable * fixed asset seed cleanup * minesweeper is composable * eggcatch is composable * tweak * applesort is composable * chaser game * begin lighter * lighter game * tractor game * boxgather game * plumber game * hitcher game * doorbell game * lawnmower game * connecter game * cannonaim * outrun game * encircle game * spinner game * tweak * tweak * detonator game * driller * driller * mixer * conveyor * conveyor game * joint pcg experiments * fixes * pcg sweep experiment * cannonaim fix * combiner fix * store save time * laseraim fix * lightup fix * detonator tweaks * detonator fixes * driller fix * lawnmower calibration * spinner calibration * propel fix * train experiment * print load time * system independent hashing * remove gin configurable * task ids fix * test_pcg experiment * connecter dense reward * hard_pcg * num train comms * mpi splits envs * tweaks * tweaks * graph tweaks * graph tweaks * lint fix * fix tests * load bugfix * difficulty timeout tweak * tweaks * more graphing * graph tweaks * tweak * download file fix * pcg train envs list * cleanup * tweak * manually name impala layers * tweak * expect fps * backend arg * args tweak * workbench cleanup * move graph files * workbench cleanup * split env name by comma * workbench cleanup * ema graph * remove Dict * use tf.io.gfile * comments for auto-killing jobs * lint fix * write latest file when not saving all and load it when step=None --- baselines/common/models.py | 13 +++++++++++-- baselines/ppo2/microbatched_model.py | 5 +++-- baselines/ppo2/model.py | 9 
++++++--- baselines/ppo2/ppo2.py | 4 ++-- 4 files changed, 22 insertions(+), 9 deletions(-) diff --git a/baselines/common/models.py b/baselines/common/models.py index 0798916..a6fe467 100644 --- a/baselines/common/models.py +++ b/baselines/common/models.py @@ -30,8 +30,17 @@ def build_impala_cnn(unscaled_images, depths=[16,32,32], **conv_kwargs): Model used in the paper "IMPALA: Scalable Distributed Deep-RL with Importance Weighted Actor-Learner Architectures" https://arxiv.org/abs/1802.01561 """ + + layer_num = 0 + + def get_layer_num_str(): + nonlocal layer_num + num_str = str(layer_num) + layer_num += 1 + return num_str + def conv_layer(out, depth): - return tf.layers.conv2d(out, depth, 3, padding='same') + return tf.layers.conv2d(out, depth, 3, padding='same', name='layer_' + get_layer_num_str()) def residual_block(inputs): depth = inputs.get_shape()[-1].value @@ -57,7 +66,7 @@ def build_impala_cnn(unscaled_images, depths=[16,32,32], **conv_kwargs): out = tf.layers.flatten(out) out = tf.nn.relu(out) - out = tf.layers.dense(out, 256, activation=tf.nn.relu) + out = tf.layers.dense(out, 256, activation=tf.nn.relu, name='layer_' + get_layer_num_str()) return out diff --git a/baselines/ppo2/microbatched_model.py b/baselines/ppo2/microbatched_model.py index 8d8b688..a35b830 100644 --- a/baselines/ppo2/microbatched_model.py +++ b/baselines/ppo2/microbatched_model.py @@ -8,7 +8,7 @@ class MicrobatchedModel(Model): on the entire minibatch causes some overflow """ def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, - nsteps, ent_coef, vf_coef, max_grad_norm, mpi_rank_weight, microbatch_size): + nsteps, ent_coef, vf_coef, max_grad_norm, mpi_rank_weight, comm, microbatch_size): self.nmicrobatches = nbatch_train // microbatch_size self.microbatch_size = microbatch_size @@ -24,7 +24,8 @@ class MicrobatchedModel(Model): ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, - mpi_rank_weight=mpi_rank_weight) + mpi_rank_weight=mpi_rank_weight, + comm=comm) self.grads_ph = [tf.placeholder(dtype=g.dtype, shape=g.shape) for g in self.grads] grads_ph_and_vars = list(zip(self.grads_ph, self.var)) diff --git a/baselines/ppo2/model.py b/baselines/ppo2/model.py index 9370d5c..35a883d 100644 --- a/baselines/ppo2/model.py +++ b/baselines/ppo2/model.py @@ -25,9 +25,12 @@ class Model(object): - Save load the model """ def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, - nsteps, ent_coef, vf_coef, max_grad_norm, mpi_rank_weight=1, microbatch_size=None): + nsteps, ent_coef, vf_coef, max_grad_norm, mpi_rank_weight=1, comm=None, microbatch_size=None): self.sess = sess = get_session() + if MPI is not None and comm is None: + comm = MPI.COMM_WORLD + with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling @@ -91,8 +94,8 @@ class Model(object): # 1. Get the model parameters params = tf.trainable_variables('ppo2_model') # 2. Build our trainer - if MPI is not None: - self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, mpi_rank_weight=mpi_rank_weight, epsilon=1e-5) + if comm is not None and comm.Get_size() > 1: + self.trainer = MpiAdamOptimizer(comm, learning_rate=LR, mpi_rank_weight=mpi_rank_weight, epsilon=1e-5) else: self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) # 3. 
Calculate the gradients diff --git a/baselines/ppo2/ppo2.py b/baselines/ppo2/ppo2.py index 09bc933..f3a69d8 100644 --- a/baselines/ppo2/ppo2.py +++ b/baselines/ppo2/ppo2.py @@ -21,7 +21,7 @@ def constfn(val): def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, - save_interval=0, load_path=None, model_fn=None, update_fn=None, init_fn=None, mpi_rank_weight=1, **network_kwargs): + save_interval=0, load_path=None, model_fn=None, update_fn=None, init_fn=None, mpi_rank_weight=1, comm=None, **network_kwargs): ''' Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347) @@ -105,7 +105,7 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2 model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, - max_grad_norm=max_grad_norm, mpi_rank_weight=mpi_rank_weight) + max_grad_norm=max_grad_norm, comm=comm, mpi_rank_weight=mpi_rank_weight) if load_path is not None: model.load(load_path) From 8e0282ee941c70ddc732556845f8ac12b5bd7f19 Mon Sep 17 00:00:00 2001 From: pzhokhov Date: Mon, 22 Apr 2019 14:41:46 -0700 Subject: [PATCH 09/14] ci/runtests.sh - pass all folders to pytest (#342) * ci/runtests.sh - pass all folders to pytest * mpi_optimizer_test precision 1e-4 * fixes to tests * search for tests in the entire jax folder, also remove unnecessary humor --- baselines/common/tests/test_with_mpi.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/baselines/common/tests/test_with_mpi.py b/baselines/common/tests/test_with_mpi.py index 86be475..9388078 100644 --- a/baselines/common/tests/test_with_mpi.py +++ b/baselines/common/tests/test_with_mpi.py @@ -4,6 +4,7 @@ import subprocess import cloudpickle import base64 import pytest +from functools import wraps try: from mpi4py import MPI @@ -12,6 +13,7 @@ except ImportError: def with_mpi(nproc=2, timeout=30, skip_if_no_mpi=True): def outer_thunk(fn): + @wraps(fn) def thunk(*args, **kwargs): serialized_fn = base64.b64encode(cloudpickle.dumps(lambda: fn(*args, **kwargs))) subprocess.check_call([ From f5daca8c22d964a703beabbcab44e11d75881a34 Mon Sep 17 00:00:00 2001 From: John Schulman Date: Mon, 22 Apr 2019 14:45:01 -0700 Subject: [PATCH 10/14] delete unnecessary stuff (#338) --- setup.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/setup.py b/setup.py index 130cdb5..6074009 100644 --- a/setup.py +++ b/setup.py @@ -36,8 +36,6 @@ setup(name='baselines', 'scipy', 'tqdm', 'joblib', - 'dill', - 'progressbar2', 'cloudpickle', 'click', 'opencv-python' From 64dfabb8eb53664d300c573dbd456fbd85ce4aad Mon Sep 17 00:00:00 2001 From: Greg Brockman Date: Tue, 23 Apr 2019 13:40:08 -0700 Subject: [PATCH 11/14] Add initializer for process-level setup in SubprocVecEnv (#276) * Add initializer for process-level setup in SubprocVecEnv Use case: run logger.configure() in each subprocess * Add option to force dummy vec env --- baselines/common/cmd_util.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/baselines/common/cmd_util.py b/baselines/common/cmd_util.py index 016df93..99ec11c 100644 --- a/baselines/common/cmd_util.py +++ b/baselines/common/cmd_util.py @@ -25,7 +25,9 @@ def make_vec_env(env_id, env_type, num_env, seed, start_index=0, reward_scale=1.0, flatten_dict_observations=True, - gamestate=None): + 
gamestate=None, + initializer=None, + force_dummy=False): """ Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo. """ @@ -34,7 +36,7 @@ def make_vec_env(env_id, env_type, num_env, seed, mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0 seed = seed + 10000 * mpi_rank if seed is not None else None logger_dir = logger.get_dir() - def make_thunk(rank): + def make_thunk(rank, initializer=None): return lambda: make_env( env_id=env_id, env_type=env_type, @@ -46,17 +48,21 @@ def make_vec_env(env_id, env_type, num_env, seed, flatten_dict_observations=flatten_dict_observations, wrapper_kwargs=wrapper_kwargs, env_kwargs=env_kwargs, - logger_dir=logger_dir + logger_dir=logger_dir, + initializer=initializer ) set_global_seeds(seed) - if num_env > 1: - return SubprocVecEnv([make_thunk(i + start_index) for i in range(num_env)]) + if not force_dummy and num_env > 1: + return SubprocVecEnv([make_thunk(i + start_index, initializer=initializer) for i in range(num_env)]) else: - return DummyVecEnv([make_thunk(start_index)]) + return DummyVecEnv([make_thunk(i + start_index, initializer=None) for i in range(num_env)]) -def make_env(env_id, env_type, mpi_rank=0, subrank=0, seed=None, reward_scale=1.0, gamestate=None, flatten_dict_observations=True, wrapper_kwargs=None, env_kwargs=None, logger_dir=None): +def make_env(env_id, env_type, mpi_rank=0, subrank=0, seed=None, reward_scale=1.0, gamestate=None, flatten_dict_observations=True, wrapper_kwargs=None, env_kwargs=None, logger_dir=None, initializer=None): + if initializer is not None: + initializer(mpi_rank=mpi_rank, subrank=subrank) + wrapper_kwargs = wrapper_kwargs or {} env_kwargs = env_kwargs or {} if ':' in env_id: From 07536451eee9ecb9d019156272ff139e1ee1284f Mon Sep 17 00:00:00 2001 From: Karl Cobbe Date: Tue, 23 Apr 2019 16:33:30 -0700 Subject: [PATCH 12/14] Procgen fixes (#352) * tweak * documentation * rely on log_comm, remove mpi averaging from wrappers * pass comm for ppo2 initialization * ppo2 logging * experiment tweaks * auto launch tensorboard when using local backend * graph tweaks * pass caller to config * configure logger and tensorboard * make parent dir if necessary * parentdir tweak --- baselines/ppo2/model.py | 2 +- baselines/ppo2/ppo2.py | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/baselines/ppo2/model.py b/baselines/ppo2/model.py index 35a883d..3d56bc9 100644 --- a/baselines/ppo2/model.py +++ b/baselines/ppo2/model.py @@ -128,7 +128,7 @@ class Model(object): initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") if MPI is not None: - sync_from_root(sess, global_variables) #pylint: disable=E1101 + sync_from_root(sess, global_variables, comm=comm) #pylint: disable=E1101 def train(self, lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None): # Here we calculate advantage A(s,a) = R + yV(s') - V(s) diff --git a/baselines/ppo2/ppo2.py b/baselines/ppo2/ppo2.py index f3a69d8..d307e9b 100644 --- a/baselines/ppo2/ppo2.py +++ b/baselines/ppo2/ppo2.py @@ -97,6 +97,7 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2 # Calculate the batch_size nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches + is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0) # Instantiate the model object (that creates act_model and train_model) if model_fn is None: @@ -134,11 +135,16 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2 lrnow = lr(frac) # Calculate the cliprange cliprangenow = 
cliprange(frac) + + if update % log_interval == 0 and is_mpi_root: logger.info('Stepping environment...') + # Get minibatch obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run() #pylint: disable=E0632 if eval_env is not None: eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run() #pylint: disable=E0632 + if update % log_interval == 0 and is_mpi_root: logger.info('Done.') + epinfobuf.extend(epinfos) if eval_env is not None: eval_epinfobuf.extend(eval_epinfos) @@ -202,7 +208,7 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2 logger.logkv('loss/' + lossname, lossval) logger.dumpkvs() - if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and (MPI is None or MPI.COMM_WORLD.Get_rank() == 0): + if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and is_mpi_root: checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i'%update) From 1fa6ac38f1a01a43c7911697712826f29d223df4 Mon Sep 17 00:00:00 2001 From: pzhokhov Date: Wed, 24 Apr 2019 17:04:36 -0700 Subject: [PATCH 13/14] JRL PPO test with delayed identity env (#355) * add a custom delay to identity_env * min reward 0.8 in delayed identity test * seed the tests, perfect score on delayed_identity_test * delay=1 in delayed_identity_test * flake8 complaints * increased number of steps in fixed_seq_test * seed identity tests to ensure reproducibility * docstrings --- baselines/common/tests/envs/identity_env.py | 55 ++++++++++--------- .../common/tests/envs/identity_env_test.py | 36 ++++++++++++ 2 files changed, 66 insertions(+), 25 deletions(-) create mode 100644 baselines/common/tests/envs/identity_env_test.py diff --git a/baselines/common/tests/envs/identity_env.py b/baselines/common/tests/envs/identity_env.py index 79e6c48..fb2dca6 100644 --- a/baselines/common/tests/envs/identity_env.py +++ b/baselines/common/tests/envs/identity_env.py @@ -2,43 +2,45 @@ import numpy as np from abc import abstractmethod from gym import Env from gym.spaces import MultiDiscrete, Discrete, Box - +from collections import deque class IdentityEnv(Env): def __init__( self, - episode_len=None + episode_len=None, + delay=0, + zero_first_rewards=True ): self.observation_space = self.action_space self.episode_len = episode_len self.time = 0 - self.reset() + self.delay = delay + self.zero_first_rewards = zero_first_rewards + self.q = deque(maxlen=delay+1) def reset(self): - self._choose_next_state() + self.q.clear() + for _ in range(self.delay + 1): + self.q.append(self.action_space.sample()) self.time = 0 - return self.state + return self.q[-1] def step(self, actions): - rew = self._get_reward(actions) - self._choose_next_state() - done = False - if self.episode_len and self.time >= self.episode_len: - done = True - - return self.state, rew, done, {} + rew = self._get_reward(self.q.popleft(), actions) + if self.zero_first_rewards and self.time < self.delay: + rew = 0 + self.q.append(self.action_space.sample()) + self.time += 1 + done = self.episode_len is not None and self.time >= self.episode_len + return self.q[-1], rew, done, {} def seed(self, seed=None): self.action_space.seed(seed) - def _choose_next_state(self): - self.state = self.action_space.sample() - self.time += 1 - @abstractmethod - def _get_reward(self, actions): + def _get_reward(self, state, actions): raise NotImplementedError @@ -47,26 +49,29 
@@ class DiscreteIdentityEnv(IdentityEnv): self, dim, episode_len=None, + delay=0, + zero_first_rewards=True ): self.action_space = Discrete(dim) - super().__init__(episode_len=episode_len) + super().__init__(episode_len=episode_len, delay=delay, zero_first_rewards=zero_first_rewards) - def _get_reward(self, actions): - return 1 if self.state == actions else 0 + def _get_reward(self, state, actions): + return 1 if state == actions else 0 class MultiDiscreteIdentityEnv(IdentityEnv): def __init__( self, dims, episode_len=None, + delay=0, ): self.action_space = MultiDiscrete(dims) - super().__init__(episode_len=episode_len) + super().__init__(episode_len=episode_len, delay=delay) - def _get_reward(self, actions): - return 1 if all(self.state == actions) else 0 + def _get_reward(self, state, actions): + return 1 if all(state == actions) else 0 class BoxIdentityEnv(IdentityEnv): @@ -79,7 +84,7 @@ class BoxIdentityEnv(IdentityEnv): self.action_space = Box(low=-1.0, high=1.0, shape=shape, dtype=np.float32) super().__init__(episode_len=episode_len) - def _get_reward(self, actions): - diff = actions - self.state + def _get_reward(self, state, actions): + diff = actions - state diff = diff[:] return -0.5 * np.dot(diff, diff) diff --git a/baselines/common/tests/envs/identity_env_test.py b/baselines/common/tests/envs/identity_env_test.py new file mode 100644 index 0000000..c73ee57 --- /dev/null +++ b/baselines/common/tests/envs/identity_env_test.py @@ -0,0 +1,36 @@ +from baselines.common.tests.envs.identity_env import DiscreteIdentityEnv + + +def test_discrete_nodelay(): + nsteps = 100 + eplen = 50 + env = DiscreteIdentityEnv(10, episode_len=eplen) + ob = env.reset() + for t in range(nsteps): + action = env.action_space.sample() + next_ob, rew, done, info = env.step(action) + assert rew == (1 if action == ob else 0) + if (t + 1) % eplen == 0: + assert done + next_ob = env.reset() + else: + assert not done + ob = next_ob + +def test_discrete_delay1(): + eplen = 50 + env = DiscreteIdentityEnv(10, episode_len=eplen, delay=1) + ob = env.reset() + prev_ob = None + for t in range(eplen): + action = env.action_space.sample() + next_ob, rew, done, info = env.step(action) + if t > 0: + assert rew == (1 if action == prev_ob else 0) + else: + assert rew == 0 + prev_ob = ob + ob = next_ob + if t < eplen - 1: + assert not done + assert done From ef7ac116cb7a75ce7bf44c421aea499fc1945697 Mon Sep 17 00:00:00 2001 From: John Schulman Date: Fri, 26 Apr 2019 17:42:43 -0700 Subject: [PATCH 14/14] (onp, np) -> (np, jp), switch jax code to use mark_slow decorator (#363) switch to mark_slow decorator --- .travis.yml | 2 +- baselines/common/tests/__init__.py | 2 ++ baselines/common/tests/test_cartpole.py | 3 ++- baselines/common/tests/test_fetchreach.py | 3 ++- baselines/common/tests/test_fixed_sequence.py | 4 +++- baselines/common/tests/test_identity.py | 7 ++++--- baselines/common/tests/test_mnist.py | 4 ++-- 7 files changed, 16 insertions(+), 9 deletions(-) diff --git a/.travis.yml b/.travis.yml index 712fc84..c68bfc1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,4 +11,4 @@ install: script: - flake8 . --show-source --statistics - - docker run baselines-test pytest -v . + - docker run -e RUNSLOW=1 baselines-test pytest -v . 
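The .travis.yml hunk above passes RUNSLOW=1 into the CI container, and the new baselines/common/tests/__init__.py below defines the mark_slow decorator that the test modules switch to. A minimal sketch of the same gating pattern, assuming only what this patch adds (the example test function is hypothetical and is shown purely to illustrate usage):

import os
import pytest

# Skip the decorated test unless the RUNSLOW environment variable is set
# (same line this patch adds to baselines/common/tests/__init__.py).
mark_slow = pytest.mark.skipif(not os.getenv('RUNSLOW'), reason='slow')

@mark_slow
def test_hypothetical_slow_case():
    # Stand-in body; the patched tests below gate full learn-function smoke runs this way.
    assert sum(range(4)) == 6

With the gate in place, the slow tests run locally via RUNSLOW=1 pytest -v . and are skipped otherwise.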
diff --git a/baselines/common/tests/__init__.py b/baselines/common/tests/__init__.py index e69de29..a6561a2 100644 --- a/baselines/common/tests/__init__.py +++ b/baselines/common/tests/__init__.py @@ -0,0 +1,2 @@ +import os, pytest +mark_slow = pytest.mark.skipif(not os.getenv('RUNSLOW'), reason='slow') \ No newline at end of file diff --git a/baselines/common/tests/test_cartpole.py b/baselines/common/tests/test_cartpole.py index 475ad1d..f9d5ac6 100644 --- a/baselines/common/tests/test_cartpole.py +++ b/baselines/common/tests/test_cartpole.py @@ -3,6 +3,7 @@ import gym from baselines.run import get_learn_function from baselines.common.tests.util import reward_per_episode_test +from baselines.common.tests import mark_slow common_kwargs = dict( total_timesteps=30000, @@ -20,7 +21,7 @@ learn_kwargs = { 'trpo_mpi': {} } -@pytest.mark.slow +@mark_slow @pytest.mark.parametrize("alg", learn_kwargs.keys()) def test_cartpole(alg): ''' diff --git a/baselines/common/tests/test_fetchreach.py b/baselines/common/tests/test_fetchreach.py index be73663..8bcd32b 100644 --- a/baselines/common/tests/test_fetchreach.py +++ b/baselines/common/tests/test_fetchreach.py @@ -3,6 +3,7 @@ import gym from baselines.run import get_learn_function from baselines.common.tests.util import reward_per_episode_test +from baselines.common.tests import mark_slow pytest.importorskip('mujoco_py') @@ -15,7 +16,7 @@ learn_kwargs = { 'her': dict(total_timesteps=2000) } -@pytest.mark.slow +@mark_slow @pytest.mark.parametrize("alg", learn_kwargs.keys()) def test_fetchreach(alg): ''' diff --git a/baselines/common/tests/test_fixed_sequence.py b/baselines/common/tests/test_fixed_sequence.py index 061c375..68ee8d3 100644 --- a/baselines/common/tests/test_fixed_sequence.py +++ b/baselines/common/tests/test_fixed_sequence.py @@ -3,6 +3,8 @@ from baselines.common.tests.envs.fixed_sequence_env import FixedSequenceEnv from baselines.common.tests.util import simple_test from baselines.run import get_learn_function +from baselines.common.tests import mark_slow + common_kwargs = dict( seed=0, @@ -21,7 +23,7 @@ learn_kwargs = { alg_list = learn_kwargs.keys() rnn_list = ['lstm'] -@pytest.mark.slow +@mark_slow @pytest.mark.parametrize("alg", alg_list) @pytest.mark.parametrize("rnn", rnn_list) def test_fixed_sequence(alg, rnn): diff --git a/baselines/common/tests/test_identity.py b/baselines/common/tests/test_identity.py index c950e5a..6b66a66 100644 --- a/baselines/common/tests/test_identity.py +++ b/baselines/common/tests/test_identity.py @@ -2,6 +2,7 @@ import pytest from baselines.common.tests.envs.identity_env import DiscreteIdentityEnv, BoxIdentityEnv, MultiDiscreteIdentityEnv from baselines.run import get_learn_function from baselines.common.tests.util import simple_test +from baselines.common.tests import mark_slow common_kwargs = dict( total_timesteps=30000, @@ -24,7 +25,7 @@ algos_disc = ['a2c', 'acktr', 'deepq', 'ppo2', 'trpo_mpi'] algos_multidisc = ['a2c', 'acktr', 'ppo2', 'trpo_mpi'] algos_cont = ['a2c', 'acktr', 'ddpg', 'ppo2', 'trpo_mpi'] -@pytest.mark.slow +@mark_slow @pytest.mark.parametrize("alg", algos_disc) def test_discrete_identity(alg): ''' @@ -39,7 +40,7 @@ def test_discrete_identity(alg): env_fn = lambda: DiscreteIdentityEnv(10, episode_len=100) simple_test(env_fn, learn_fn, 0.9) -@pytest.mark.slow +@mark_slow @pytest.mark.parametrize("alg", algos_multidisc) def test_multidiscrete_identity(alg): ''' @@ -54,7 +55,7 @@ def test_multidiscrete_identity(alg): env_fn = lambda: MultiDiscreteIdentityEnv((3,3), episode_len=100) 
     simple_test(env_fn, learn_fn, 0.9)
 
-@pytest.mark.slow
+@mark_slow
 @pytest.mark.parametrize("alg", algos_cont)
 def test_continuous_identity(alg):
     '''
diff --git a/baselines/common/tests/test_mnist.py b/baselines/common/tests/test_mnist.py
index bacc914..06a4e2b 100644
--- a/baselines/common/tests/test_mnist.py
+++ b/baselines/common/tests/test_mnist.py
@@ -4,7 +4,7 @@ import pytest
 from baselines.common.tests.envs.mnist_env import MnistEnv
 from baselines.common.tests.util import simple_test
 from baselines.run import get_learn_function
-
+from baselines.common.tests import mark_slow
 
 # TODO investigate a2c and ppo2 failures - is it due to bad hyperparameters for this problem?
 # GitHub issue https://github.com/openai/baselines/issues/189
@@ -28,7 +28,7 @@ learn_args = {
 #tests pass, but are too slow on travis. Same algorithms are covered
 # by other tests with less compute-hungry nn's and by benchmarks
 @pytest.mark.skip
-@pytest.mark.slow
+@mark_slow
 @pytest.mark.parametrize("alg", learn_args.keys())
 def test_mnist(alg):
     '''