From bb403781182c6e31d3bf5de16f42b0cb0d8421f7 Mon Sep 17 00:00:00 2001 From: John Schulman Date: Wed, 25 Oct 2017 09:21:29 -0400 Subject: [PATCH] change atari preprocessing to use faster opencv some logger changes --- .gitignore | 3 + README.md | 11 + baselines/a2c/policies.py | 5 +- baselines/a2c/run_atari.py | 19 +- baselines/acktr/acktr_cont.py | 6 +- baselines/acktr/acktr_disc.py | 11 +- baselines/acktr/policies.py | 3 - baselines/acktr/run_atari.py | 20 +- baselines/acktr/run_mujoco.py | 9 +- baselines/acktr/value_functions.py | 4 +- baselines/bench/__init__.py | 2 +- baselines/bench/benchmarks.py | 134 +++++----- baselines/bench/monitor.py | 140 +++++----- baselines/common/atari_wrappers.py | 129 ++++++---- baselines/common/atari_wrappers_deprecated.py | 239 ------------------ baselines/common/azure_utils.py | 7 +- baselines/common/distributions.py | 1 - baselines/common/misc_util.py | 71 ------ baselines/common/tf_util.py | 68 ----- baselines/common/vec_env/subproc_vec_env.py | 28 +- baselines/ddpg/main.py | 19 +- baselines/ddpg/util.py | 3 - baselines/deepq/__init__.py | 5 +- baselines/deepq/experiments/atari/enjoy.py | 4 +- baselines/deepq/experiments/atari/train.py | 6 +- .../deepq/experiments/atari/wang2015_eval.py | 12 +- baselines/deepq/experiments/enjoy_pong.py | 4 +- baselines/deepq/experiments/run_atari.py | 47 ++++ baselines/deepq/experiments/train_pong.py | 34 --- baselines/deepq/simple.py | 50 ++-- baselines/logger.py | 226 ++++++++++++----- baselines/ppo1/cnn_policy.py | 3 +- baselines/ppo1/mlp_policy.py | 6 +- baselines/ppo1/pposgd_simple.py | 4 +- baselines/ppo1/run_atari.py | 30 +-- baselines/ppo1/run_mujoco.py | 13 +- baselines/trpo_mpi/nosharing_cnn_policy.py | 5 +- baselines/trpo_mpi/run_atari.py | 32 +-- baselines/trpo_mpi/run_mujoco.py | 8 +- baselines/trpo_mpi/trpo_mpi.py | 2 +- 40 files changed, 600 insertions(+), 823 deletions(-) delete mode 100644 baselines/common/atari_wrappers_deprecated.py create mode 100644 baselines/deepq/experiments/run_atari.py delete mode 100644 baselines/deepq/experiments/train_pong.py diff --git a/.gitignore b/.gitignore index 8b8b5de..ba043ee 100644 --- a/.gitignore +++ b/.gitignore @@ -30,3 +30,6 @@ src *.egg-info .cache + +MUJOCO_LOG.TXT + diff --git a/README.md b/README.md index e041600..a1aac33 100644 --- a/README.md +++ b/README.md @@ -20,3 +20,14 @@ pip install -e . 
- [DQN](baselines/deepq) - [PPO](baselines/ppo1) - [TRPO](baselines/trpo_mpi) + +To cite this repository in publications: + + @misc{baselines, + author = {Hesse, Christopher and Plappert, Matthias and Radford, Alec and Schulman, John and Sidor, Szymon and Wu, Yuhuai}, + title = {OpenAI Baselines}, + year = {2017}, + publisher = {GitHub}, + journal = {GitHub repository}, + howpublished = {\url{https://github.com/openai/baselines}}, + } diff --git a/baselines/a2c/policies.py b/baselines/a2c/policies.py index d58a32b..4c37df7 100644 --- a/baselines/a2c/policies.py +++ b/baselines/a2c/policies.py @@ -1,9 +1,6 @@ import numpy as np import tensorflow as tf -from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm, sample, check_shape -from baselines.common.distributions import make_pdtype -import baselines.common.tf_util as U -import gym +from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm, sample class LnLstmPolicy(object): def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=256, reuse=False): diff --git a/baselines/a2c/run_atari.py b/baselines/a2c/run_atari.py index e277ebe..ed20f1c 100644 --- a/baselines/a2c/run_atari.py +++ b/baselines/a2c/run_atari.py @@ -5,18 +5,15 @@ from baselines.common import set_global_seeds from baselines import bench from baselines.a2c.a2c import learn from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv -from baselines.common.atari_wrappers import wrap_deepmind +from baselines.common.atari_wrappers import make_atari, wrap_deepmind from baselines.a2c.policies import CnnPolicy, LstmPolicy, LnLstmPolicy -def train(env_id, num_frames, seed, policy, lrschedule, num_cpu): - num_timesteps = int(num_frames / 4 * 1.1) - # divide by 4 due to frameskip, then do a little extras so episodes end +def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu): def make_env(rank): def _thunk(): - env = gym.make(env_id) + env = make_atari(env_id) env.seed(seed + rank) - env = bench.Monitor(env, logger.get_dir() and - os.path.join(logger.get_dir(), "{}.monitor.json".format(rank))) + env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) gym.logger.setLevel(logging.WARN) return wrap_deepmind(env) return _thunk @@ -28,7 +25,7 @@ def train(env_id, num_frames, seed, policy, lrschedule, num_cpu): policy_fn = LstmPolicy elif policy == 'lnlstm': policy_fn = LnLstmPolicy - learn(policy_fn, env, seed, total_timesteps=num_timesteps, lrschedule=lrschedule) + learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule) env.close() def main(): @@ -38,10 +35,10 @@ def main(): parser.add_argument('--seed', help='RNG seed', type=int, default=0) parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm'], default='cnn') parser.add_argument('--lrschedule', help='Learning rate schedule', choices=['constant', 'linear'], default='constant') - parser.add_argument('--million_frames', help='How many frames to train (/ 1e6). 
' - 'This number gets divided by 4 due to frameskip', type=int, default=40) + parser.add_argument('--num-timesteps', type=int, default=int(10e6)) args = parser.parse_args() - train(args.env, num_frames=1e6 * args.million_frames, seed=args.seed, + logger.configure() + train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, policy=args.policy, lrschedule=args.lrschedule, num_cpu=16) if __name__ == '__main__': diff --git a/baselines/acktr/acktr_cont.py b/baselines/acktr/acktr_cont.py index 22cfbb1..93fb164 100644 --- a/baselines/acktr/acktr_cont.py +++ b/baselines/acktr/acktr_cont.py @@ -110,7 +110,6 @@ def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps, ob_no = np.concatenate([path["observation"] for path in paths]) action_na = np.concatenate([path["action"] for path in paths]) oldac_dist = np.concatenate([path["action_dist"] for path in paths]) - logp_n = np.concatenate([path["logp"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) @@ -126,7 +125,7 @@ def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps, U.eval(tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5))) elif kl < desired_kl / 2: logger.log("kl too low") - U.eval(tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5))) + U.eval(tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5))) else: logger.log("kl just right!") @@ -138,3 +137,6 @@ def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps, callback() logger.dump_tabular() i += 1 + + coord.request_stop() + coord.join(enqueue_threads) diff --git a/baselines/acktr/acktr_disc.py b/baselines/acktr/acktr_disc.py index feb702c..56e0f03 100644 --- a/baselines/acktr/acktr_disc.py +++ b/baselines/acktr/acktr_disc.py @@ -113,7 +113,6 @@ class Runner(object): nenv = env.num_envs self.batch_ob_shape = (nenv*nsteps, nh, nw, nc*nstack) self.obs = np.zeros((nenv, nh, nw, nc*nstack), dtype=np.uint8) - self.nc = nc obs = env.reset() self.update_obs(obs) self.gamma = gamma @@ -122,8 +121,8 @@ class Runner(object): self.dones = [False for _ in range(nenv)] def update_obs(self, obs): - self.obs = np.roll(self.obs, shift=-self.nc, axis=3) - self.obs[:, :, :, -self.nc:] = obs + self.obs = np.roll(self.obs, shift=-1, axis=3) + self.obs[:, :, :, -1] = obs[:, :, :, 0] def run(self): mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[] @@ -189,7 +188,8 @@ def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval runner = Runner(env, model, nsteps=nsteps, nstack=nstack, gamma=gamma) nbatch = nenvs*nsteps tstart = time.time() - enqueue_threads = model.q_runner.create_threads(model.sess, coord=tf.train.Coordinator(), start=True) + coord = tf.train.Coordinator() + enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True) for update in range(1, total_timesteps//nbatch+1): obs, states, rewards, masks, actions, values = runner.run() policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values) @@ -211,5 +211,6 @@ def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval savepath = osp.join(logger.get_dir(), 'checkpoint%.5i'%update) print('Saving to', savepath) model.save(savepath) - + coord.request_stop() + coord.join(enqueue_threads) env.close() diff --git a/baselines/acktr/policies.py b/baselines/acktr/policies.py index 13f722c..47965a5 100644 --- a/baselines/acktr/policies.py +++ b/baselines/acktr/policies.py @@ -1,9 +1,7 @@ 
import numpy as np import tensorflow as tf from baselines.acktr.utils import conv, fc, dense, conv_to_fc, sample, kl_div -from baselines.common.distributions import make_pdtype import baselines.common.tf_util as U -import gym class CnnPolicy(object): @@ -51,7 +49,6 @@ class GaussianMlpPolicy(object): oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac") # batch of actions previous actions oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim*2], name="oldac_dist") # batch of actions previous action distributions adv_n = tf.placeholder(tf.float32, shape=[None], name="adv") # advantage function estimate - oldlogprob_n = tf.placeholder(tf.float32, shape=[None], name='oldlogprob') # log probability of previous actions wd_dict = {} h1 = tf.nn.tanh(dense(ob_no, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict)) h2 = tf.nn.tanh(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict)) diff --git a/baselines/acktr/run_atari.py b/baselines/acktr/run_atari.py index d295ba8..11b9461 100644 --- a/baselines/acktr/run_atari.py +++ b/baselines/acktr/run_atari.py @@ -5,24 +5,22 @@ from baselines.common import set_global_seeds from baselines import bench from baselines.acktr.acktr_disc import learn from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv -from baselines.common.atari_wrappers import wrap_deepmind +from baselines.common.atari_wrappers import make_atari, wrap_deepmind from baselines.acktr.policies import CnnPolicy -def train(env_id, num_frames, seed, num_cpu): - num_timesteps = int(num_frames / 4 * 1.1) +def train(env_id, num_timesteps, seed, num_cpu): def make_env(rank): def _thunk(): - env = gym.make(env_id) + env = make_atari(env_id) env.seed(seed + rank) - if logger.get_dir(): - env = bench.Monitor(env, os.path.join(logger.get_dir(), "{}.monitor.json".format(rank))) + env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) gym.logger.setLevel(logging.WARN) return wrap_deepmind(env) return _thunk set_global_seeds(seed) env = SubprocVecEnv([make_env(i) for i in range(num_cpu)]) policy_fn = CnnPolicy - learn(policy_fn, env, seed, total_timesteps=num_timesteps, nprocs=num_cpu) + learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu) env.close() def main(): @@ -30,10 +28,10 @@ def main(): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4') parser.add_argument('--seed', help='RNG seed', type=int, default=0) - parser.add_argument('--million_frames', help='How many frames to train (/ 1e6). 
' - 'This number gets divided by 4 due to frameskip', type=int, default=40) - args = parser.parse_args() - train(args.env, num_frames=1e6 * args.million_frames, seed=args.seed, num_cpu=32) + parser.add_argument('--num-timesteps', type=int, default=int(10e6)) + args = parser.parse_args() + logger.configure() + train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, num_cpu=32) if __name__ == '__main__': diff --git a/baselines/acktr/run_mujoco.py b/baselines/acktr/run_mujoco.py index 70859c9..3e16309 100644 --- a/baselines/acktr/run_mujoco.py +++ b/baselines/acktr/run_mujoco.py @@ -13,13 +13,12 @@ from baselines.acktr.value_functions import NeuralNetValueFunction def train(env_id, num_timesteps, seed): env=gym.make(env_id) - if logger.get_dir(): - env = bench.Monitor(env, os.path.join(logger.get_dir(), "monitor.json")) + env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) set_global_seeds(seed) env.seed(seed) gym.logger.setLevel(logging.WARN) - with tf.Session(config=tf.ConfigProto()) as session: + with tf.Session(config=tf.ConfigProto()): ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.shape[0] with tf.variable_scope("vf"): @@ -38,5 +37,7 @@ if __name__ == "__main__": parser = argparse.ArgumentParser(description='Run Mujoco benchmark.') parser.add_argument('--seed', help='RNG seed', type=int, default=0) parser.add_argument('--env', help='environment ID', type=str, default="Reacher-v1") + parser.add_argument('--num-timesteps', type=int, default=int(1e6)) args = parser.parse_args() - train(args.env, num_timesteps=1e6, seed=args.seed) + logger.configure() + train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) diff --git a/baselines/acktr/value_functions.py b/baselines/acktr/value_functions.py index f4fa4a6..5151cfc 100644 --- a/baselines/acktr/value_functions.py +++ b/baselines/acktr/value_functions.py @@ -13,7 +13,7 @@ class NeuralNetValueFunction(object): wd_dict = {} h1 = tf.nn.elu(dense(X, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)) h2 = tf.nn.elu(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)) - vpred_n = dense(h2, 1, "hfinal", weight_init=None, bias_init=0, weight_loss_dict=wd_dict)[:,0] + vpred_n = dense(h2, 1, "hfinal", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)[:,0] sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n)) wd_loss = tf.get_collection("vf_losses", None) loss = U.mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss) @@ -22,7 +22,7 @@ class NeuralNetValueFunction(object): optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9, \ clip_kl=0.3, epsilon=0.1, stats_decay=0.95, \ async=1, kfac_update=2, cold_iter=50, \ - weight_decay_dict=wd_dict, max_grad_norm=1.0) + weight_decay_dict=wd_dict, max_grad_norm=None) vf_var_list = [] for var in tf.trainable_variables(): if "vf" in var.name: diff --git a/baselines/bench/__init__.py b/baselines/bench/__init__.py index 6947a83..3503929 100644 --- a/baselines/bench/__init__.py +++ b/baselines/bench/__init__.py @@ -1,3 +1,3 @@ from baselines.bench.benchmarks import * from baselines.bench.monitor import * - +from baselines.bench.simple_bench import simple_bench diff --git a/baselines/bench/benchmarks.py b/baselines/bench/benchmarks.py index 6b8f52a..dd49757 100644 --- a/baselines/bench/benchmarks.py +++ b/baselines/bench/benchmarks.py @@ -1,61 +1,71 @@ +import os.path as osp + _atari7 = 
['BeamRider', 'Breakout', 'Enduro', 'Pong', 'Qbert', 'Seaquest', 'SpaceInvaders'] _atariexpl7 = ['Freeway', 'Gravitar', 'MontezumaRevenge', 'Pitfall', 'PrivateEye', 'Solaris', 'Venture'] _BENCHMARKS = [] + def register_benchmark(benchmark): for b in _BENCHMARKS: if b['name'] == benchmark['name']: - raise ValueError('Benchmark with name %s already registered!'%b['name']) + raise ValueError('Benchmark with name %s already registered!' % b['name']) _BENCHMARKS.append(benchmark) + def list_benchmarks(): return [b['name'] for b in _BENCHMARKS] + def get_benchmark(benchmark_name): for b in _BENCHMARKS: if b['name'] == benchmark_name: return b raise ValueError('%s not found! Known benchmarks: %s' % (benchmark_name, list_benchmarks())) + def get_task(benchmark, env_id): """Get a task by env_id. Return None if the benchmark doesn't have the env""" return next(filter(lambda task: task['env_id'] == env_id, benchmark['tasks']), None) + def find_task_for_env_id_in_any_benchmark(env_id): for bm in _BENCHMARKS: for task in bm["tasks"]: - if task["env_id"]==env_id: + if task["env_id"] == env_id: return bm, task return None, None + _ATARI_SUFFIX = 'NoFrameskip-v4' register_benchmark({ - 'name' : 'Atari200M', - 'description' :'7 Atari games from Mnih et al. (2013), with pixel observations, 200M frames', - 'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 2, 'num_timesteps' : int(200e6)} for _game in _atari7] + 'name': 'Atari50M', + 'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 50M timesteps', + 'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(50e6)} for _game in _atari7] }) register_benchmark({ - 'name' : 'Atari40M', - 'description' :'7 Atari games from Mnih et al. (2013), with pixel observations, 40M frames', - 'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 2, 'num_timesteps' : int(40e6)} for _game in _atari7] + 'name': 'Atari10M', + 'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 10M timesteps', + 'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atari7] }) register_benchmark({ - 'name' : 'Atari1Hr', - 'description' :'7 Atari games from Mnih et al. (2013), with pixel observations, 1 hour of walltime', - 'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 2, 'num_seconds' : 60*60} for _game in _atari7] + 'name': 'Atari1Hr', + 'description': '7 Atari games from Mnih et al. 
(2013), with pixel observations, 1 hour of walltime', + 'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_seconds': 60 * 60} for _game in _atari7] }) register_benchmark({ - 'name' : 'AtariExploration40M', - 'description' :'7 Atari games emphasizing exploration, with pixel observations, 40M frames', - 'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 2, 'num_timesteps' : int(40e6)} for _game in _atariexpl7] + 'name': 'AtariExploration10M', + 'description': '7 Atari games emphasizing exploration, with pixel observations, 10M timesteps', + 'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atariexpl7] }) + + # MuJoCo _mujocosmall = [ @@ -63,78 +73,60 @@ _mujocosmall = [ 'HalfCheetah-v1', 'Hopper-v1', 'Walker2d-v1', 'Reacher-v1', 'Swimmer-v1'] register_benchmark({ - 'name' : 'Mujoco1M', - 'description' : 'Some small 2D MuJoCo tasks, run for 1M timesteps', - 'tasks' : [{'env_id' : _envid, 'trials' : 3, 'num_timesteps' : int(1e6)} for _envid in _mujocosmall] + 'name': 'Mujoco1M', + 'description': 'Some small 2D MuJoCo tasks, run for 1M timesteps', + 'tasks': [{'env_id': _envid, 'trials': 3, 'num_timesteps': int(1e6)} for _envid in _mujocosmall] }) register_benchmark({ - 'name' : 'MujocoWalkers', - 'description' : 'MuJoCo forward walkers, run for 8M, humanoid 100M', - 'tasks' : [ - {'env_id' : "Hopper-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 }, - {'env_id' : "Walker2d-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 }, - {'env_id' : "Humanoid-v1", 'trials' : 4, 'num_timesteps' : 100*1000000 }, + 'name': 'MujocoWalkers', + 'description': 'MuJoCo forward walkers, run for 8M, humanoid 100M', + 'tasks': [ + {'env_id': "Hopper-v1", 'trials': 4, 'num_timesteps': 8 * 1000000}, + {'env_id': "Walker2d-v1", 'trials': 4, 'num_timesteps': 8 * 1000000}, + {'env_id': "Humanoid-v1", 'trials': 4, 'num_timesteps': 100 * 1000000}, ] }) -# To reproduce: -# python3 baselines/baselines/ppo2/ppo2_run_benchmark.py gce MujocoWalkers myrun_ppo2_whiteobs1_cpu8 -# (observation input filters necessary) - # Roboschool register_benchmark({ - 'name' : 'Roboschool8M', - 'description' : 'Small 2D tasks, up to 30 minutes to complete on 8 cores', - 'tasks' : [ - {'env_id' : "RoboschoolReacher-v1", 'trials' : 4, 'num_timesteps' : 2*1000000 }, - {'env_id' : "RoboschoolAnt-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 }, - {'env_id' : "RoboschoolHalfCheetah-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 }, - {'env_id' : "RoboschoolHopper-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 }, - {'env_id' : "RoboschoolWalker2d-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 }, - ] + 'name': 'Roboschool8M', + 'description': 'Small 2D tasks, up to 30 minutes to complete on 8 cores', + 'tasks': [ + {'env_id': "RoboschoolReacher-v1", 'trials': 4, 'num_timesteps': 2 * 1000000}, + {'env_id': "RoboschoolAnt-v1", 'trials': 4, 'num_timesteps': 8 * 1000000}, + {'env_id': "RoboschoolHalfCheetah-v1", 'trials': 4, 'num_timesteps': 8 * 1000000}, + {'env_id': "RoboschoolHopper-v1", 'trials': 4, 'num_timesteps': 8 * 1000000}, + {'env_id': "RoboschoolWalker2d-v1", 'trials': 4, 'num_timesteps': 8 * 1000000}, + ] }) register_benchmark({ - 'name' : 'RoboschoolHarder', - 'description' : 'Test your might!!! 
Up to 12 hours on 32 cores', - 'tasks' : [ - {'env_id' : "RoboschoolHumanoid-v1", 'trials' : 4, 'num_timesteps' : 100*1000000 }, - {'env_id' : "RoboschoolHumanoidFlagrun-v1", 'trials' : 4, 'num_timesteps' : 200*1000000 }, - {'env_id' : "RoboschoolHumanoidFlagrunHarder-v1", 'trials' : 4, 'num_timesteps' : 400*1000000 }, - ] + 'name': 'RoboschoolHarder', + 'description': 'Test your might!!! Up to 12 hours on 32 cores', + 'tasks': [ + {'env_id': "RoboschoolHumanoid-v1", 'trials': 4, 'num_timesteps': 100 * 1000000}, + {'env_id': "RoboschoolHumanoidFlagrun-v1", 'trials': 4, 'num_timesteps': 200 * 1000000}, + {'env_id': "RoboschoolHumanoidFlagrunHarder-v1", 'trials': 4, 'num_timesteps': 400 * 1000000}, + ] }) -# To reproduce: -# python3 baselines/baselines/ppo2/ppo2_run_benchmark.py gce Roboschool8M myrun_ppo2_cpu8 -# python3 baselines/baselines/ppo2/ppo2_run_benchmark.py gce RoboschoolHarder myrun_ppo2_cpu32_large_samples65536 -# (Large network, train on 65536 samples each iteration. Also, _large is really necessary only for Harder) - # Other -_atari50 = [ # actually 49 - 'Alien', 'Amidar', 'Assault', 'Asterix', 'Asteroids', - 'Atlantis', 'BankHeist', 'BattleZone', 'BeamRider', 'Bowling', - 'Boxing', 'Breakout', 'Centipede', 'ChopperCommand', 'CrazyClimber', - 'DemonAttack', 'DoubleDunk', 'Enduro', 'FishingDerby', 'Freeway', - 'Frostbite', 'Gopher', 'Gravitar', 'IceHockey', 'Jamesbond', - 'Kangaroo', 'Krull', 'KungFuMaster', 'MontezumaRevenge', 'MsPacman', - 'NameThisGame', 'Pitfall', 'Pong', 'PrivateEye', 'Qbert', - 'Riverraid', 'RoadRunner', 'Robotank', 'Seaquest', 'SpaceInvaders', - 'StarGunner', 'Tennis', 'TimePilot', 'Tutankham', 'UpNDown', - 'Venture', 'VideoPinball', 'WizardOfWor', 'Zaxxon', +_atari50 = [ # actually 47 + 'Alien', 'Amidar', 'Assault', 'Asterix', 'Asteroids', + 'Atlantis', 'BankHeist', 'BattleZone', 'BeamRider', 'Bowling', + 'Breakout', 'Centipede', 'ChopperCommand', 'CrazyClimber', + 'DemonAttack', 'DoubleDunk', 'Enduro', 'FishingDerby', 'Freeway', + 'Frostbite', 'Gopher', 'Gravitar', 'IceHockey', 'Jamesbond', + 'Kangaroo', 'Krull', 'KungFuMaster', 'MontezumaRevenge', 'MsPacman', + 'NameThisGame', 'Pitfall', 'Pong', 'PrivateEye', 'Qbert', + 'RoadRunner', 'Robotank', 'Seaquest', 'SpaceInvaders', 'StarGunner', + 'Tennis', 'TimePilot', 'Tutankham', 'UpNDown', 'Venture', + 'VideoPinball', 'WizardOfWor', 'Zaxxon', ] register_benchmark({ - 'name' : 'Atari50_40M', - 'description' :'7 Atari games from Mnih et al. (2013), with pixel observations, 40M frames', - 'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 3, 'num_timesteps' : int(40e6)} for _game in _atari50] + 'name': 'Atari50_10M', + 'description': '47 Atari games from Mnih et al. 
(2013), with pixel observations, 10M timesteps', + 'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 3, 'num_timesteps': int(10e6)} for _game in _atari50] }) - -def env_shortname(s): - "Make typical names above shorter, while keeping recognizable" - s = s.replace("NoFrameskip", "") - if s[:10]=="Roboschool": s = s[10:] - i = s.rfind("-v") - if i!=-1: s = s[:i] - - return s.lower() diff --git a/baselines/bench/monitor.py b/baselines/bench/monitor.py index 9118acf..b2a9d75 100644 --- a/baselines/bench/monitor.py +++ b/baselines/bench/monitor.py @@ -2,20 +2,17 @@ __all__ = ['Monitor', 'get_monitor_files', 'load_results'] import gym from gym.core import Wrapper -from os import path import time from glob import glob - -try: - import ujson as json # Not necessary for monitor writing, but very useful for monitor loading -except ImportError: - import json +import csv +import os.path as osp +import json class Monitor(Wrapper): - EXT = "monitor.json" + EXT = "monitor.csv" f = None - def __init__(self, env, filename, allow_early_resets=False): + def __init__(self, env, filename, allow_early_resets=False, reset_keywords=()): Wrapper.__init__(self, env=env) self.tstart = time.time() if filename is None: @@ -23,50 +20,38 @@ class Monitor(Wrapper): self.logger = None else: if not filename.endswith(Monitor.EXT): - filename = filename + "." + Monitor.EXT + if osp.isdir(filename): + filename = osp.join(filename, Monitor.EXT) + else: + filename = filename + "." + Monitor.EXT self.f = open(filename, "wt") - self.logger = JSONLogger(self.f) - self.logger.writekvs({"t_start": self.tstart, "gym_version": gym.__version__, - "env_id": env.spec.id if env.spec else 'Unknown'}) + self.f.write('#%s\n'%json.dumps({"t_start": self.tstart, "gym_version": gym.__version__, + "env_id": env.spec.id if env.spec else 'Unknown'})) + self.logger = csv.DictWriter(self.f, fieldnames=('r', 'l', 't')+reset_keywords) + self.logger.writeheader() + + self.reset_keywords = reset_keywords self.allow_early_resets = allow_early_resets self.rewards = None self.needs_reset = True self.episode_rewards = [] self.episode_lengths = [] self.total_steps = 0 - self.current_metadata = {} # extra info that gets injected into each log entry - # Useful for metalearning where we're modifying the environment externally - # But want our logs to know about these modifications + self.current_reset_info = {} # extra info about the current episode, that was passed in during reset() - def __getstate__(self): # XXX - d = self.__dict__.copy() - if self.f: - del d['f'], d['logger'] - d['_filename'] = self.f.name - d['_num_episodes'] = len(self.episode_rewards) - else: - d['_filename'] = None - return d - def __setstate__(self, d): - filename = d.pop('_filename') - self.__dict__ = d - if filename is not None: - nlines = d.pop('_num_episodes') + 1 - self.f = open(filename, "r+t") - for _ in range(nlines): - self.f.readline() - self.f.truncate() - self.logger = JSONLogger(self.f) - - - def reset(self): + def _reset(self, **kwargs): if not self.allow_early_resets and not self.needs_reset: raise RuntimeError("Tried to reset an environment before done. 
If you want to allow early resets, wrap your env with Monitor(env, path, allow_early_resets=True)") self.rewards = [] self.needs_reset = False - return self.env.reset() + for k in self.reset_keywords: + v = kwargs.get(k) + if v is None: + raise ValueError('Expected you to pass kwarg %s into reset'%k) + self.current_reset_info[k] = v + return self.env.reset(**kwargs) - def step(self, action): + def _step(self, action): if self.needs_reset: raise RuntimeError("Tried to step environment that needs reset") ob, rew, done, info = self.env.step(action) @@ -75,10 +60,11 @@ class Monitor(Wrapper): self.needs_reset = True eprew = sum(self.rewards) eplen = len(self.rewards) - epinfo = {"r": eprew, "l": eplen, "t": round(time.time() - self.tstart, 6)} - epinfo.update(self.current_metadata) + epinfo = {"r": round(eprew, 6), "l": eplen, "t": round(time.time() - self.tstart, 6)} + epinfo.update(self.current_reset_info) if self.logger: - self.logger.writekvs(epinfo) + self.logger.writerow(epinfo) + self.f.flush() self.episode_rewards.append(eprew) self.episode_lengths.append(eplen) info['episode'] = epinfo @@ -98,52 +84,40 @@ class Monitor(Wrapper): def get_episode_lengths(self): return self.episode_lengths -class JSONLogger(object): - def __init__(self, file): - self.file = file - - def writekvs(self, kvs): - for k,v in kvs.items(): - if hasattr(v, 'dtype'): - v = v.tolist() - kvs[k] = float(v) - self.file.write(json.dumps(kvs) + '\n') - self.file.flush() - - class LoadMonitorResultsError(Exception): pass def get_monitor_files(dir): - return glob(path.join(dir, "*" + Monitor.EXT)) + return glob(osp.join(dir, "*" + Monitor.EXT)) -def load_results(dir, raw_episodes=False): - fnames = get_monitor_files(dir) - if not fnames: +def load_results(dir): + import pandas + monitor_files = glob(osp.join(dir, "*monitor.*")) # get both csv and (old) json files + if not monitor_files: raise LoadMonitorResultsError("no monitor files of the form *%s found in %s" % (Monitor.EXT, dir)) - episodes = [] + dfs = [] headers = [] - for fname in fnames: + for fname in monitor_files: with open(fname, 'rt') as fh: - lines = fh.readlines() - header = json.loads(lines[0]) - headers.append(header) - for line in lines[1:]: - episode = json.loads(line) - episode['abstime'] = header['t_start'] + episode['t'] - del episode['t'] - episodes.append(episode) - header0 = headers[0] - for header in headers[1:]: - assert header['env_id'] == header0['env_id'], "mixing data from two envs" - episodes = sorted(episodes, key=lambda e: e['abstime']) - if raw_episodes: - return episodes - else: - return { - 'env_info': {'env_id': header0['env_id'], 'gym_version': header0['gym_version']}, - 'episode_end_times': [e['abstime'] for e in episodes], - 'episode_lengths': [e['l'] for e in episodes], - 'episode_rewards': [e['r'] for e in episodes], - 'initial_reset_time': min([min(header['t_start'] for header in headers)]) - } + if fname.endswith('csv'): + firstline = fh.readline() + assert firstline[0] == '#' + header = json.loads(firstline[1:]) + df = pandas.read_csv(fh, index_col=None) + headers.append(header) + elif fname.endswith('json'): # Deprecated json format + episodes = [] + lines = fh.readlines() + header = json.loads(lines[0]) + headers.append(header) + for line in lines[1:]: + episode = json.loads(line) + episodes.append(episode) + df = pandas.DataFrame(episodes) + df['t'] += header['t_start'] + dfs.append(df) + df = pandas.concat(dfs) + df.sort_values('t', inplace=True) + df['t'] -= min(header['t_start'] for header in headers) + df.headers = 
headers # HACK to preserve backwards compatibility + return df \ No newline at end of file diff --git a/baselines/common/atari_wrappers.py b/baselines/common/atari_wrappers.py index 182c66d..7738901 100644 --- a/baselines/common/atari_wrappers.py +++ b/baselines/common/atari_wrappers.py @@ -1,9 +1,8 @@ import numpy as np from collections import deque -from PIL import Image import gym from gym import spaces - +import cv2 class NoopResetEnv(gym.Wrapper): def __init__(self, env, noop_max=30): @@ -13,11 +12,16 @@ class NoopResetEnv(gym.Wrapper): gym.Wrapper.__init__(self, env) self.noop_max = noop_max self.override_num_noops = None - assert env.unwrapped.get_action_meanings()[0] == 'NOOP' + if isinstance(env.action_space, gym.spaces.MultiBinary): + self.noop_action = np.zeros(self.env.action_space.n, dtype=np.int64) + else: + # used for atari environments + self.noop_action = 0 + assert env.unwrapped.get_action_meanings()[0] == 'NOOP' - def _reset(self): + def _reset(self, **kwargs): """ Do no-op action for a number of steps in [1, noop_max].""" - self.env.reset() + self.env.reset(**kwargs) if self.override_num_noops is not None: noops = self.override_num_noops else: @@ -25,9 +29,9 @@ class NoopResetEnv(gym.Wrapper): assert noops > 0 obs = None for _ in range(noops): - obs, _, done, _ = self.env.step(0) + obs, _, done, _ = self.env.step(self.noop_action) if done: - obs = self.env.reset() + obs = self.env.reset(**kwargs) return obs class FireResetEnv(gym.Wrapper): @@ -37,14 +41,14 @@ class FireResetEnv(gym.Wrapper): assert env.unwrapped.get_action_meanings()[1] == 'FIRE' assert len(env.unwrapped.get_action_meanings()) >= 3 - def _reset(self): - self.env.reset() + def _reset(self, **kwargs): + self.env.reset(**kwargs) obs, _, done, _ = self.env.step(1) if done: - self.env.reset() + self.env.reset(**kwargs) obs, _, done, _ = self.env.step(2) if done: - self.env.reset() + self.env.reset(**kwargs) return obs class EpisodicLifeEnv(gym.Wrapper): @@ -70,13 +74,13 @@ class EpisodicLifeEnv(gym.Wrapper): self.lives = lives return obs, reward, done, info - def _reset(self): + def _reset(self, **kwargs): """Reset only when lives are exhausted. This way all states are still reachable even though lives are episodic, and the learner need not know about any of this behind-the-scenes. """ if self.was_real_done: - obs = self.env.reset() + obs = self.env.reset(**kwargs) else: # no-op step to advance from terminal/lost life state obs, _, _, _ = self.env.step(0) @@ -88,30 +92,26 @@ class MaxAndSkipEnv(gym.Wrapper): """Return only every `skip`-th frame""" gym.Wrapper.__init__(self, env) # most recent raw observations (for max pooling across time steps) - self._obs_buffer = deque(maxlen=2) + self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype='uint8') self._skip = skip def _step(self, action): """Repeat action, sum reward, and max over last observations.""" total_reward = 0.0 done = None - for _ in range(self._skip): + for i in range(self._skip): obs, reward, done, info = self.env.step(action) - self._obs_buffer.append(obs) + if i == self._skip - 2: self._obs_buffer[0] = obs + if i == self._skip - 1: self._obs_buffer[1] = obs total_reward += reward if done: break - max_frame = np.max(np.stack(self._obs_buffer), axis=0) + # Note that the observation on the done=True frame + # doesn't matter + max_frame = self._obs_buffer.max(axis=0) return max_frame, total_reward, done, info - def _reset(self): - """Clear past frame buffer and init. to first obs. 
from inner env.""" - self._obs_buffer.clear() - obs = self.env.reset() - self._obs_buffer.append(obs) - return obs - class ClipRewardEnv(gym.RewardWrapper): def _reward(self, reward): """Bin reward to {+1, 0, -1} by its sign.""" @@ -121,52 +121,89 @@ class WarpFrame(gym.ObservationWrapper): def __init__(self, env): """Warp frames to 84x84 as done in the Nature paper and later work.""" gym.ObservationWrapper.__init__(self, env) - self.res = 84 - self.observation_space = spaces.Box(low=0, high=255, shape=(self.res, self.res, 1)) + self.width = 84 + self.height = 84 + self.observation_space = spaces.Box(low=0, high=255, shape=(self.height, self.width, 1)) - def _observation(self, obs): - frame = np.dot(obs.astype('float32'), np.array([0.299, 0.587, 0.114], 'float32')) - frame = np.array(Image.fromarray(frame).resize((self.res, self.res), - resample=Image.BILINEAR), dtype=np.uint8) - return frame.reshape((self.res, self.res, 1)) + def _observation(self, frame): + frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) + frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA) + return frame[:, :, None] class FrameStack(gym.Wrapper): def __init__(self, env, k): - """Buffer observations and stack across channels (last axis).""" + """Stack k last frames. + + Returns lazy array, which is much more memory efficient. + + See Also + -------- + baselines.common.atari_wrappers.LazyFrames + """ gym.Wrapper.__init__(self, env) self.k = k self.frames = deque([], maxlen=k) shp = env.observation_space.shape - assert shp[2] == 1 # can only stack 1-channel frames - self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], k)) + self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k)) def _reset(self): - """Clear buffer and re-fill by duplicating the first observation.""" ob = self.env.reset() - for _ in range(self.k): self.frames.append(ob) - return self._observation() + for _ in range(self.k): + self.frames.append(ob) + return self._get_ob() def _step(self, action): ob, reward, done, info = self.env.step(action) self.frames.append(ob) - return self._observation(), reward, done, info + return self._get_ob(), reward, done, info - def _observation(self): + def _get_ob(self): assert len(self.frames) == self.k - return np.concatenate(self.frames, axis=2) + return LazyFrames(list(self.frames)) -def wrap_deepmind(env, episode_life=True, clip_rewards=True): - """Configure environment for DeepMind-style Atari. +class ScaledFloatFrame(gym.ObservationWrapper): + def _observation(self, observation): + # careful! This undoes the memory optimization, use + # with smaller replay buffers only. + return np.array(observation).astype(np.float32) / 255.0 - Note: this does not include frame stacking!""" - assert 'NoFrameskip' in env.spec.id # required for DeepMind-style skip - if episode_life: - env = EpisodicLifeEnv(env) +class LazyFrames(object): + def __init__(self, frames): + """This object ensures that common frames between the observations are only stored once. + It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay + buffers. + + This object should only be converted to numpy array before being passed to the model. 
+ + You'd not belive how complex the previous solution was.""" + self._frames = frames + + def __array__(self, dtype=None): + out = np.concatenate(self._frames, axis=2) + if dtype is not None: + out = out.astype(dtype) + return out + +def make_atari(env_id): + env = gym.make(env_id) + assert 'NoFrameskip' in env.spec.id env = NoopResetEnv(env, noop_max=30) env = MaxAndSkipEnv(env, skip=4) + return env + +def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False): + """Configure environment for DeepMind-style Atari. + """ + if episode_life: + env = EpisodicLifeEnv(env) if 'FIRE' in env.unwrapped.get_action_meanings(): env = FireResetEnv(env) env = WarpFrame(env) + if scale: + env = ScaledFloatFrame(env) if clip_rewards: env = ClipRewardEnv(env) + if frame_stack: + env = FrameStack(env, 4) return env + diff --git a/baselines/common/atari_wrappers_deprecated.py b/baselines/common/atari_wrappers_deprecated.py deleted file mode 100644 index fe7ca26..0000000 --- a/baselines/common/atari_wrappers_deprecated.py +++ /dev/null @@ -1,239 +0,0 @@ -import cv2 -import gym -import numpy as np - -from collections import deque -from gym import spaces - - -class NoopResetEnv(gym.Wrapper): - def __init__(self, env=None, noop_max=30): - """Sample initial states by taking random number of no-ops on reset. - No-op is assumed to be action 0. - """ - super(NoopResetEnv, self).__init__(env) - self.noop_max = noop_max - self.override_num_noops = None - assert env.unwrapped.get_action_meanings()[0] == 'NOOP' - - def _reset(self): - """ Do no-op action for a number of steps in [1, noop_max].""" - self.env.reset() - if self.override_num_noops is not None: - noops = self.override_num_noops - else: - noops = np.random.randint(1, self.noop_max + 1) - assert noops > 0 - obs = None - for _ in range(noops): - obs, _, done, _ = self.env.step(0) - if done: - obs = self.env.reset() - return obs - - -class FireResetEnv(gym.Wrapper): - def __init__(self, env=None): - """For environments where the user need to press FIRE for the game to start.""" - super(FireResetEnv, self).__init__(env) - assert env.unwrapped.get_action_meanings()[1] == 'FIRE' - assert len(env.unwrapped.get_action_meanings()) >= 3 - - def _reset(self): - self.env.reset() - obs, _, done, _ = self.env.step(1) - if done: - self.env.reset() - obs, _, done, _ = self.env.step(2) - if done: - self.env.reset() - return obs - - -class EpisodicLifeEnv(gym.Wrapper): - def __init__(self, env=None): - """Make end-of-life == end-of-episode, but only reset on true game over. - Done by DeepMind for the DQN and co. since it helps value estimation. - """ - super(EpisodicLifeEnv, self).__init__(env) - self.lives = 0 - self.was_real_done = True - self.was_real_reset = False - - def _step(self, action): - obs, reward, done, info = self.env.step(action) - self.was_real_done = done - # check current lives, make loss of life terminal, - # then update lives to handle bonus lives - lives = self.env.unwrapped.ale.lives() - if lives < self.lives and lives > 0: - # for Qbert somtimes we stay in lives == 0 condtion for a few frames - # so its important to keep lives > 0, so that we only reset once - # the environment advertises done. - done = True - self.lives = lives - return obs, reward, done, info - - def _reset(self): - """Reset only when lives are exhausted. - This way all states are still reachable even though lives are episodic, - and the learner need not know about any of this behind-the-scenes. 
- """ - if self.was_real_done: - obs = self.env.reset() - self.was_real_reset = True - else: - # no-op step to advance from terminal/lost life state - obs, _, _, _ = self.env.step(0) - self.was_real_reset = False - self.lives = self.env.unwrapped.ale.lives() - return obs - - -class MaxAndSkipEnv(gym.Wrapper): - def __init__(self, env=None, skip=4): - """Return only every `skip`-th frame""" - super(MaxAndSkipEnv, self).__init__(env) - # most recent raw observations (for max pooling across time steps) - self._obs_buffer = deque(maxlen=2) - self._skip = skip - - def _step(self, action): - total_reward = 0.0 - done = None - for _ in range(self._skip): - obs, reward, done, info = self.env.step(action) - self._obs_buffer.append(obs) - total_reward += reward - if done: - break - - max_frame = np.max(np.stack(self._obs_buffer), axis=0) - - return max_frame, total_reward, done, info - - def _reset(self): - """Clear past frame buffer and init. to first obs. from inner env.""" - self._obs_buffer.clear() - obs = self.env.reset() - self._obs_buffer.append(obs) - return obs - - -class ProcessFrame84(gym.ObservationWrapper): - def __init__(self, env=None): - super(ProcessFrame84, self).__init__(env) - self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1)) - - def _observation(self, obs): - return ProcessFrame84.process(obs) - - @staticmethod - def process(frame): - if frame.size == 210 * 160 * 3: - img = np.reshape(frame, [210, 160, 3]).astype(np.float32) - elif frame.size == 250 * 160 * 3: - img = np.reshape(frame, [250, 160, 3]).astype(np.float32) - else: - assert False, "Unknown resolution." - img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114 - resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_AREA) - x_t = resized_screen[18:102, :] - x_t = np.reshape(x_t, [84, 84, 1]) - return x_t.astype(np.uint8) - - -class ClippedRewardsWrapper(gym.RewardWrapper): - def _reward(self, reward): - """Change all the positive rewards to 1, negative to -1 and keep zero.""" - return np.sign(reward) - - -class LazyFrames(object): - def __init__(self, frames): - """This object ensures that common frames between the observations are only stored once. - It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay - buffers. - - This object should only be converted to numpy array before being passed to the model. - - You'd not belive how complex the previous solution was.""" - self._frames = frames - - def __array__(self, dtype=None): - out = np.concatenate(self._frames, axis=2) - if dtype is not None: - out = out.astype(dtype) - return out - - -class FrameStack(gym.Wrapper): - def __init__(self, env, k): - """Stack k last frames. - - Returns lazy array, which is much more memory efficient. - - See Also - -------- - baselines.common.atari_wrappers.LazyFrames - """ - gym.Wrapper.__init__(self, env) - self.k = k - self.frames = deque([], maxlen=k) - shp = env.observation_space.shape - self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k)) - - def _reset(self): - ob = self.env.reset() - for _ in range(self.k): - self.frames.append(ob) - return self._get_ob() - - def _step(self, action): - ob, reward, done, info = self.env.step(action) - self.frames.append(ob) - return self._get_ob(), reward, done, info - - def _get_ob(self): - assert len(self.frames) == self.k - return LazyFrames(list(self.frames)) - - -class ScaledFloatFrame(gym.ObservationWrapper): - def _observation(self, obs): - # careful! 
This undoes the memory optimization, use - # with smaller replay buffers only. - return np.array(obs).astype(np.float32) / 255.0 - - -def wrap_dqn(env): - """Apply a common set of wrappers for Atari games.""" - assert 'NoFrameskip' in env.spec.id - env = EpisodicLifeEnv(env) - env = NoopResetEnv(env, noop_max=30) - env = MaxAndSkipEnv(env, skip=4) - if 'FIRE' in env.unwrapped.get_action_meanings(): - env = FireResetEnv(env) - env = ProcessFrame84(env) - env = FrameStack(env, 4) - env = ClippedRewardsWrapper(env) - return env - - -class A2cProcessFrame(gym.Wrapper): - def __init__(self, env): - gym.Wrapper.__init__(self, env) - self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1)) - - def _step(self, action): - ob, reward, done, info = self.env.step(action) - return A2cProcessFrame.process(ob), reward, done, info - - def _reset(self): - return A2cProcessFrame.process(self.env.reset()) - - @staticmethod - def process(frame): - frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) - frame = cv2.resize(frame, (84, 84), interpolation=cv2.INTER_AREA) - return frame.reshape(84, 84, 1) diff --git a/baselines/common/azure_utils.py b/baselines/common/azure_utils.py index a26cb3f..76380f4 100644 --- a/baselines/common/azure_utils.py +++ b/baselines/common/azure_utils.py @@ -10,10 +10,7 @@ except ImportError: from shutil import unpack_archive from threading import Event -"""TODOS: - - use Azure snapshots instead of hacky backups -""" - +# TODOS: use Azure snapshots instead of hacky backups def fixed_list_blobs(service, *args, **kwargs): """By defualt list_containers only returns a subset of results. @@ -37,7 +34,7 @@ def make_archive(source_path, dest_path): prefix_path = os.path.dirname(source_path) with zipfile.ZipFile(dest_path, "w", compression=zipfile.ZIP_STORED) as zf: if os.path.isdir(source_path): - for dirname, subdirs, files in os.walk(source_path): + for dirname, _subdirs, files in os.walk(source_path): zf.write(dirname, os.path.relpath(dirname, prefix_path)) for filename in files: filepath = os.path.join(dirname, filename) diff --git a/baselines/common/distributions.py b/baselines/common/distributions.py index 315c2c5..1dc02c3 100644 --- a/baselines/common/distributions.py +++ b/baselines/common/distributions.py @@ -2,7 +2,6 @@ import tensorflow as tf import numpy as np import baselines.common.tf_util as U from tensorflow.python.ops import math_ops -from tensorflow.python.ops import nn class Pd(object): """ diff --git a/baselines/common/misc_util.py b/baselines/common/misc_util.py index 4e45ce7..a8d7977 100644 --- a/baselines/common/misc_util.py +++ b/baselines/common/misc_util.py @@ -4,7 +4,6 @@ import os import pickle import random import tempfile -import time import zipfile @@ -153,76 +152,6 @@ class RunningAvg(object): """Get the current estimate""" return self._value - -class SimpleMonitor(gym.Wrapper): - def __init__(self, env): - """Adds two qunatities to info returned by every step: - - num_steps: int - Number of steps takes so far - rewards: [float] - All the cumulative rewards for the episodes completed so far. 
- """ - super().__init__(env) - # current episode state - self._current_reward = None - self._num_steps = None - # temporary monitor state that we do not save - self._time_offset = None - self._total_steps = None - # monitor state - self._episode_rewards = [] - self._episode_lengths = [] - self._episode_end_times = [] - - def _reset(self): - obs = self.env.reset() - # recompute temporary state if needed - if self._time_offset is None: - self._time_offset = time.time() - if len(self._episode_end_times) > 0: - self._time_offset -= self._episode_end_times[-1] - if self._total_steps is None: - self._total_steps = sum(self._episode_lengths) - # update monitor state - if self._current_reward is not None: - self._episode_rewards.append(self._current_reward) - self._episode_lengths.append(self._num_steps) - self._episode_end_times.append(time.time() - self._time_offset) - # reset episode state - self._current_reward = 0 - self._num_steps = 0 - - return obs - - def _step(self, action): - obs, rew, done, info = self.env.step(action) - self._current_reward += rew - self._num_steps += 1 - self._total_steps += 1 - info['steps'] = self._total_steps - info['rewards'] = self._episode_rewards - return (obs, rew, done, info) - - def get_state(self): - return { - 'env_id': self.env.unwrapped.spec.id, - 'episode_data': { - 'episode_rewards': self._episode_rewards, - 'episode_lengths': self._episode_lengths, - 'episode_end_times': self._episode_end_times, - 'initial_reset_time': 0, - } - } - - def set_state(self, state): - assert state['env_id'] == self.env.unwrapped.spec.id - ed = state['episode_data'] - self._episode_rewards = ed['episode_rewards'] - self._episode_lengths = ed['episode_lengths'] - self._episode_end_times = ed['episode_end_times'] - - def boolean_flag(parser, name, default=False, help=None): """Add a boolean flag to argparse parser. diff --git a/baselines/common/tf_util.py b/baselines/common/tf_util.py index 029bf19..5e7eb62 100644 --- a/baselines/common/tf_util.py +++ b/baselines/common/tf_util.py @@ -6,51 +6,41 @@ import copy import os import collections - # ================================================================ # Make consistent with numpy # ================================================================ clip = tf.clip_by_value - def sum(x, axis=None, keepdims=False): axis = None if axis is None else [axis] return tf.reduce_sum(x, axis=axis, keep_dims=keepdims) - def mean(x, axis=None, keepdims=False): axis = None if axis is None else [axis] return tf.reduce_mean(x, axis=axis, keep_dims=keepdims) - def var(x, axis=None, keepdims=False): meanx = mean(x, axis=axis, keepdims=keepdims) return mean(tf.square(x - meanx), axis=axis, keepdims=keepdims) - def std(x, axis=None, keepdims=False): return tf.sqrt(var(x, axis=axis, keepdims=keepdims)) - def max(x, axis=None, keepdims=False): axis = None if axis is None else [axis] return tf.reduce_max(x, axis=axis, keep_dims=keepdims) - def min(x, axis=None, keepdims=False): axis = None if axis is None else [axis] return tf.reduce_min(x, axis=axis, keep_dims=keepdims) - def concatenate(arrs, axis=0): return tf.concat(axis=axis, values=arrs) - def argmax(x, axis=None): return tf.argmax(x, axis=axis) - def switch(condition, then_expression, else_expression): """Switches between two operations depending on a scalar value (int or bool). 
Note that both `then_expression` and `else_expression` @@ -72,35 +62,29 @@ def switch(condition, then_expression, else_expression): # Extras # ================================================================ - def l2loss(params): if len(params) == 0: return tf.constant(0.0) else: return tf.add_n([sum(tf.square(p)) for p in params]) - def lrelu(x, leak=0.2): f1 = 0.5 * (1 + leak) f2 = 0.5 * (1 - leak) return f1 * x + f2 * abs(x) - def categorical_sample_logits(X): # https://github.com/tensorflow/tensorflow/issues/456 U = tf.random_uniform(tf.shape(X)) return argmax(X - tf.log(-tf.log(U)), axis=1) - # ================================================================ # Inputs # ================================================================ - def is_placeholder(x): return type(x) is tf.Tensor and len(x.op.inputs) == 0 - class TfInput(object): def __init__(self, name="(unnamed)"): """Generalized Tensorflow placeholder. The main differences are: @@ -119,7 +103,6 @@ class TfInput(object): """Given data input it to the placeholder(s).""" raise NotImplemented() - class PlacholderTfInput(TfInput): def __init__(self, placeholder): """Wrapper for regular tensorflow placeholder.""" @@ -132,7 +115,6 @@ class PlacholderTfInput(TfInput): def make_feed_dict(self, data): return {self._placeholder: data} - class BatchInput(PlacholderTfInput): def __init__(self, shape, dtype=tf.float32, name=None): """Creates a placeholder for a batch of tensors of a given shape and dtype @@ -148,7 +130,6 @@ class BatchInput(PlacholderTfInput): """ super().__init__(tf.placeholder(dtype, [None] + list(shape), name=name)) - class Uint8Input(PlacholderTfInput): def __init__(self, shape, name=None): """Takes input in uint8 format which is cast to float32 and divided by 255 @@ -171,7 +152,6 @@ class Uint8Input(PlacholderTfInput): def get(self): return self._output - def ensure_tf_input(thing): """Takes either tf.placeholder of TfInput and outputs equivalent TfInput""" if isinstance(thing, TfInput): @@ -185,7 +165,6 @@ def ensure_tf_input(thing): # Mathematical utils # ================================================================ - def huber_loss(x, delta=1.0): """Reference: https://en.wikipedia.org/wiki/Huber_loss""" return tf.where( @@ -198,7 +177,6 @@ def huber_loss(x, delta=1.0): # Optimizer utils # ================================================================ - def minimize_and_clip(optimizer, objective, var_list, clip_val=10): """Minimized `objective` using `optimizer` w.r.t. 
variables in `var_list` while ensure the norm of the gradients for each @@ -210,7 +188,6 @@ def minimize_and_clip(optimizer, objective, var_list, clip_val=10): gradients[i] = (tf.clip_by_norm(grad, clip_val), var) return optimizer.apply_gradients(gradients) - # ================================================================ # Global session # ================================================================ @@ -219,7 +196,6 @@ def get_session(): """Returns recently made Tensorflow session""" return tf.get_default_session() - def make_session(num_cpu): """Returns a session that will use CPU's only""" tf_config = tf.ConfigProto( @@ -227,31 +203,25 @@ def make_session(num_cpu): intra_op_parallelism_threads=num_cpu) return tf.Session(config=tf_config) - def single_threaded_session(): """Returns a session which will only use a single CPU""" return make_session(1) - ALREADY_INITIALIZED = set() - def initialize(): """Initialize all the uninitialized variables in the global scope.""" new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED get_session().run(tf.variables_initializer(new_variables)) ALREADY_INITIALIZED.update(new_variables) - def eval(expr, feed_dict=None): if feed_dict is None: feed_dict = {} return get_session().run(expr, feed_dict=feed_dict) - VALUE_SETTERS = collections.OrderedDict() - def set_value(v, val): global VALUE_SETTERS if v in VALUE_SETTERS: @@ -262,17 +232,14 @@ def set_value(v, val): VALUE_SETTERS[v] = (set_op, set_endpoint) get_session().run(set_op, feed_dict={set_endpoint: val}) - # ================================================================ # Saving variables # ================================================================ - def load_state(fname): saver = tf.train.Saver() saver.restore(get_session(), fname) - def save_state(fname): os.makedirs(os.path.dirname(fname), exist_ok=True) saver = tf.train.Saver() @@ -282,7 +249,6 @@ def save_state(fname): # Model components # ================================================================ - def normc_initializer(std=1.0): def _initializer(shape, dtype=None, partition_info=None): # pylint: disable=W0613 out = np.random.randn(*shape).astype(np.float32) @@ -290,7 +256,6 @@ def normc_initializer(std=1.0): return tf.constant(out) return _initializer - def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME", dtype=tf.float32, collections=None, summary_tag=None): with tf.variable_scope(name): @@ -320,7 +285,6 @@ def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME", return tf.nn.conv2d(x, w, stride_shape, pad) + b - def dense(x, size, name, weight_init=None, bias=True): w = tf.get_variable(name + "/w", [x.get_shape()[1], size], initializer=weight_init) ret = tf.matmul(x, w) @@ -330,7 +294,6 @@ def dense(x, size, name, weight_init=None, bias=True): else: return ret - def wndense(x, size, name, init_scale=1.0): v = tf.get_variable(name + "/V", [int(x.get_shape()[1]), size], initializer=tf.random_normal_initializer(0, 0.05)) @@ -342,11 +305,9 @@ def wndense(x, size, name, init_scale=1.0): scaler = g / tf.sqrt(sum(tf.square(v), axis=0, keepdims=True)) return tf.reshape(scaler, [1, size]) * x + tf.reshape(b, [1, size]) - def densenobias(x, size, name, weight_init=None): return dense(x, size, name, weight_init=weight_init, bias=False) - def dropout(x, pkeep, phase=None, mask=None): mask = tf.floor(pkeep + tf.random_uniform(tf.shape(x))) if mask is None else mask if phase is None: @@ -354,13 +315,10 @@ def dropout(x, pkeep, phase=None, mask=None): else: return 
switch(phase, mask * x, pkeep * x) - # ================================================================ # Theano-like Function # ================================================================ - - def function(inputs, outputs, updates=None, givens=None): """Just like Theano function. Take a bunch of tensorflow placeholders and expressions computed based on those placeholders and produces f(inputs) -> outputs. Function f takes @@ -401,7 +359,6 @@ def function(inputs, outputs, updates=None, givens=None): f = _Function(inputs, [outputs], updates, givens=givens) return lambda *args, **kwargs: f(*args, **kwargs)[0] - class _Function(object): def __init__(self, inputs, outputs, updates, givens, check_nan=False): for inpt in inputs: @@ -448,7 +405,6 @@ class _Function(object): raise RuntimeError("Nan detected") return results - def mem_friendly_function(nondata_inputs, data_inputs, outputs, batch_size): if isinstance(outputs, list): return _MemFriendlyFunction(nondata_inputs, data_inputs, outputs, batch_size) @@ -456,7 +412,6 @@ def mem_friendly_function(nondata_inputs, data_inputs, outputs, batch_size): f = _MemFriendlyFunction(nondata_inputs, data_inputs, [outputs], batch_size) return lambda *inputs: f(*inputs)[0] - class _MemFriendlyFunction(object): def __init__(self, nondata_inputs, data_inputs, outputs, batch_size): self.nondata_inputs = nondata_inputs @@ -490,7 +445,6 @@ class _MemFriendlyFunction(object): # Modules # ================================================================ - class Module(object): def __init__(self, name): self.name = name @@ -528,7 +482,6 @@ class Module(object): assert self.scope is not None, "need to call module once before getting variables" return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) - def module(name): @functools.wraps def wrapper(f): @@ -542,14 +495,11 @@ def module(name): # Graph traversal # ================================================================ - VARIABLES = {} - def get_parents(node): return node.op.inputs - def topsorted(outputs): """ Topological sort via non-recursive depth-first search @@ -586,7 +536,6 @@ def topsorted(outputs): stack.append((j, 0)) return out - # ================================================================ # Flat vectors # ================================================================ @@ -597,15 +546,12 @@ def var_shape(x): "shape function assumes that shape is fully known" return out - def numel(x): return intprod(var_shape(x)) - def intprod(x): return int(np.prod(x)) - def flatgrad(loss, var_list, clip_norm=None): grads = tf.gradients(loss, var_list) if clip_norm is not None: @@ -615,7 +561,6 @@ def flatgrad(loss, var_list, clip_norm=None): for (v, grad) in zip(var_list, grads) ]) - class SetFromFlat(object): def __init__(self, var_list, dtype=tf.float32): assigns = [] @@ -634,7 +579,6 @@ class SetFromFlat(object): def __call__(self, theta): get_session().run(self.op, feed_dict={self.theta: theta}) - class GetFlat(object): def __init__(self, var_list): self.op = tf.concat(axis=0, values=[tf.reshape(v, [numel(v)]) for v in var_list]) @@ -646,7 +590,6 @@ class GetFlat(object): # Misc # ================================================================ - def fancy_slice_2d(X, inds0, inds1): """ like numpy X[inds0, inds1] @@ -659,12 +602,10 @@ def fancy_slice_2d(X, inds0, inds1): Xflat = tf.reshape(X, [-1]) return tf.gather(Xflat, inds0 * ncols + inds1) - # ================================================================ # Scopes # ================================================================ - 
def scope_vars(scope, trainable_only=False): """ Get variables inside a scope @@ -687,17 +628,14 @@ def scope_vars(scope, trainable_only=False): scope=scope if isinstance(scope, str) else scope.name ) - def scope_name(): """Returns the name of current scope as a string, e.g. deepq/q_func""" return tf.get_variable_scope().name - def absolute_scope_name(relative_scope_name): """Appends parent scope name to `relative_scope_name`""" return scope_name() + "/" + relative_scope_name - def lengths_to_mask(lengths_b, max_length): """ Turns a vector of lengths into a boolean mask @@ -715,7 +653,6 @@ def lengths_to_mask(lengths_b, max_length): mask_bt = tf.expand_dims(tf.range(max_length), 0) < tf.expand_dims(lengths_b, 1) return mask_bt - def in_session(f): @functools.wraps(f) def newfunc(*args, **kwargs): @@ -723,10 +660,8 @@ def in_session(f): f(*args, **kwargs) return newfunc - _PLACEHOLDER_CACHE = {} # name -> (placeholder, dtype, shape) - def get_placeholder(name, dtype, shape): if name in _PLACEHOLDER_CACHE: out, dtype1, shape1 = _PLACEHOLDER_CACHE[name] @@ -737,15 +672,12 @@ def get_placeholder(name, dtype, shape): _PLACEHOLDER_CACHE[name] = (out, dtype, shape) return out - def get_placeholder_cached(name): return _PLACEHOLDER_CACHE[name][0] - def flattenallbut0(x): return tf.reshape(x, [-1, intprod(x.get_shape().as_list()[1:])]) - def reset(): global _PLACEHOLDER_CACHE global VARIABLES diff --git a/baselines/common/vec_env/subproc_vec_env.py b/baselines/common/vec_env/subproc_vec_env.py index fa40a10..de97d8d 100644 --- a/baselines/common/vec_env/subproc_vec_env.py +++ b/baselines/common/vec_env/subproc_vec_env.py @@ -2,7 +2,9 @@ import numpy as np from multiprocessing import Process, Pipe from baselines.common.vec_env import VecEnv -def worker(remote, env_fn_wrapper): + +def worker(remote, parent_remote, env_fn_wrapper): + parent_remote.close() env = env_fn_wrapper.x() while True: cmd, data = remote.recv() @@ -14,6 +16,9 @@ def worker(remote, env_fn_wrapper): elif cmd == 'reset': ob = env.reset() remote.send(ob) + elif cmd == 'reset_task': + ob = env.reset_task() + remote.send(ob) elif cmd == 'close': remote.close() break @@ -22,6 +27,7 @@ def worker(remote, env_fn_wrapper): else: raise NotImplementedError + class CloudpickleWrapper(object): """ Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle) @@ -35,17 +41,22 @@ class CloudpickleWrapper(object): import pickle self.x = pickle.loads(ob) + class SubprocVecEnv(VecEnv): def __init__(self, env_fns): """ envs: list of gym environments to run in subprocesses """ + self.closed = False nenvs = len(env_fns) - self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) - self.ps = [Process(target=worker, args=(work_remote, CloudpickleWrapper(env_fn))) - for (work_remote, env_fn) in zip(self.work_remotes, env_fns)] + self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) + self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) + for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] for p in self.ps: + p.daemon = True # if the main process crashes, we should not cause things to hang p.start() + for remote in self.work_remotes: + remote.close() self.remotes[0].send(('get_spaces', None)) self.action_space, self.observation_space = self.remotes[0].recv() @@ -63,11 +74,20 @@ class SubprocVecEnv(VecEnv): remote.send(('reset', None)) return np.stack([remote.recv() for remote in self.remotes]) + def reset_task(self): + for remote in 
self.remotes: + remote.send(('reset_task', None)) + return np.stack([remote.recv() for remote in self.remotes]) + def close(self): + if self.closed: + return + for remote in self.remotes: remote.send(('close', None)) for p in self.ps: p.join() + self.closed = True @property def num_envs(self): diff --git a/baselines/ddpg/main.py b/baselines/ddpg/main.py index 81f294e..747553f 100644 --- a/baselines/ddpg/main.py +++ b/baselines/ddpg/main.py @@ -19,11 +19,12 @@ from mpi4py import MPI def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs): # Configure things. rank = MPI.COMM_WORLD.Get_rank() - if rank != 0: logger.set_level(logger.DISABLED) + if rank != 0: + logger.set_level(logger.DISABLED) # Create envs. env = gym.make(env_id) - env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), "%i.monitor.json"%rank)) + env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) gym.logger.setLevel(logging.WARN) if evaluation and rank==0: @@ -81,7 +82,7 @@ def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs): def parse_args(): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - + parser.add_argument('--env-id', type=str, default='HalfCheetah-v1') boolean_flag(parser, 'render-eval', default=False) boolean_flag(parser, 'layer-norm', default=True) @@ -103,11 +104,21 @@ def parse_args(): parser.add_argument('--nb-eval-steps', type=int, default=100) # per epoch cycle and MPI worker parser.add_argument('--nb-rollout-steps', type=int, default=100) # per epoch cycle and MPI worker parser.add_argument('--noise-type', type=str, default='adaptive-param_0.2') # choices are adaptive-param_xx, ou_xx, normal_xx, none + parser.add_argument('--num-timesteps', type=int, default=None) boolean_flag(parser, 'evaluation', default=False) - return vars(parser.parse_args()) + args = parser.parse_args() + # we don't directly specify timesteps for this script, so make sure that if we do specify them + # they agree with the other parameters + if args.num_timesteps is not None: + assert(args.num_timesteps == args.nb_epochs * args.nb_epoch_cycles * args.nb_rollout_steps) + dict_args = vars(args) + del dict_args['num_timesteps'] + return dict_args if __name__ == '__main__': args = parse_args() + if MPI.COMM_WORLD.Get_rank() == 0: + logger.configure() # Run actual script. 
run(**args) diff --git a/baselines/ddpg/util.py b/baselines/ddpg/util.py index aed39c3..b378008 100644 --- a/baselines/ddpg/util.py +++ b/baselines/ddpg/util.py @@ -1,6 +1,3 @@ -import time - -import gym import numpy as np import tensorflow as tf from mpi4py import MPI diff --git a/baselines/deepq/__init__.py b/baselines/deepq/__init__.py index cc04c49..4472399 100644 --- a/baselines/deepq/__init__.py +++ b/baselines/deepq/__init__.py @@ -1,5 +1,8 @@ from baselines.deepq import models # noqa from baselines.deepq.build_graph import build_act, build_train # noqa - from baselines.deepq.simple import learn, load # noqa from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer # noqa + +def wrap_atari_dqn(env): + from baselines.common.atari_wrappers import wrap_deepmind + return wrap_deepmind(env, frame_stack=True, scale=True) \ No newline at end of file diff --git a/baselines/deepq/experiments/atari/enjoy.py b/baselines/deepq/experiments/atari/enjoy.py index fe482ca..db2b70f 100644 --- a/baselines/deepq/experiments/atari/enjoy.py +++ b/baselines/deepq/experiments/atari/enjoy.py @@ -10,8 +10,8 @@ import baselines.common.tf_util as U from baselines import deepq from baselines.common.misc_util import ( boolean_flag, - SimpleMonitor, ) +from baselines import bench from baselines.common.atari_wrappers_deprecated import wrap_dqn from baselines.deepq.experiments.atari.model import model, dueling_model @@ -30,7 +30,7 @@ def parse_args(): def make_env(game_name): env = gym.make(game_name + "NoFrameskip-v4") - env = SimpleMonitor(env) + env = bench.Monitor(env, None) env = wrap_dqn(env) return env diff --git a/baselines/deepq/experiments/atari/train.py b/baselines/deepq/experiments/atari/train.py index 223a6dc..30188c8 100644 --- a/baselines/deepq/experiments/atari/train.py +++ b/baselines/deepq/experiments/atari/train.py @@ -19,11 +19,9 @@ from baselines.common.misc_util import ( relatively_safe_pickle_dump, set_global_seeds, RunningAvg, - SimpleMonitor ) from baselines.common.schedules import LinearSchedule, PiecewiseSchedule -# when updating this to non-deperecated ones, it is important to -# copy over LazyFrames +from baselines import bench from baselines.common.atari_wrappers_deprecated import wrap_dqn from baselines.common.azure_utils import Container from .model import model, dueling_model @@ -64,7 +62,7 @@ def parse_args(): def make_env(game_name): env = gym.make(game_name + "NoFrameskip-v4") - monitored_env = SimpleMonitor(env) # puts rewards and number of steps in info, before environment is wrapped + monitored_env = bench.Monitor(env, logger.get_dir()) # puts rewards and number of steps in info, before environment is wrapped env = wrap_dqn(monitored_env) # applies a bunch of modification to simplify the observation space (downsample, make b/w) return env, monitored_env diff --git a/baselines/deepq/experiments/atari/wang2015_eval.py b/baselines/deepq/experiments/atari/wang2015_eval.py index b1fdfe4..42b8ba8 100644 --- a/baselines/deepq/experiments/atari/wang2015_eval.py +++ b/baselines/deepq/experiments/atari/wang2015_eval.py @@ -5,15 +5,15 @@ import os import baselines.common.tf_util as U -from baselines import deepq -from baselines.common.misc_util import get_wrapper_by_name, SimpleMonitor, boolean_flag, set_global_seeds +from baselines import deepq, bench +from baselines.common.misc_util import get_wrapper_by_name, boolean_flag, set_global_seeds from baselines.common.atari_wrappers_deprecated import wrap_dqn from baselines.deepq.experiments.atari.model import model, 
dueling_model def make_env(game_name): env = gym.make(game_name + "NoFrameskip-v4") - env_monitored = SimpleMonitor(env) + env_monitored = bench.Monitor(env, None) env = wrap_dqn(env_monitored) return env_monitored, env @@ -47,14 +47,14 @@ def wang2015_eval(game_name, act, stochastic): eval_episode_steps += 1 action = act(np.array(obs)[None], stochastic=stochastic)[0] - obs, reward, done, info = eval_env.step(action) + obs, _reward, done, info = eval_env.step(action) if done: obs = eval_env.reset() if len(info["rewards"]) > 0: episode_rewards.append(info["rewards"][0]) break if info["steps"] > 108000: # 5 minutes of gameplay - episode_rewards.append(env_monitored._current_reward) + episode_rewards.append(sum(env_monitored.rewards)) break print("Num steps in episode {} was {} yielding {} reward".format( num_noops, eval_episode_steps, episode_rewards[-1]), flush=True) @@ -66,7 +66,7 @@ def wang2015_eval(game_name, act, stochastic): def main(): set_global_seeds(1) args = parse_args() - with U.make_session(4) as sess: # noqa + with U.make_session(4): # noqa _, env = make_env(args.env) act = deepq.build_act( make_obs_ph=lambda name: U.Uint8Input(env.observation_space.shape, name=name), diff --git a/baselines/deepq/experiments/enjoy_pong.py b/baselines/deepq/experiments/enjoy_pong.py index d70d6e6..5b16fec 100644 --- a/baselines/deepq/experiments/enjoy_pong.py +++ b/baselines/deepq/experiments/enjoy_pong.py @@ -1,12 +1,10 @@ import gym - from baselines import deepq -from baselines.common.atari_wrappers_deprecated import wrap_dqn, ScaledFloatFrame def main(): env = gym.make("PongNoFrameskip-v4") - env = ScaledFloatFrame(wrap_dqn(env)) + env = deepq.wrap_atari_dqn(env) act = deepq.load("pong_model.pkl") while True: diff --git a/baselines/deepq/experiments/run_atari.py b/baselines/deepq/experiments/run_atari.py new file mode 100644 index 0000000..21e3b60 --- /dev/null +++ b/baselines/deepq/experiments/run_atari.py @@ -0,0 +1,47 @@ +import gym + +from baselines import deepq +from baselines.common import set_global_seeds +from baselines import bench +import argparse +from baselines import logger +from baselines.common.atari_wrappers import make_atari + +def main(): + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4') + parser.add_argument('--seed', help='RNG seed', type=int, default=0) + parser.add_argument('--prioritized', type=int, default=1) + parser.add_argument('--dueling', type=int, default=1) + parser.add_argument('--num-timesteps', type=int, default=int(10e6)) + args = parser.parse_args() + logger.configure() + set_global_seeds(args.seed) + env = make_atari(args.env) + env = bench.Monitor(env, logger.get_dir()) + env = deepq.wrap_atari_dqn(env) + model = deepq.models.cnn_to_mlp( + convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], + hiddens=[256], + dueling=bool(args.dueling), + ) + act = deepq.learn( + env, + q_func=model, + lr=1e-4, + max_timesteps=args.num_timesteps, + buffer_size=10000, + exploration_fraction=0.1, + exploration_final_eps=0.01, + train_freq=4, + learning_starts=10000, + target_network_update_freq=1000, + gamma=0.99, + prioritized_replay=bool(args.prioritized) + ) + # act.save("pong_model.pkl") XXX + env.close() + + +if __name__ == '__main__': + main() diff --git a/baselines/deepq/experiments/train_pong.py b/baselines/deepq/experiments/train_pong.py deleted file mode 100644 index efa78c3..0000000 --- a/baselines/deepq/experiments/train_pong.py +++ /dev/null 
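Before the deleted train_pong.py below: as a standalone sketch (not part of the diff), this is the environment construction that the new run_atari.py above performs, and it is the intended replacement for the old ScaledFloatFrame(wrap_dqn(env)) pattern:

from baselines import deepq, bench, logger
from baselines.common.atari_wrappers import make_atari

logger.configure()                           # gives bench.Monitor a directory to write to
env = make_atari('BreakoutNoFrameskip-v4')   # NoFrameskip env plus the shared Atari wrappers
env = bench.Monitor(env, logger.get_dir())
env = deepq.wrap_atari_dqn(env)              # wrap_deepmind(env, frame_stack=True, scale=True)
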
@@ -1,34 +0,0 @@ -import gym - -from baselines import deepq -from baselines.common.atari_wrappers_deprecated import wrap_dqn, ScaledFloatFrame - - -def main(): - env = gym.make("PongNoFrameskip-v4") - env = ScaledFloatFrame(wrap_dqn(env)) - model = deepq.models.cnn_to_mlp( - convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], - hiddens=[256], - dueling=True - ) - act = deepq.learn( - env, - q_func=model, - lr=1e-4, - max_timesteps=2000000, - buffer_size=10000, - exploration_fraction=0.1, - exploration_final_eps=0.01, - train_freq=4, - learning_starts=10000, - target_network_update_freq=1000, - gamma=0.99, - prioritized_replay=True - ) - act.save("pong_model.pkl") - env.close() - - -if __name__ == '__main__': - main() diff --git a/baselines/deepq/simple.py b/baselines/deepq/simple.py index 052d95b..091e81f 100644 --- a/baselines/deepq/simple.py +++ b/baselines/deepq/simple.py @@ -1,12 +1,13 @@ -import numpy as np import os -import dill import tempfile + import tensorflow as tf import zipfile +import cloudpickle +import numpy as np +import gym import baselines.common.tf_util as U - from baselines import logger from baselines.common.schedules import LinearSchedule from baselines import deepq @@ -19,11 +20,11 @@ class ActWrapper(object): self._act_params = act_params @staticmethod - def load(path, num_cpu=16): + def load(path): with open(path, "rb") as f: - model_data, act_params = dill.load(f) + model_data, act_params = cloudpickle.load(f) act = deepq.build_act(**act_params) - sess = U.make_session(num_cpu=num_cpu) + sess = tf.Session() sess.__enter__() with tempfile.TemporaryDirectory() as td: arc_path = os.path.join(td, "packed.zip") @@ -38,8 +39,11 @@ class ActWrapper(object): def __call__(self, *args, **kwargs): return self._act(*args, **kwargs) - def save(self, path): + def save(self, path=None): """Save model to a pickle located at `path`""" + if path is None: + path = os.path.join(logger.get_dir(), "model.pkl") + with tempfile.TemporaryDirectory() as td: U.save_state(os.path.join(td, "model")) arc_name = os.path.join(td, "packed.zip") @@ -52,18 +56,16 @@ class ActWrapper(object): with open(arc_name, "rb") as f: model_data = f.read() with open(path, "wb") as f: - dill.dump((model_data, self._act_params), f) + cloudpickle.dump((model_data, self._act_params), f) -def load(path, num_cpu=16): +def load(path): """Load act function that was returned by learn function. Parameters ---------- path: str path to the act function pickle - num_cpu: int - number of cpus to use for executing the policy Returns ------- @@ -71,7 +73,7 @@ def load(path, num_cpu=16): function that takes a batch of observations and returns actions. """ - return ActWrapper.load(path, num_cpu=num_cpu) + return ActWrapper.load(path) def learn(env, @@ -83,7 +85,7 @@ def learn(env, exploration_final_eps=0.02, train_freq=1, batch_size=32, - print_freq=1, + print_freq=100, checkpoint_freq=10000, learning_starts=1000, gamma=1.0, @@ -93,7 +95,6 @@ def learn(env, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, - num_cpu=16, param_noise=False, callback=None): """Train a deepq model. @@ -151,8 +152,6 @@ def learn(env, to 1.0. If set to None equals to max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. - num_cpu: int - number of cpus to use for training callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. 
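As a sketch of the save/load round trip after the cloudpickle changes above (not part of the diff; env and model stand for objects built as in run_atari.py, and the path is illustrative):

from baselines import deepq

act = deepq.learn(env, q_func=model, max_timesteps=int(1e5))
act.save()                                    # with no path, writes <logger dir>/model.pkl, so configure the logger first
act.save("/tmp/breakout_model.pkl")           # or pick an explicit pickle path
act = deepq.load("/tmp/breakout_model.pkl")   # load() no longer takes num_cpu; it opens its own tf.Session
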
@@ -165,11 +164,14 @@ def learn(env, """ # Create all the functions necessary to train the model - sess = U.make_session(num_cpu=num_cpu) + sess = tf.Session() sess.__enter__() + # capture the shape outside the closure so that the env object is not serialized + # by cloudpickle when serializing make_obs_ph + observation_space_shape = env.observation_space.shape def make_obs_ph(name): - return U.BatchInput(env.observation_space.shape, name=name) + return U.BatchInput(observation_space_shape, name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, @@ -180,12 +182,15 @@ def learn(env, grad_norm_clipping=10, param_noise=param_noise ) + act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } + act = ActWrapper(act, act_params) + # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) @@ -233,8 +238,13 @@ def learn(env, kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] + if isinstance(env.action_space, gym.spaces.MultiBinary): + env_action = np.zeros(env.action_space.n) + env_action[action] = 1 + else: + env_action = action reset = False - new_obs, rew, done, _ = env.step(action) + new_obs, rew, done, _ = env.step(env_action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs @@ -285,4 +295,4 @@ def learn(env, logger.log("Restored model with mean reward: {}".format(saved_mean_reward)) U.load_state(model_file) - return ActWrapper(act, act_params) + return act diff --git a/baselines/logger.py b/baselines/logger.py index 981c4b2..55d540d 100644 --- a/baselines/logger.py +++ b/baselines/logger.py @@ -6,8 +6,10 @@ import json import time import datetime import tempfile +from mpi4py import MPI -LOG_OUTPUT_FORMATS = ['stdout', 'log', 'json'] +LOG_OUTPUT_FORMATS = ['stdout', 'log', 'csv'] +# Also valid: json, tensorboard DEBUG = 10 INFO = 20 @@ -16,26 +18,23 @@ ERROR = 40 DISABLED = 50 -class OutputFormat(object): +class KVWriter(object): def writekvs(self, kvs): - """ - Write key-value pairs - """ raise NotImplementedError - def writeseq(self, args): - """ - Write a sequence of other data (e.g. 
a logging message) - """ - pass +class SeqWriter(object): + def writeseq(self, seq): + raise NotImplementedError - def close(self): - return - - -class HumanOutputFormat(OutputFormat): - def __init__(self, file): - self.file = file +class HumanOutputFormat(KVWriter, SeqWriter): + def __init__(self, filename_or_file): + if isinstance(filename_or_file, str): + self.file = open(filename_or_file, 'at') + self.own_file = True + else: + assert hasattr(filename_or_file, 'read'), 'expected file or str, got %s'%filename_or_file + self.file = filename_or_file + self.own_file = False def writekvs(self, kvs): # Create strings for printing @@ -48,8 +47,12 @@ class HumanOutputFormat(OutputFormat): key2str[self._truncate(key)] = self._truncate(valstr) # Find max widths - keywidth = max(map(len, key2str.keys())) - valwidth = max(map(len, key2str.values())) + if len(key2str) == 0: + print('WARNING: tried to write empty key-value dict') + return + else: + keywidth = max(map(len, key2str.keys())) + valwidth = max(map(len, key2str.values())) # Write out the data dashes = '-' * (keywidth + valwidth + 7) @@ -70,15 +73,19 @@ class HumanOutputFormat(OutputFormat): def _truncate(self, s): return s[:20] + '...' if len(s) > 23 else s - def writeseq(self, args): - for arg in args: + def writeseq(self, seq): + for arg in seq: self.file.write(arg) self.file.write('\n') self.file.flush() -class JSONOutputFormat(OutputFormat): - def __init__(self, file): - self.file = file + def close(self): + if self.own_file: + self.file.close() + +class JSONOutputFormat(KVWriter): + def __init__(self, filename): + self.file = open(filename, 'at') def writekvs(self, kvs): for k, v in sorted(kvs.items()): @@ -88,7 +95,46 @@ class JSONOutputFormat(OutputFormat): self.file.write(json.dumps(kvs) + '\n') self.file.flush() -class TensorBoardOutputFormat(OutputFormat): + def close(self): + self.file.close() + +class CSVOutputFormat(KVWriter): + def __init__(self, filename): + self.file = open(filename, 'a+t') + self.keys = [] + self.sep = ',' + + def writekvs(self, kvs): + # Add our current row to the history + extra_keys = kvs.keys() - self.keys + if extra_keys: + self.keys.extend(extra_keys) + self.file.seek(0) + lines = self.file.readlines() + self.file.seek(0) + for (i, k) in enumerate(self.keys): + if i > 0: + self.file.write(',') + self.file.write(k) + self.file.write('\n') + for line in lines[1:]: + self.file.write(line[:-1]) + self.file.write(self.sep * len(extra_keys)) + self.file.write('\n') + for (i, k) in enumerate(self.keys): + if i > 0: + self.file.write(',') + v = kvs.get(k) + if v: + self.file.write(str(v)) + self.file.write('\n') + self.file.flush() + + def close(self): + self.file.close() + + +class TensorBoardOutputFormat(KVWriter): """ Dumps key/value pairs into TensorBoard's numeric format. 
""" @@ -99,7 +145,7 @@ class TensorBoardOutputFormat(OutputFormat): prefix = 'events' path = osp.join(osp.abspath(dir), prefix) import tensorflow as tf - from tensorflow.python import pywrap_tensorflow + from tensorflow.python import pywrap_tensorflow from tensorflow.core.util import event_pb2 from tensorflow.python.util import compat self.tf = tf @@ -123,18 +169,22 @@ class TensorBoardOutputFormat(OutputFormat): self.writer.Close() self.writer = None - def make_output_format(format, ev_dir): os.makedirs(ev_dir, exist_ok=True) + rank = MPI.COMM_WORLD.Get_rank() if format == 'stdout': return HumanOutputFormat(sys.stdout) elif format == 'log': - log_file = open(osp.join(ev_dir, 'log.txt'), 'wt') - return HumanOutputFormat(log_file) + suffix = "" if rank==0 else ("-mpi%03i"%rank) + return HumanOutputFormat(osp.join(ev_dir, 'log%s.txt' % suffix)) elif format == 'json': - json_file = open(osp.join(ev_dir, 'progress.json'), 'wt') - return JSONOutputFormat(json_file) + assert rank==0 + return JSONOutputFormat(osp.join(ev_dir, 'progress.json')) + elif format == 'csv': + assert rank==0 + return CSVOutputFormat(osp.join(ev_dir, 'progress.csv')) elif format == 'tensorboard': + assert rank==0 return TensorBoardOutputFormat(osp.join(ev_dir, 'tb')) else: raise ValueError('Unknown format specified: %s' % (format,)) @@ -167,7 +217,7 @@ def dumpkvs(): Logger.CURRENT.dumpkvs() def getkvs(): - return Logger.CURRENT.name2val + return Logger.CURRENT.name2val def log(*args, level=INFO): @@ -176,19 +226,15 @@ def log(*args, level=INFO): """ Logger.CURRENT.log(*args, level=level) - def debug(*args): log(*args, level=DEBUG) - def info(*args): log(*args, level=INFO) - def warn(*args): log(*args, level=WARN) - def error(*args): log(*args, level=ERROR) @@ -232,7 +278,8 @@ class Logger(object): def dumpkvs(self): if self.level == DISABLED: return for fmt in self.output_formats: - fmt.writekvs(self.name2val) + if isinstance(fmt, KVWriter): + fmt.writekvs(self.name2val) self.name2val.clear() def log(self, *args, level=INFO): @@ -255,34 +302,45 @@ class Logger(object): # ---------------------------------------- def _do_log(self, args): for fmt in self.output_formats: - fmt.writeseq(args) + if isinstance(fmt, SeqWriter): + fmt.writeseq(map(str, args)) Logger.DEFAULT = Logger.CURRENT = Logger(dir=None, output_formats=[HumanOutputFormat(sys.stdout)]) def configure(dir=None, format_strs=None): - assert Logger.CURRENT is Logger.DEFAULT,\ - "Only call logger.configure() when it's in the default state. Try calling logger.reset() first." 
- prevlogger = Logger.CURRENT if dir is None: dir = os.getenv('OPENAI_LOGDIR') if dir is None: - dir = osp.join(tempfile.gettempdir(), + dir = osp.join(tempfile.gettempdir(), datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f")) + assert isinstance(dir, str) + os.makedirs(dir, exist_ok=True) + if format_strs is None: - format_strs = LOG_OUTPUT_FORMATS + strs = os.getenv('OPENAI_LOG_FORMAT') + format_strs = strs.split(',') if strs else LOG_OUTPUT_FORMATS output_formats = [make_output_format(f, dir) for f in format_strs] + Logger.CURRENT = Logger(dir=dir, output_formats=output_formats) log('Logging to %s'%dir) -if os.getenv('OPENAI_LOGDIR'): - # if OPENAI_LOGDIR is set, configure the logger on import - # this kind of nasty (unexpected to user), but I don't know how else to inject the logger - # to a script that's getting run in a subprocess - configure(dir=os.getenv('OPENAI_LOGDIR')) - def reset(): - Logger.CURRENT = Logger.DEFAULT - log('Reset logger') + if Logger.CURRENT is not Logger.DEFAULT: + Logger.CURRENT.close() + Logger.CURRENT = Logger.DEFAULT + log('Reset logger') + +class scoped_configure(object): + def __init__(self, dir=None, format_strs=None): + self.dir = dir + self.format_strs = format_strs + self.prevlogger = None + def __enter__(self): + self.prevlogger = Logger.CURRENT + configure(dir=self.dir, format_strs=self.format_strs) + def __exit__(self, *args): + Logger.CURRENT.close() + Logger.CURRENT = self.prevlogger # ================================================================ @@ -294,14 +352,14 @@ def _demo(): dir = "/tmp/testlogging" if os.path.exists(dir): shutil.rmtree(dir) - with session(dir=dir): - logkv("a", 3) - logkv("b", 2.5) - dumpkvs() - logkv("b", -2.5) - logkv("a", 5.5) - dumpkvs() - info("^^^ should see a = 5.5") + configure(dir=dir) + logkv("a", 3) + logkv("b", 2.5) + dumpkvs() + logkv("b", -2.5) + logkv("a", 5.5) + dumpkvs() + info("^^^ should see a = 5.5") logkv("b", -2.5) dumpkvs() @@ -310,5 +368,55 @@ def _demo(): dumpkvs() +# ================================================================ +# Readers +# ================================================================ + +def read_json(fname): + import pandas + ds = [] + with open(fname, 'rt') as fh: + for line in fh: + ds.append(json.loads(line)) + return pandas.DataFrame(ds) + +def read_csv(fname): + import pandas + return pandas.read_csv(fname, index_col=None, comment='#') + +def read_tb(path): + """ + path : a tensorboard file OR a directory, where we will find all TB files + of the form events.* + """ + import pandas + import numpy as np + from glob import glob + from collections import defaultdict + import tensorflow as tf + if osp.isdir(path): + fnames = glob(osp.join(path, "events.*")) + elif osp.basename(path).startswith("events."): + fnames = [path] + else: + raise NotImplementedError("Expected tensorboard file or directory containing them. 
Got %s"%path) + tag2pairs = defaultdict(list) + maxstep = 0 + for fname in fnames: + for summary in tf.train.summary_iterator(fname): + if summary.step > 0: + for v in summary.summary.value: + pair = (summary.step, v.simple_value) + tag2pairs[v.tag].append(pair) + maxstep = max(summary.step, maxstep) + data = np.empty((maxstep, len(tag2pairs))) + data[:] = np.nan + tags = sorted(tag2pairs.keys()) + for (colidx,tag) in enumerate(tags): + pairs = tag2pairs[tag] + for (step, value) in pairs: + data[step-1, colidx] = value + return pandas.DataFrame(data, columns=tags) + if __name__ == "__main__": _demo() diff --git a/baselines/ppo1/cnn_policy.py b/baselines/ppo1/cnn_policy.py index 57160a2..85ab7b0 100644 --- a/baselines/ppo1/cnn_policy.py +++ b/baselines/ppo1/cnn_policy.py @@ -1,4 +1,3 @@ -from baselines.common.mpi_running_mean_std import RunningMeanStd import baselines.common.tf_util as U import tensorflow as tf import gym @@ -18,7 +17,7 @@ class CnnPolicy(object): sequence_length = None ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) - + x = ob / 255.0 if kind == 'small': # from A3C paper x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID")) diff --git a/baselines/ppo1/mlp_policy.py b/baselines/ppo1/mlp_policy.py index 4be3b19..4ec8439 100644 --- a/baselines/ppo1/mlp_policy.py +++ b/baselines/ppo1/mlp_policy.py @@ -18,7 +18,7 @@ class MlpPolicy(object): sequence_length = None ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) - + with tf.variable_scope("obfilter"): self.ob_rms = RunningMeanStd(shape=ob_space.shape) @@ -27,12 +27,12 @@ class MlpPolicy(object): for i in range(num_hid_layers): last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i"%(i+1), weight_init=U.normc_initializer(1.0))) self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:,0] - + last_out = obz for i in range(num_hid_layers): last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i"%(i+1), weight_init=U.normc_initializer(1.0))) if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): - mean = U.dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01)) + mean = U.dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01)) logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer()) pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1) else: diff --git a/baselines/ppo1/pposgd_simple.py b/baselines/ppo1/pposgd_simple.py index 1ecca8a..55d96ad 100644 --- a/baselines/ppo1/pposgd_simple.py +++ b/baselines/ppo1/pposgd_simple.py @@ -78,7 +78,7 @@ def add_vtarg_and_adv(seg, gamma, lam): seg["tdlamret"] = seg["adv"] + seg["vpred"] def learn(env, policy_func, *, - timesteps_per_batch, # timesteps per actor per update + timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers gamma, lam, # advantage estimation @@ -130,7 +130,7 @@ def learn(env, policy_func, *, # Prepare for rollouts # ---------------------------------------- - seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) + seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 diff --git a/baselines/ppo1/run_atari.py b/baselines/ppo1/run_atari.py index 97b8f71..1fd5e11 100644 --- 
a/baselines/ppo1/run_atari.py +++ b/baselines/ppo1/run_atari.py @@ -6,37 +6,34 @@ from baselines import bench import os.path as osp import gym, logging from baselines import logger +from baselines.common.atari_wrappers import make_atari, wrap_deepmind -def wrap_train(env): - from baselines.common.atari_wrappers import (wrap_deepmind, FrameStack) - env = wrap_deepmind(env, clip_rewards=True) - env = FrameStack(env, 4) - return env - -def train(env_id, num_frames, seed): +def train(env_id, num_timesteps, seed): from baselines.ppo1 import pposgd_simple, cnn_policy import baselines.common.tf_util as U rank = MPI.COMM_WORLD.Get_rank() sess = U.single_threaded_session() sess.__enter__() - if rank != 0: logger.set_level(logger.DISABLED) + if rank == 0: + logger.configure() + else: + logger.configure(format_strs=[]) workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() set_global_seeds(workerseed) - env = gym.make(env_id) + env = make_atari(env_id) def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613 return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space) - env = bench.Monitor(env, logger.get_dir() and - osp.join(logger.get_dir(), "%i.monitor.json" % rank)) + env = bench.Monitor(env, logger.get_dir() and + osp.join(logger.get_dir(), str(rank))) env.seed(workerseed) gym.logger.setLevel(logging.WARN) - env = wrap_train(env) - num_timesteps = int(num_frames / 4 * 1.1) + env = wrap_deepmind(env) env.seed(workerseed) pposgd_simple.learn(env, policy_fn, - max_timesteps=num_timesteps, - timesteps_per_batch=256, + max_timesteps=int(num_timesteps * 1.1), + timesteps_per_actorbatch=256, clip_param=0.2, entcoeff=0.01, optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64, gamma=0.99, lam=0.95, @@ -49,8 +46,9 @@ def main(): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--env', help='environment ID', default='PongNoFrameskip-v4') parser.add_argument('--seed', help='RNG seed', type=int, default=0) + parser.add_argument('--num-timesteps', type=int, default=int(10e6)) args = parser.parse_args() - train(args.env, num_frames=40e6, seed=args.seed) + train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) if __name__ == '__main__': main() diff --git a/baselines/ppo1/run_mujoco.py b/baselines/ppo1/run_mujoco.py index 1da6155..fc5479a 100644 --- a/baselines/ppo1/run_mujoco.py +++ b/baselines/ppo1/run_mujoco.py @@ -1,10 +1,8 @@ #!/usr/bin/env python from baselines.common import set_global_seeds, tf_util as U from baselines import bench -import os.path as osp import gym, logging from baselines import logger -import sys def train(env_id, num_timesteps, seed): from baselines.ppo1 import mlp_policy, pposgd_simple @@ -14,13 +12,12 @@ def train(env_id, num_timesteps, seed): def policy_fn(name, ob_space, ac_space): return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=64, num_hid_layers=2) - env = bench.Monitor(env, logger.get_dir() and - osp.join(logger.get_dir(), "monitor.json")) + env = bench.Monitor(env, logger.get_dir()) env.seed(seed) gym.logger.setLevel(logging.WARN) - pposgd_simple.learn(env, policy_fn, + pposgd_simple.learn(env, policy_fn, max_timesteps=num_timesteps, - timesteps_per_batch=2048, + timesteps_per_actorbatch=2048, clip_param=0.2, entcoeff=0.0, optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', @@ -32,8 +29,10 @@ def main(): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 
parser.add_argument('--env', help='environment ID', default='Hopper-v1') parser.add_argument('--seed', help='RNG seed', type=int, default=0) + parser.add_argument('--num-timesteps', type=int, default=int(1e6)) args = parser.parse_args() - train(args.env, num_timesteps=1e6, seed=args.seed) + logger.configure() + train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) if __name__ == '__main__': diff --git a/baselines/trpo_mpi/nosharing_cnn_policy.py b/baselines/trpo_mpi/nosharing_cnn_policy.py index 6f96cd5..19cddab 100644 --- a/baselines/trpo_mpi/nosharing_cnn_policy.py +++ b/baselines/trpo_mpi/nosharing_cnn_policy.py @@ -1,4 +1,3 @@ -from baselines.common.mpi_running_mean_std import RunningMeanStd import baselines.common.tf_util as U import tensorflow as tf import gym @@ -18,7 +17,7 @@ class CnnPolicy(object): sequence_length = None ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) - + obscaled = ob / 255.0 with tf.variable_scope("pol"): @@ -42,7 +41,7 @@ class CnnPolicy(object): self.state_out = [] stochastic = tf.placeholder(dtype=tf.bool, shape=()) - ac = self.pd.sample() # XXX + ac = self.pd.sample() self._act = U.function([stochastic, ob], [ac, self.vpred]) def act(self, stochastic, ob): diff --git a/baselines/trpo_mpi/run_atari.py b/baselines/trpo_mpi/run_atari.py index 0b1a7eb..fe1493d 100644 --- a/baselines/trpo_mpi/run_atari.py +++ b/baselines/trpo_mpi/run_atari.py @@ -1,45 +1,38 @@ -#!/usr/bin/env python + #!/usr/bin/env python from mpi4py import MPI from baselines.common import set_global_seeds import os.path as osp import gym, logging from baselines import logger from baselines import bench -import sys +from baselines.common.atari_wrappers import make_atari, wrap_deepmind -def wrap_train(env): - from baselines.common.atari_wrappers import (wrap_deepmind, FrameStack) - env = wrap_deepmind(env, clip_rewards=False) - env = FrameStack(env, 3) - return env - -def train(env_id, num_frames, seed): +def train(env_id, num_timesteps, seed): from baselines.trpo_mpi.nosharing_cnn_policy import CnnPolicy from baselines.trpo_mpi import trpo_mpi import baselines.common.tf_util as U rank = MPI.COMM_WORLD.Get_rank() sess = U.single_threaded_session() sess.__enter__() - if rank != 0: - logger.set_level(logger.DISABLED) - + if rank == 0: + logger.configure() + else: + logger.configure(format_strs=[]) workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() set_global_seeds(workerseed) - env = gym.make(env_id) + env = make_atari(env_id) def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613 return CnnPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space) - env = bench.Monitor(env, logger.get_dir() and - osp.join(logger.get_dir(), "%i.monitor.json"%rank)) + env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank))) env.seed(workerseed) gym.logger.setLevel(logging.WARN) - env = wrap_train(env) - num_timesteps = int(num_frames / 4 * 1.1) + env = wrap_deepmind(env) env.seed(workerseed) trpo_mpi.learn(env, policy_fn, timesteps_per_batch=512, max_kl=0.001, cg_iters=10, cg_damping=1e-3, - max_timesteps=num_timesteps, gamma=0.98, lam=1.0, vf_iters=3, vf_stepsize=1e-4, entcoeff=0.00) + max_timesteps=int(num_timesteps * 1.1), gamma=0.98, lam=1.0, vf_iters=3, vf_stepsize=1e-4, entcoeff=0.00) env.close() def main(): @@ -47,8 +40,9 @@ def main(): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--env', help='environment ID', 
default='PongNoFrameskip-v4') parser.add_argument('--seed', help='RNG seed', type=int, default=0) + parser.add_argument('--num-timesteps', type=int, default=int(10e6)) args = parser.parse_args() - train(args.env, num_frames=40e6, seed=args.seed) + train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) if __name__ == "__main__": diff --git a/baselines/trpo_mpi/run_mujoco.py b/baselines/trpo_mpi/run_mujoco.py index bab6aba..bbc631e 100644 --- a/baselines/trpo_mpi/run_mujoco.py +++ b/baselines/trpo_mpi/run_mujoco.py @@ -27,8 +27,8 @@ def train(env_id, num_timesteps, seed): def policy_fn(name, ob_space, ac_space): return MlpPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space, hid_size=32, num_hid_layers=2) - env = bench.Monitor(env, logger.get_dir() and - osp.join(logger.get_dir(), "%i.monitor.json" % rank)) + env = bench.Monitor(env, logger.get_dir() and + osp.join(logger.get_dir(), str(rank))) env.seed(workerseed) gym.logger.setLevel(logging.WARN) @@ -41,8 +41,10 @@ def main(): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--env', help='environment ID', default='Hopper-v1') parser.add_argument('--seed', help='RNG seed', type=int, default=0) + parser.add_argument('--num-timesteps', type=int, default=int(1e6)) args = parser.parse_args() - train(args.env, num_timesteps=1e6, seed=args.seed) + logger.configure() + train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) if __name__ == '__main__': diff --git a/baselines/trpo_mpi/trpo_mpi.py b/baselines/trpo_mpi/trpo_mpi.py index ef681c4..8a59c61 100644 --- a/baselines/trpo_mpi/trpo_mpi.py +++ b/baselines/trpo_mpi/trpo_mpi.py @@ -288,4 +288,4 @@ def learn(env, policy_func, *, logger.dump_tabular() def flatten_lists(listoflists): - return [el for list_ in listoflists for el in list_] + return [el for list_ in listoflists for el in list_] \ No newline at end of file
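
Finally, as a sketch (not part of the diff) of the per-rank logger setup that the refactored run_atari scripts above use in place of logger.set_level(logger.DISABLED):

from mpi4py import MPI
from baselines import logger

if MPI.COMM_WORLD.Get_rank() == 0:
    logger.configure()                 # rank 0 writes stdout/log/csv into the log directory
else:
    logger.configure(format_strs=[])   # other workers keep a logger but produce no output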