change atari preprocessing to use faster opencv

some logger changes
Author: John Schulman
Date:   2017-10-25 09:21:29 -04:00
commit  bb40378118
parent  4993286230

40 changed files with 600 additions and 823 deletions

.gitignore

@@ -30,3 +30,6 @@ src
 *.egg-info
 .cache
 MUJOCO_LOG.TXT

@@ -20,3 +20,14 @@ pip install -e .
 - [DQN](baselines/deepq)
 - [PPO](baselines/ppo1)
 - [TRPO](baselines/trpo_mpi)
+
+To cite this repository in publications:
+
+    @misc{baselines,
+      author = {Hesse, Christopher and Plappert, Matthias and Radford, Alec and Schulman, John and Sidor, Szymon and Wu, Yuhuai},
+      title = {OpenAI Baselines},
+      year = {2017},
+      publisher = {GitHub},
+      journal = {GitHub repository},
+      howpublished = {\url{https://github.com/openai/baselines}},
+    }

@@ -1,9 +1,6 @@
 import numpy as np
 import tensorflow as tf
-from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm, sample, check_shape
-from baselines.common.distributions import make_pdtype
-import baselines.common.tf_util as U
-import gym
+from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm, sample

 class LnLstmPolicy(object):
     def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=256, reuse=False):

@@ -5,18 +5,15 @@ from baselines.common import set_global_seeds
 from baselines import bench
 from baselines.a2c.a2c import learn
 from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
-from baselines.common.atari_wrappers import wrap_deepmind
+from baselines.common.atari_wrappers import make_atari, wrap_deepmind
 from baselines.a2c.policies import CnnPolicy, LstmPolicy, LnLstmPolicy

-def train(env_id, num_frames, seed, policy, lrschedule, num_cpu):
-    num_timesteps = int(num_frames / 4 * 1.1)
-    # divide by 4 due to frameskip, then do a little extras so episodes end
+def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu):
     def make_env(rank):
         def _thunk():
-            env = gym.make(env_id)
+            env = make_atari(env_id)
             env.seed(seed + rank)
-            env = bench.Monitor(env, logger.get_dir() and
-                os.path.join(logger.get_dir(), "{}.monitor.json".format(rank)))
+            env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
             gym.logger.setLevel(logging.WARN)
             return wrap_deepmind(env)
         return _thunk
@@ -28,7 +25,7 @@ def train(env_id, num_frames, seed, policy, lrschedule, num_cpu):
         policy_fn = LstmPolicy
     elif policy == 'lnlstm':
         policy_fn = LnLstmPolicy
-    learn(policy_fn, env, seed, total_timesteps=num_timesteps, lrschedule=lrschedule)
+    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule)
     env.close()

 def main():
@@ -38,10 +35,10 @@ def main():
     parser.add_argument('--seed', help='RNG seed', type=int, default=0)
     parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm'], default='cnn')
     parser.add_argument('--lrschedule', help='Learning rate schedule', choices=['constant', 'linear'], default='constant')
-    parser.add_argument('--million_frames', help='How many frames to train (/ 1e6). '
-        'This number gets divided by 4 due to frameskip', type=int, default=40)
+    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
     args = parser.parse_args()
-    train(args.env, num_frames=1e6 * args.million_frames, seed=args.seed,
+    logger.configure()
+    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed,
           policy=args.policy, lrschedule=args.lrschedule, num_cpu=16)

 if __name__ == '__main__':
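
The old --million_frames flag counted emulator frames, which the wrapped env consumes four at a time; the new --num-timesteps flag counts agent steps directly, so the defaults line up. A hedged sanity check (illustrative, not code from the repo):

    old_default_frames = 40 * int(1e6)                    # --million_frames default of 40
    frameskip = 4                                         # MaxAndSkipEnv(skip=4)
    assert old_default_frames // frameskip == int(10e6)   # new --num-timesteps default
    # the old "* 1.1" slack now lives inside the learn() call:
    # learn(..., total_timesteps=int(num_timesteps * 1.1), ...)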

@@ -110,7 +110,6 @@ def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps,
        ob_no = np.concatenate([path["observation"] for path in paths])
        action_na = np.concatenate([path["action"] for path in paths])
        oldac_dist = np.concatenate([path["action_dist"] for path in paths])
-       logp_n = np.concatenate([path["logp"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)
@@ -126,7 +125,7 @@ def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps,
            U.eval(tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)))
        elif kl < desired_kl / 2:
            logger.log("kl too low")
            U.eval(tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)))
        else:
            logger.log("kl just right!")
@@ -138,3 +137,6 @@ def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps,
            callback()
        logger.dump_tabular()
        i += 1
+
+    coord.request_stop()
+    coord.join(enqueue_threads)
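
The middle hunk keeps the adaptive step-size rule around the KL check. As a standalone hedged sketch of that logic (the bound values are illustrative, not the repo's):

    def adapt_stepsize(stepsize, kl, desired_kl, min_stepsize=1e-8, max_stepsize=1e-2):
        if kl > desired_kl * 2:
            return max(min_stepsize, stepsize / 1.5)   # "kl too high": shrink the step
        elif kl < desired_kl / 2:
            return min(max_stepsize, stepsize * 1.5)   # "kl too low": grow the step
        return stepsize                                # "kl just right!"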

@@ -113,7 +113,6 @@ class Runner(object):
        nenv = env.num_envs
        self.batch_ob_shape = (nenv*nsteps, nh, nw, nc*nstack)
        self.obs = np.zeros((nenv, nh, nw, nc*nstack), dtype=np.uint8)
-       self.nc = nc
        obs = env.reset()
        self.update_obs(obs)
        self.gamma = gamma
@@ -122,8 +121,8 @@ class Runner(object):
        self.dones = [False for _ in range(nenv)]

    def update_obs(self, obs):
-        self.obs = np.roll(self.obs, shift=-self.nc, axis=3)
-        self.obs[:, :, :, -self.nc:] = obs
+        self.obs = np.roll(self.obs, shift=-1, axis=3)
+        self.obs[:, :, :, -1] = obs[:, :, :, 0]

    def run(self):
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
@@ -189,7 +188,8 @@ def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval
    runner = Runner(env, model, nsteps=nsteps, nstack=nstack, gamma=gamma)
    nbatch = nenvs*nsteps
    tstart = time.time()
-    enqueue_threads = model.q_runner.create_threads(model.sess, coord=tf.train.Coordinator(), start=True)
+    coord = tf.train.Coordinator()
+    enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True)
    for update in range(1, total_timesteps//nbatch+1):
        obs, states, rewards, masks, actions, values = runner.run()
        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
@@ -211,5 +211,6 @@ def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval
            savepath = osp.join(logger.get_dir(), 'checkpoint%.5i'%update)
            print('Saving to', savepath)
            model.save(savepath)
+    coord.request_stop()
+    coord.join(enqueue_threads)
    env.close()
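
Holding the Coordinator in a variable is what lets the same object that starts the enqueue threads also stop and join them before the env closes. A minimal self-contained TF 1.x sketch of that lifecycle (the queue below is a stand-in, not the repo's KFAC queue runner):

    import tensorflow as tf

    queue = tf.FIFOQueue(capacity=10, dtypes=[tf.float32])
    enqueue_op = queue.enqueue([tf.random_normal([])])
    q_runner = tf.train.QueueRunner(queue, [enqueue_op])

    sess = tf.Session()
    coord = tf.train.Coordinator()
    enqueue_threads = q_runner.create_threads(sess, coord=coord, start=True)
    try:
        for _ in range(5):
            sess.run(queue.dequeue())   # stands in for the training loop
    finally:
        coord.request_stop()
        coord.join(enqueue_threads)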

@@ -1,9 +1,7 @@
 import numpy as np
 import tensorflow as tf
 from baselines.acktr.utils import conv, fc, dense, conv_to_fc, sample, kl_div
-from baselines.common.distributions import make_pdtype
 import baselines.common.tf_util as U
-import gym

 class CnnPolicy(object):
@@ -51,7 +49,6 @@ class GaussianMlpPolicy(object):
        oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac") # batch of actions previous actions
        oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim*2], name="oldac_dist") # batch of actions previous action distributions
        adv_n = tf.placeholder(tf.float32, shape=[None], name="adv") # advantage function estimate
-       oldlogprob_n = tf.placeholder(tf.float32, shape=[None], name='oldlogprob') # log probability of previous actions
        wd_dict = {}
        h1 = tf.nn.tanh(dense(ob_no, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
        h2 = tf.nn.tanh(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))

@@ -5,24 +5,22 @@ from baselines.common import set_global_seeds
 from baselines import bench
 from baselines.acktr.acktr_disc import learn
 from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
-from baselines.common.atari_wrappers import wrap_deepmind
+from baselines.common.atari_wrappers import make_atari, wrap_deepmind
 from baselines.acktr.policies import CnnPolicy

-def train(env_id, num_frames, seed, num_cpu):
-    num_timesteps = int(num_frames / 4 * 1.1)
+def train(env_id, num_timesteps, seed, num_cpu):
     def make_env(rank):
         def _thunk():
-            env = gym.make(env_id)
+            env = make_atari(env_id)
             env.seed(seed + rank)
-            if logger.get_dir():
-                env = bench.Monitor(env, os.path.join(logger.get_dir(), "{}.monitor.json".format(rank)))
+            env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
             gym.logger.setLevel(logging.WARN)
             return wrap_deepmind(env)
         return _thunk
     set_global_seeds(seed)
     env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
     policy_fn = CnnPolicy
-    learn(policy_fn, env, seed, total_timesteps=num_timesteps, nprocs=num_cpu)
+    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu)
     env.close()

 def main():
@@ -30,10 +28,10 @@ def main():
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
     parser.add_argument('--seed', help='RNG seed', type=int, default=0)
-    parser.add_argument('--million_frames', help='How many frames to train (/ 1e6). '
-        'This number gets divided by 4 due to frameskip', type=int, default=40)
+    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
     args = parser.parse_args()
-    train(args.env, num_frames=1e6 * args.million_frames, seed=args.seed, num_cpu=32)
+    logger.configure()
+    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, num_cpu=32)

 if __name__ == '__main__':
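
For reference, the refactored entry point can also be driven without the argument parser. A hedged sketch (the values are illustrative; logger.configure() is assumed to pick a default log directory when called with no arguments):

    from baselines import logger
    from baselines.acktr.run_atari import train

    logger.configure()
    train('BreakoutNoFrameskip-v4', num_timesteps=int(10e6), seed=0, num_cpu=32)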

@@ -13,13 +13,12 @@ from baselines.acktr.value_functions import NeuralNetValueFunction

 def train(env_id, num_timesteps, seed):
     env=gym.make(env_id)
-    if logger.get_dir():
-        env = bench.Monitor(env, os.path.join(logger.get_dir(), "monitor.json"))
+    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
     set_global_seeds(seed)
     env.seed(seed)
     gym.logger.setLevel(logging.WARN)
-    with tf.Session(config=tf.ConfigProto()) as session:
+    with tf.Session(config=tf.ConfigProto()):
         ob_dim = env.observation_space.shape[0]
         ac_dim = env.action_space.shape[0]
         with tf.variable_scope("vf"):
@@ -38,5 +37,7 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Run Mujoco benchmark.')
     parser.add_argument('--seed', help='RNG seed', type=int, default=0)
     parser.add_argument('--env', help='environment ID', type=str, default="Reacher-v1")
+    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
     args = parser.parse_args()
-    train(args.env, num_timesteps=1e6, seed=args.seed)
+    logger.configure()
+    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)

@@ -13,7 +13,7 @@ class NeuralNetValueFunction(object):
        wd_dict = {}
        h1 = tf.nn.elu(dense(X, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
        h2 = tf.nn.elu(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
-       vpred_n = dense(h2, 1, "hfinal", weight_init=None, bias_init=0, weight_loss_dict=wd_dict)[:,0]
+       vpred_n = dense(h2, 1, "hfinal", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)[:,0]
        sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
        wd_loss = tf.get_collection("vf_losses", None)
        loss = U.mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
@@ -22,7 +22,7 @@ class NeuralNetValueFunction(object):
        optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9, \
                    clip_kl=0.3, epsilon=0.1, stats_decay=0.95, \
                    async=1, kfac_update=2, cold_iter=50, \
-                   weight_decay_dict=wd_dict, max_grad_norm=1.0)
+                   weight_decay_dict=wd_dict, max_grad_norm=None)
        vf_var_list = []
        for var in tf.trainable_variables():
            if "vf" in var.name:

@@ -1,3 +1,3 @@
 from baselines.bench.benchmarks import *
 from baselines.bench.monitor import *
-from baselines.bench.simple_bench import simple_bench

@@ -1,61 +1,71 @@
+import os.path as osp
+
 _atari7 = ['BeamRider', 'Breakout', 'Enduro', 'Pong', 'Qbert', 'Seaquest', 'SpaceInvaders']
 _atariexpl7 = ['Freeway', 'Gravitar', 'MontezumaRevenge', 'Pitfall', 'PrivateEye', 'Solaris', 'Venture']

 _BENCHMARKS = []

 def register_benchmark(benchmark):
     for b in _BENCHMARKS:
         if b['name'] == benchmark['name']:
-            raise ValueError('Benchmark with name %s already registered!'%b['name'])
+            raise ValueError('Benchmark with name %s already registered!' % b['name'])
     _BENCHMARKS.append(benchmark)

 def list_benchmarks():
     return [b['name'] for b in _BENCHMARKS]

 def get_benchmark(benchmark_name):
     for b in _BENCHMARKS:
         if b['name'] == benchmark_name:
             return b
     raise ValueError('%s not found! Known benchmarks: %s' % (benchmark_name, list_benchmarks()))

 def get_task(benchmark, env_id):
     """Get a task by env_id. Return None if the benchmark doesn't have the env"""
     return next(filter(lambda task: task['env_id'] == env_id, benchmark['tasks']), None)

 def find_task_for_env_id_in_any_benchmark(env_id):
     for bm in _BENCHMARKS:
         for task in bm["tasks"]:
-            if task["env_id"]==env_id:
+            if task["env_id"] == env_id:
                 return bm, task
     return None, None

 _ATARI_SUFFIX = 'NoFrameskip-v4'

 register_benchmark({
-    'name' : 'Atari200M',
-    'description' :'7 Atari games from Mnih et al. (2013), with pixel observations, 200M frames',
-    'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 2, 'num_timesteps' : int(200e6)} for _game in _atari7]
+    'name': 'Atari50M',
+    'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 50M timesteps',
+    'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(50e6)} for _game in _atari7]
 })

 register_benchmark({
-    'name' : 'Atari40M',
-    'description' :'7 Atari games from Mnih et al. (2013), with pixel observations, 40M frames',
-    'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 2, 'num_timesteps' : int(40e6)} for _game in _atari7]
+    'name': 'Atari10M',
+    'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 10M timesteps',
+    'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atari7]
 })

 register_benchmark({
-    'name' : 'Atari1Hr',
-    'description' :'7 Atari games from Mnih et al. (2013), with pixel observations, 1 hour of walltime',
-    'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 2, 'num_seconds' : 60*60} for _game in _atari7]
+    'name': 'Atari1Hr',
+    'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 1 hour of walltime',
+    'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_seconds': 60 * 60} for _game in _atari7]
 })

 register_benchmark({
-    'name' : 'AtariExploration40M',
-    'description' :'7 Atari games emphasizing exploration, with pixel observations, 40M frames',
-    'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 2, 'num_timesteps' : int(40e6)} for _game in _atariexpl7]
+    'name': 'AtariExploration10M',
+    'description': '7 Atari games emphasizing exploration, with pixel observations, 10M timesteps',
+    'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atariexpl7]
 })

 # MuJoCo
 _mujocosmall = [
@@ -63,78 +73,60 @@ _mujocosmall = [
     'HalfCheetah-v1', 'Hopper-v1', 'Walker2d-v1',
     'Reacher-v1', 'Swimmer-v1']

 register_benchmark({
-    'name' : 'Mujoco1M',
-    'description' : 'Some small 2D MuJoCo tasks, run for 1M timesteps',
-    'tasks' : [{'env_id' : _envid, 'trials' : 3, 'num_timesteps' : int(1e6)} for _envid in _mujocosmall]
+    'name': 'Mujoco1M',
+    'description': 'Some small 2D MuJoCo tasks, run for 1M timesteps',
+    'tasks': [{'env_id': _envid, 'trials': 3, 'num_timesteps': int(1e6)} for _envid in _mujocosmall]
 })

 register_benchmark({
-    'name' : 'MujocoWalkers',
-    'description' : 'MuJoCo forward walkers, run for 8M, humanoid 100M',
-    'tasks' : [
-        {'env_id' : "Hopper-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
-        {'env_id' : "Walker2d-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
-        {'env_id' : "Humanoid-v1", 'trials' : 4, 'num_timesteps' : 100*1000000 },
+    'name': 'MujocoWalkers',
+    'description': 'MuJoCo forward walkers, run for 8M, humanoid 100M',
+    'tasks': [
+        {'env_id': "Hopper-v1", 'trials': 4, 'num_timesteps': 8 * 1000000},
+        {'env_id': "Walker2d-v1", 'trials': 4, 'num_timesteps': 8 * 1000000},
+        {'env_id': "Humanoid-v1", 'trials': 4, 'num_timesteps': 100 * 1000000},
     ]
 })
-# To reproduce:
-# python3 baselines/baselines/ppo2/ppo2_run_benchmark.py gce MujocoWalkers myrun_ppo2_whiteobs1_cpu8
-# (observation input filters necessary)

 # Roboschool

 register_benchmark({
-    'name' : 'Roboschool8M',
-    'description' : 'Small 2D tasks, up to 30 minutes to complete on 8 cores',
-    'tasks' : [
-        {'env_id' : "RoboschoolReacher-v1", 'trials' : 4, 'num_timesteps' : 2*1000000 },
-        {'env_id' : "RoboschoolAnt-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
-        {'env_id' : "RoboschoolHalfCheetah-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
-        {'env_id' : "RoboschoolHopper-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
-        {'env_id' : "RoboschoolWalker2d-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
+    'name': 'Roboschool8M',
+    'description': 'Small 2D tasks, up to 30 minutes to complete on 8 cores',
+    'tasks': [
+        {'env_id': "RoboschoolReacher-v1", 'trials': 4, 'num_timesteps': 2 * 1000000},
+        {'env_id': "RoboschoolAnt-v1", 'trials': 4, 'num_timesteps': 8 * 1000000},
+        {'env_id': "RoboschoolHalfCheetah-v1", 'trials': 4, 'num_timesteps': 8 * 1000000},
+        {'env_id': "RoboschoolHopper-v1", 'trials': 4, 'num_timesteps': 8 * 1000000},
+        {'env_id': "RoboschoolWalker2d-v1", 'trials': 4, 'num_timesteps': 8 * 1000000},
     ]
 })

 register_benchmark({
-    'name' : 'RoboschoolHarder',
-    'description' : 'Test your might!!! Up to 12 hours on 32 cores',
-    'tasks' : [
-        {'env_id' : "RoboschoolHumanoid-v1", 'trials' : 4, 'num_timesteps' : 100*1000000 },
-        {'env_id' : "RoboschoolHumanoidFlagrun-v1", 'trials' : 4, 'num_timesteps' : 200*1000000 },
-        {'env_id' : "RoboschoolHumanoidFlagrunHarder-v1", 'trials' : 4, 'num_timesteps' : 400*1000000 },
+    'name': 'RoboschoolHarder',
+    'description': 'Test your might!!! Up to 12 hours on 32 cores',
+    'tasks': [
+        {'env_id': "RoboschoolHumanoid-v1", 'trials': 4, 'num_timesteps': 100 * 1000000},
+        {'env_id': "RoboschoolHumanoidFlagrun-v1", 'trials': 4, 'num_timesteps': 200 * 1000000},
+        {'env_id': "RoboschoolHumanoidFlagrunHarder-v1", 'trials': 4, 'num_timesteps': 400 * 1000000},
     ]
 })
-# To reproduce:
-# python3 baselines/baselines/ppo2/ppo2_run_benchmark.py gce Roboschool8M myrun_ppo2_cpu8
-# python3 baselines/baselines/ppo2/ppo2_run_benchmark.py gce RoboschoolHarder myrun_ppo2_cpu32_large_samples65536
-# (Large network, train on 65536 samples each iteration. Also, _large is really necessary only for Harder)

 # Other

-_atari50 = [ # actually 49
+_atari50 = [ # actually 47
     'Alien', 'Amidar', 'Assault', 'Asterix', 'Asteroids',
     'Atlantis', 'BankHeist', 'BattleZone', 'BeamRider', 'Bowling',
-    'Boxing', 'Breakout', 'Centipede', 'ChopperCommand', 'CrazyClimber',
+    'Breakout', 'Centipede', 'ChopperCommand', 'CrazyClimber',
     'DemonAttack', 'DoubleDunk', 'Enduro', 'FishingDerby', 'Freeway',
     'Frostbite', 'Gopher', 'Gravitar', 'IceHockey', 'Jamesbond',
     'Kangaroo', 'Krull', 'KungFuMaster', 'MontezumaRevenge', 'MsPacman',
     'NameThisGame', 'Pitfall', 'Pong', 'PrivateEye', 'Qbert',
-    'Riverraid', 'RoadRunner', 'Robotank', 'Seaquest', 'SpaceInvaders',
-    'StarGunner', 'Tennis', 'TimePilot', 'Tutankham', 'UpNDown',
-    'Venture', 'VideoPinball', 'WizardOfWor', 'Zaxxon',
+    'RoadRunner', 'Robotank', 'Seaquest', 'SpaceInvaders', 'StarGunner',
+    'Tennis', 'TimePilot', 'Tutankham', 'UpNDown', 'Venture',
+    'VideoPinball', 'WizardOfWor', 'Zaxxon',
 ]

 register_benchmark({
-    'name' : 'Atari50_40M',
-    'description' :'7 Atari games from Mnih et al. (2013), with pixel observations, 40M frames',
-    'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 3, 'num_timesteps' : int(40e6)} for _game in _atari50]
+    'name': 'Atari50_10M',
+    'description': '47 Atari games from Mnih et al. (2013), with pixel observations, 10M timesteps',
+    'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 3, 'num_timesteps': int(10e6)} for _game in _atari50]
 })
-
-def env_shortname(s):
-    "Make typical names above shorter, while keeping recognizable"
-    s = s.replace("NoFrameskip", "")
-    if s[:10]=="Roboschool": s = s[10:]
-    i = s.rfind("-v")
-    if i!=-1: s = s[:i]
-    return s.lower()
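
A hedged sketch of how the registry above is typically queried:

    from baselines.bench import benchmarks

    bm = benchmarks.get_benchmark('Atari10M')
    for task in bm['tasks']:
        print(task['env_id'], task['trials'], task['num_timesteps'])

    # get_task returns None when the env is not part of the benchmark
    pong = benchmarks.get_task(bm, 'PongNoFrameskip-v4')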

@@ -2,20 +2,17 @@ __all__ = ['Monitor', 'get_monitor_files', 'load_results']

 import gym
 from gym.core import Wrapper
-from os import path
 import time
 from glob import glob
-try:
-    import ujson as json # Not necessary for monitor writing, but very useful for monitor loading
-except ImportError:
-    import json
+import csv
+import os.path as osp
+import json

 class Monitor(Wrapper):
-    EXT = "monitor.json"
+    EXT = "monitor.csv"
     f = None

-    def __init__(self, env, filename, allow_early_resets=False):
+    def __init__(self, env, filename, allow_early_resets=False, reset_keywords=()):
         Wrapper.__init__(self, env=env)
         self.tstart = time.time()
         if filename is None:
@@ -23,50 +20,38 @@ class Monitor(Wrapper):
             self.logger = None
         else:
             if not filename.endswith(Monitor.EXT):
-                filename = filename + "." + Monitor.EXT
+                if osp.isdir(filename):
+                    filename = osp.join(filename, Monitor.EXT)
+                else:
+                    filename = filename + "." + Monitor.EXT
             self.f = open(filename, "wt")
-            self.logger = JSONLogger(self.f)
-            self.logger.writekvs({"t_start": self.tstart, "gym_version": gym.__version__,
-                "env_id": env.spec.id if env.spec else 'Unknown'})
+            self.f.write('#%s\n'%json.dumps({"t_start": self.tstart, "gym_version": gym.__version__,
+                "env_id": env.spec.id if env.spec else 'Unknown'}))
+            self.logger = csv.DictWriter(self.f, fieldnames=('r', 'l', 't')+reset_keywords)
+            self.logger.writeheader()
+        self.reset_keywords = reset_keywords
         self.allow_early_resets = allow_early_resets
         self.rewards = None
         self.needs_reset = True
         self.episode_rewards = []
         self.episode_lengths = []
         self.total_steps = 0
-        self.current_metadata = {} # extra info that gets injected into each log entry
-        # Useful for metalearning where we're modifying the environment externally
-        # But want our logs to know about these modifications
+        self.current_reset_info = {} # extra info about the current episode, that was passed in during reset()

-    def __getstate__(self): # XXX
-        d = self.__dict__.copy()
-        if self.f:
-            del d['f'], d['logger']
-            d['_filename'] = self.f.name
-            d['_num_episodes'] = len(self.episode_rewards)
-        else:
-            d['_filename'] = None
-        return d
-
-    def __setstate__(self, d):
-        filename = d.pop('_filename')
-        self.__dict__ = d
-        if filename is not None:
-            nlines = d.pop('_num_episodes') + 1
-            self.f = open(filename, "r+t")
-            for _ in range(nlines):
-                self.f.readline()
-            self.f.truncate()
-            self.logger = JSONLogger(self.f)
-
-    def reset(self):
+    def _reset(self, **kwargs):
         if not self.allow_early_resets and not self.needs_reset:
             raise RuntimeError("Tried to reset an environment before done. If you want to allow early resets, wrap your env with Monitor(env, path, allow_early_resets=True)")
         self.rewards = []
         self.needs_reset = False
-        return self.env.reset()
+        for k in self.reset_keywords:
+            v = kwargs.get(k)
+            if v is None:
+                raise ValueError('Expected you to pass kwarg %s into reset'%k)
+            self.current_reset_info[k] = v
+        return self.env.reset(**kwargs)

-    def step(self, action):
+    def _step(self, action):
         if self.needs_reset:
             raise RuntimeError("Tried to step environment that needs reset")
         ob, rew, done, info = self.env.step(action)
@@ -75,10 +60,11 @@ class Monitor(Wrapper):
             self.needs_reset = True
             eprew = sum(self.rewards)
             eplen = len(self.rewards)
-            epinfo = {"r": eprew, "l": eplen, "t": round(time.time() - self.tstart, 6)}
-            epinfo.update(self.current_metadata)
+            epinfo = {"r": round(eprew, 6), "l": eplen, "t": round(time.time() - self.tstart, 6)}
+            epinfo.update(self.current_reset_info)
             if self.logger:
-                self.logger.writekvs(epinfo)
+                self.logger.writerow(epinfo)
+                self.f.flush()
             self.episode_rewards.append(eprew)
             self.episode_lengths.append(eplen)
             info['episode'] = epinfo
@@ -98,52 +84,40 @@ class Monitor(Wrapper):
     def get_episode_lengths(self):
         return self.episode_lengths

-class JSONLogger(object):
-    def __init__(self, file):
-        self.file = file
-
-    def writekvs(self, kvs):
-        for k,v in kvs.items():
-            if hasattr(v, 'dtype'):
-                v = v.tolist()
-                kvs[k] = float(v)
-        self.file.write(json.dumps(kvs) + '\n')
-        self.file.flush()
-
 class LoadMonitorResultsError(Exception):
     pass

 def get_monitor_files(dir):
-    return glob(path.join(dir, "*" + Monitor.EXT))
+    return glob(osp.join(dir, "*" + Monitor.EXT))

-def load_results(dir, raw_episodes=False):
-    fnames = get_monitor_files(dir)
-    if not fnames:
-        raise LoadMonitorResultsError("no monitor files of the form *%s found in %s" % (Monitor.EXT, dir))
-    episodes = []
-    headers = []
-    for fname in fnames:
-        with open(fname, 'rt') as fh:
-            lines = fh.readlines()
-        header = json.loads(lines[0])
-        headers.append(header)
-        for line in lines[1:]:
-            episode = json.loads(line)
-            episode['abstime'] = header['t_start'] + episode['t']
-            del episode['t']
-            episodes.append(episode)
-    header0 = headers[0]
-    for header in headers[1:]:
-        assert header['env_id'] == header0['env_id'], "mixing data from two envs"
-    episodes = sorted(episodes, key=lambda e: e['abstime'])
-    if raw_episodes:
-        return episodes
-    else:
-        return {
-            'env_info': {'env_id': header0['env_id'], 'gym_version': header0['gym_version']},
-            'episode_end_times': [e['abstime'] for e in episodes],
-            'episode_lengths': [e['l'] for e in episodes],
-            'episode_rewards': [e['r'] for e in episodes],
-            'initial_reset_time': min([min(header['t_start'] for header in headers)])
-        }
+def load_results(dir):
+    import pandas
+    monitor_files = glob(osp.join(dir, "*monitor.*")) # get both csv and (old) json files
+    if not monitor_files:
+        raise LoadMonitorResultsError("no monitor files of the form *%s found in %s" % (Monitor.EXT, dir))
+    dfs = []
+    headers = []
+    for fname in monitor_files:
+        with open(fname, 'rt') as fh:
+            if fname.endswith('csv'):
+                firstline = fh.readline()
+                assert firstline[0] == '#'
+                header = json.loads(firstline[1:])
+                df = pandas.read_csv(fh, index_col=None)
+                headers.append(header)
+            elif fname.endswith('json'): # Deprecated json format
+                episodes = []
+                lines = fh.readlines()
+                header = json.loads(lines[0])
+                headers.append(header)
+                for line in lines[1:]:
+                    episode = json.loads(line)
+                    episodes.append(episode)
+                df = pandas.DataFrame(episodes)
+            df['t'] += header['t_start']
+        dfs.append(df)
+    df = pandas.concat(dfs)
+    df.sort_values('t', inplace=True)
+    df['t'] -= min(header['t_start'] for header in headers)
+    df.headers = headers # HACK to preserve backwards compatibility
+    return df
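
With this change each Monitor writes a *.monitor.csv file: a single '#'-prefixed JSON header line (t_start, gym_version, env_id) followed by CSV rows with episode reward r, length l, elapsed time t, and any reset_keywords columns, and load_results now returns a pandas DataFrame instead of a dict of lists. A hedged sketch of reading results back (the directory is hypothetical):

    from baselines.bench.monitor import load_results

    df = load_results('/tmp/my_experiment')   # hypothetical log dir containing *.monitor.csv
    print(df['r'].mean(), df['l'].sum())      # episode rewards / lengths as pandas columns
    print(df.headers[0]['env_id'])            # JSON headers kept for backwards compatibility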

@@ -1,9 +1,8 @@
 import numpy as np
 from collections import deque
-from PIL import Image
 import gym
 from gym import spaces
+import cv2

 class NoopResetEnv(gym.Wrapper):
     def __init__(self, env, noop_max=30):
@@ -13,11 +12,16 @@ class NoopResetEnv(gym.Wrapper):
         gym.Wrapper.__init__(self, env)
         self.noop_max = noop_max
         self.override_num_noops = None
-        assert env.unwrapped.get_action_meanings()[0] == 'NOOP'
+        if isinstance(env.action_space, gym.spaces.MultiBinary):
+            self.noop_action = np.zeros(self.env.action_space.n, dtype=np.int64)
+        else:
+            # used for atari environments
+            self.noop_action = 0
+            assert env.unwrapped.get_action_meanings()[0] == 'NOOP'

-    def _reset(self):
+    def _reset(self, **kwargs):
         """ Do no-op action for a number of steps in [1, noop_max]."""
-        self.env.reset()
+        self.env.reset(**kwargs)
         if self.override_num_noops is not None:
             noops = self.override_num_noops
         else:
@@ -25,9 +29,9 @@ class NoopResetEnv(gym.Wrapper):
         assert noops > 0
         obs = None
         for _ in range(noops):
-            obs, _, done, _ = self.env.step(0)
+            obs, _, done, _ = self.env.step(self.noop_action)
             if done:
-                obs = self.env.reset()
+                obs = self.env.reset(**kwargs)
         return obs

 class FireResetEnv(gym.Wrapper):
@@ -37,14 +41,14 @@ class FireResetEnv(gym.Wrapper):
         assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
         assert len(env.unwrapped.get_action_meanings()) >= 3

-    def _reset(self):
-        self.env.reset()
+    def _reset(self, **kwargs):
+        self.env.reset(**kwargs)
         obs, _, done, _ = self.env.step(1)
         if done:
-            self.env.reset()
+            self.env.reset(**kwargs)
         obs, _, done, _ = self.env.step(2)
         if done:
-            self.env.reset()
+            self.env.reset(**kwargs)
         return obs

 class EpisodicLifeEnv(gym.Wrapper):
@@ -70,13 +74,13 @@ class EpisodicLifeEnv(gym.Wrapper):
         self.lives = lives
         return obs, reward, done, info

-    def _reset(self):
+    def _reset(self, **kwargs):
         """Reset only when lives are exhausted.
         This way all states are still reachable even though lives are episodic,
         and the learner need not know about any of this behind-the-scenes.
         """
         if self.was_real_done:
-            obs = self.env.reset()
+            obs = self.env.reset(**kwargs)
         else:
             # no-op step to advance from terminal/lost life state
             obs, _, _, _ = self.env.step(0)
@@ -88,30 +92,26 @@ class MaxAndSkipEnv(gym.Wrapper):
         """Return only every `skip`-th frame"""
         gym.Wrapper.__init__(self, env)
         # most recent raw observations (for max pooling across time steps)
-        self._obs_buffer = deque(maxlen=2)
+        self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype='uint8')
         self._skip = skip

     def _step(self, action):
         """Repeat action, sum reward, and max over last observations."""
         total_reward = 0.0
         done = None
-        for _ in range(self._skip):
+        for i in range(self._skip):
             obs, reward, done, info = self.env.step(action)
-            self._obs_buffer.append(obs)
+            if i == self._skip - 2: self._obs_buffer[0] = obs
+            if i == self._skip - 1: self._obs_buffer[1] = obs
             total_reward += reward
             if done:
                 break
-        max_frame = np.max(np.stack(self._obs_buffer), axis=0)
+        # Note that the observation on the done=True frame
+        # doesn't matter
+        max_frame = self._obs_buffer.max(axis=0)

         return max_frame, total_reward, done, info

-    def _reset(self):
-        """Clear past frame buffer and init. to first obs. from inner env."""
-        self._obs_buffer.clear()
-        obs = self.env.reset()
-        self._obs_buffer.append(obs)
-        return obs
-
 class ClipRewardEnv(gym.RewardWrapper):
     def _reward(self, reward):
         """Bin reward to {+1, 0, -1} by its sign."""
@@ -121,52 +121,89 @@ class WarpFrame(gym.ObservationWrapper):
     def __init__(self, env):
         """Warp frames to 84x84 as done in the Nature paper and later work."""
         gym.ObservationWrapper.__init__(self, env)
-        self.res = 84
-        self.observation_space = spaces.Box(low=0, high=255, shape=(self.res, self.res, 1))
+        self.width = 84
+        self.height = 84
+        self.observation_space = spaces.Box(low=0, high=255, shape=(self.height, self.width, 1))

-    def _observation(self, obs):
-        frame = np.dot(obs.astype('float32'), np.array([0.299, 0.587, 0.114], 'float32'))
-        frame = np.array(Image.fromarray(frame).resize((self.res, self.res),
-            resample=Image.BILINEAR), dtype=np.uint8)
-        return frame.reshape((self.res, self.res, 1))
+    def _observation(self, frame):
+        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
+        frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA)
+        return frame[:, :, None]

 class FrameStack(gym.Wrapper):
     def __init__(self, env, k):
-        """Buffer observations and stack across channels (last axis)."""
+        """Stack k last frames.
+
+        Returns lazy array, which is much more memory efficient.
+
+        See Also
+        --------
+        baselines.common.atari_wrappers.LazyFrames
+        """
         gym.Wrapper.__init__(self, env)
         self.k = k
         self.frames = deque([], maxlen=k)
         shp = env.observation_space.shape
-        assert shp[2] == 1  # can only stack 1-channel frames
-        self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], k))
+        self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k))

     def _reset(self):
-        """Clear buffer and re-fill by duplicating the first observation."""
         ob = self.env.reset()
-        for _ in range(self.k): self.frames.append(ob)
-        return self._observation()
+        for _ in range(self.k):
+            self.frames.append(ob)
+        return self._get_ob()

     def _step(self, action):
         ob, reward, done, info = self.env.step(action)
         self.frames.append(ob)
-        return self._observation(), reward, done, info
+        return self._get_ob(), reward, done, info

-    def _observation(self):
+    def _get_ob(self):
         assert len(self.frames) == self.k
-        return np.concatenate(self.frames, axis=2)
+        return LazyFrames(list(self.frames))

-def wrap_deepmind(env, episode_life=True, clip_rewards=True):
-    """Configure environment for DeepMind-style Atari.
-
-    Note: this does not include frame stacking!"""
-    assert 'NoFrameskip' in env.spec.id  # required for DeepMind-style skip
-    if episode_life:
-        env = EpisodicLifeEnv(env)
+class ScaledFloatFrame(gym.ObservationWrapper):
+    def _observation(self, observation):
+        # careful! This undoes the memory optimization, use
+        # with smaller replay buffers only.
+        return np.array(observation).astype(np.float32) / 255.0
+
+class LazyFrames(object):
+    def __init__(self, frames):
+        """This object ensures that common frames between the observations are only stored once.
+        It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay
+        buffers.
+
+        This object should only be converted to numpy array before being passed to the model.
+
+        You'd not belive how complex the previous solution was."""
+        self._frames = frames
+
+    def __array__(self, dtype=None):
+        out = np.concatenate(self._frames, axis=2)
+        if dtype is not None:
+            out = out.astype(dtype)
+        return out
+
+def make_atari(env_id):
+    env = gym.make(env_id)
+    assert 'NoFrameskip' in env.spec.id
     env = NoopResetEnv(env, noop_max=30)
     env = MaxAndSkipEnv(env, skip=4)
+    return env
+
+def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False):
+    """Configure environment for DeepMind-style Atari.
+    """
+    if episode_life:
+        env = EpisodicLifeEnv(env)
     if 'FIRE' in env.unwrapped.get_action_meanings():
         env = FireResetEnv(env)
     env = WarpFrame(env)
+    if scale:
+        env = ScaledFloatFrame(env)
     if clip_rewards:
         env = ClipRewardEnv(env)
+    if frame_stack:
+        env = FrameStack(env, 4)
     return env
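
The commit message credits OpenCV for the speedup: WarpFrame now grayscales with cv2.cvtColor and resizes with INTER_AREA instead of the NumPy dot product plus PIL bilinear resize it replaces. A rough, hedged micro-benchmark of the two paths on a synthetic 210x160 RGB frame (illustrative only, not from the repo):

    import timeit
    import numpy as np
    import cv2
    from PIL import Image

    frame = np.random.randint(0, 256, (210, 160, 3), dtype=np.uint8)

    def warp_pil(frame, res=84):
        # old WarpFrame: NumPy grayscale + PIL bilinear resize
        gray = np.dot(frame.astype('float32'), np.array([0.299, 0.587, 0.114], 'float32'))
        return np.array(Image.fromarray(gray).resize((res, res), resample=Image.BILINEAR), dtype=np.uint8)

    def warp_cv2(frame, width=84, height=84):
        # new WarpFrame: OpenCV grayscale + area-interpolated resize
        gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        return cv2.resize(gray, (width, height), interpolation=cv2.INTER_AREA)[:, :, None]

    print('PIL path:', timeit.timeit(lambda: warp_pil(frame), number=1000))
    print('cv2 path:', timeit.timeit(lambda: warp_cv2(frame), number=1000))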

@@ -1,239 +0,0 @@
-import cv2
-import gym
-import numpy as np
-from collections import deque
-from gym import spaces
-
-class NoopResetEnv(gym.Wrapper):
-    def __init__(self, env=None, noop_max=30):
-        """Sample initial states by taking random number of no-ops on reset.
-        No-op is assumed to be action 0.
-        """
-        super(NoopResetEnv, self).__init__(env)
-        self.noop_max = noop_max
-        self.override_num_noops = None
-        assert env.unwrapped.get_action_meanings()[0] == 'NOOP'
-
-    def _reset(self):
-        """ Do no-op action for a number of steps in [1, noop_max]."""
-        self.env.reset()
-        if self.override_num_noops is not None:
-            noops = self.override_num_noops
-        else:
-            noops = np.random.randint(1, self.noop_max + 1)
-        assert noops > 0
-        obs = None
-        for _ in range(noops):
-            obs, _, done, _ = self.env.step(0)
-            if done:
-                obs = self.env.reset()
-        return obs
-
-class FireResetEnv(gym.Wrapper):
-    def __init__(self, env=None):
-        """For environments where the user need to press FIRE for the game to start."""
-        super(FireResetEnv, self).__init__(env)
-        assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
-        assert len(env.unwrapped.get_action_meanings()) >= 3
-
-    def _reset(self):
-        self.env.reset()
-        obs, _, done, _ = self.env.step(1)
-        if done:
-            self.env.reset()
-        obs, _, done, _ = self.env.step(2)
-        if done:
-            self.env.reset()
-        return obs
-
-class EpisodicLifeEnv(gym.Wrapper):
-    def __init__(self, env=None):
-        """Make end-of-life == end-of-episode, but only reset on true game over.
-        Done by DeepMind for the DQN and co. since it helps value estimation.
-        """
-        super(EpisodicLifeEnv, self).__init__(env)
-        self.lives = 0
-        self.was_real_done = True
-        self.was_real_reset = False
-
-    def _step(self, action):
-        obs, reward, done, info = self.env.step(action)
-        self.was_real_done = done
-        # check current lives, make loss of life terminal,
-        # then update lives to handle bonus lives
-        lives = self.env.unwrapped.ale.lives()
-        if lives < self.lives and lives > 0:
-            # for Qbert somtimes we stay in lives == 0 condtion for a few frames
-            # so its important to keep lives > 0, so that we only reset once
-            # the environment advertises done.
-            done = True
-        self.lives = lives
-        return obs, reward, done, info
-
-    def _reset(self):
-        """Reset only when lives are exhausted.
-        This way all states are still reachable even though lives are episodic,
-        and the learner need not know about any of this behind-the-scenes.
-        """
-        if self.was_real_done:
-            obs = self.env.reset()
-            self.was_real_reset = True
-        else:
-            # no-op step to advance from terminal/lost life state
-            obs, _, _, _ = self.env.step(0)
-            self.was_real_reset = False
-        self.lives = self.env.unwrapped.ale.lives()
-        return obs
-
-class MaxAndSkipEnv(gym.Wrapper):
-    def __init__(self, env=None, skip=4):
-        """Return only every `skip`-th frame"""
-        super(MaxAndSkipEnv, self).__init__(env)
-        # most recent raw observations (for max pooling across time steps)
-        self._obs_buffer = deque(maxlen=2)
-        self._skip = skip
-
-    def _step(self, action):
-        total_reward = 0.0
-        done = None
-        for _ in range(self._skip):
-            obs, reward, done, info = self.env.step(action)
-            self._obs_buffer.append(obs)
-            total_reward += reward
-            if done:
-                break
-        max_frame = np.max(np.stack(self._obs_buffer), axis=0)
-        return max_frame, total_reward, done, info
-
-    def _reset(self):
-        """Clear past frame buffer and init. to first obs. from inner env."""
-        self._obs_buffer.clear()
-        obs = self.env.reset()
-        self._obs_buffer.append(obs)
-        return obs
-
-class ProcessFrame84(gym.ObservationWrapper):
-    def __init__(self, env=None):
-        super(ProcessFrame84, self).__init__(env)
-        self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1))
-
-    def _observation(self, obs):
-        return ProcessFrame84.process(obs)
-
-    @staticmethod
-    def process(frame):
-        if frame.size == 210 * 160 * 3:
-            img = np.reshape(frame, [210, 160, 3]).astype(np.float32)
-        elif frame.size == 250 * 160 * 3:
-            img = np.reshape(frame, [250, 160, 3]).astype(np.float32)
-        else:
-            assert False, "Unknown resolution."
-        img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114
-        resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_AREA)
-        x_t = resized_screen[18:102, :]
-        x_t = np.reshape(x_t, [84, 84, 1])
-        return x_t.astype(np.uint8)
-
-class ClippedRewardsWrapper(gym.RewardWrapper):
-    def _reward(self, reward):
-        """Change all the positive rewards to 1, negative to -1 and keep zero."""
-        return np.sign(reward)
-
-class LazyFrames(object):
-    def __init__(self, frames):
-        """This object ensures that common frames between the observations are only stored once.
-        It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay
-        buffers.
-
-        This object should only be converted to numpy array before being passed to the model.
-
-        You'd not belive how complex the previous solution was."""
-        self._frames = frames
-
-    def __array__(self, dtype=None):
-        out = np.concatenate(self._frames, axis=2)
-        if dtype is not None:
-            out = out.astype(dtype)
-        return out
-
-class FrameStack(gym.Wrapper):
-    def __init__(self, env, k):
-        """Stack k last frames.
-
-        Returns lazy array, which is much more memory efficient.
-
-        See Also
-        --------
-        baselines.common.atari_wrappers.LazyFrames
-        """
-        gym.Wrapper.__init__(self, env)
-        self.k = k
-        self.frames = deque([], maxlen=k)
-        shp = env.observation_space.shape
-        self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k))
-
-    def _reset(self):
-        ob = self.env.reset()
-        for _ in range(self.k):
-            self.frames.append(ob)
-        return self._get_ob()
-
-    def _step(self, action):
-        ob, reward, done, info = self.env.step(action)
-        self.frames.append(ob)
-        return self._get_ob(), reward, done, info
-
-    def _get_ob(self):
-        assert len(self.frames) == self.k
-        return LazyFrames(list(self.frames))
-
-class ScaledFloatFrame(gym.ObservationWrapper):
-    def _observation(self, obs):
-        # careful! This undoes the memory optimization, use
-        # with smaller replay buffers only.
-        return np.array(obs).astype(np.float32) / 255.0
-
-def wrap_dqn(env):
-    """Apply a common set of wrappers for Atari games."""
-    assert 'NoFrameskip' in env.spec.id
-    env = EpisodicLifeEnv(env)
-    env = NoopResetEnv(env, noop_max=30)
-    env = MaxAndSkipEnv(env, skip=4)
-    if 'FIRE' in env.unwrapped.get_action_meanings():
-        env = FireResetEnv(env)
-    env = ProcessFrame84(env)
-    env = FrameStack(env, 4)
-    env = ClippedRewardsWrapper(env)
-    return env
-
-class A2cProcessFrame(gym.Wrapper):
-    def __init__(self, env):
-        gym.Wrapper.__init__(self, env)
-        self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1))
-
-    def _step(self, action):
-        ob, reward, done, info = self.env.step(action)
-        return A2cProcessFrame.process(ob), reward, done, info
-
-    def _reset(self):
-        return A2cProcessFrame.process(self.env.reset())
-
-    @staticmethod
-    def process(frame):
-        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
-        frame = cv2.resize(frame, (84, 84), interpolation=cv2.INTER_AREA)
-        return frame.reshape(84, 84, 1)
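
The deleted module's wrap_dqn pipeline is covered by the helpers added to baselines.common.atari_wrappers above; a hedged sketch of the rough equivalent under the new API:

    import numpy as np
    from baselines.common.atari_wrappers import make_atari, wrap_deepmind

    # Roughly what wrap_dqn(gym.make(env_id)) used to produce: noop/skip via
    # make_atari, then life-episodes, fire-reset, 84x84 warping, reward
    # clipping, and a lazy 4-frame stack.
    env = wrap_deepmind(make_atari('PongNoFrameskip-v4'), frame_stack=True)
    obs = env.reset()
    obs = np.asarray(obs)   # LazyFrames -> (84, 84, 4) uint8 array before feeding a model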

@@ -10,10 +10,7 @@
     from shutil import unpack_archive
 from threading import Event

-"""TODOS:
-    - use Azure snapshots instead of hacky backups
-"""
+# TODOS: use Azure snapshots instead of hacky backups

 def fixed_list_blobs(service, *args, **kwargs):
     """By defualt list_containers only returns a subset of results.
@@ -37,7 +34,7 @@ def make_archive(source_path, dest_path):
     prefix_path = os.path.dirname(source_path)
     with zipfile.ZipFile(dest_path, "w", compression=zipfile.ZIP_STORED) as zf:
         if os.path.isdir(source_path):
-            for dirname, subdirs, files in os.walk(source_path):
+            for dirname, _subdirs, files in os.walk(source_path):
                 zf.write(dirname, os.path.relpath(dirname, prefix_path))
                 for filename in files:
                     filepath = os.path.join(dirname, filename)

@@ -2,7 +2,6 @@ import tensorflow as tf
 import numpy as np
 import baselines.common.tf_util as U
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn

 class Pd(object):
     """

@@ -4,7 +4,6 @@ import os
 import pickle
 import random
 import tempfile
-import time
 import zipfile
@@ -153,76 +152,6 @@ class RunningAvg(object):
         """Get the current estimate"""
         return self._value

-class SimpleMonitor(gym.Wrapper):
-    def __init__(self, env):
-        """Adds two qunatities to info returned by every step:
-
-        num_steps: int
-            Number of steps takes so far
-        rewards: [float]
-            All the cumulative rewards for the episodes completed so far.
-        """
-        super().__init__(env)
-        # current episode state
-        self._current_reward = None
-        self._num_steps = None
-        # temporary monitor state that we do not save
-        self._time_offset = None
-        self._total_steps = None
-        # monitor state
-        self._episode_rewards = []
-        self._episode_lengths = []
-        self._episode_end_times = []
-
-    def _reset(self):
-        obs = self.env.reset()
-        # recompute temporary state if needed
-        if self._time_offset is None:
-            self._time_offset = time.time()
-            if len(self._episode_end_times) > 0:
-                self._time_offset -= self._episode_end_times[-1]
-        if self._total_steps is None:
-            self._total_steps = sum(self._episode_lengths)
-        # update monitor state
-        if self._current_reward is not None:
-            self._episode_rewards.append(self._current_reward)
-            self._episode_lengths.append(self._num_steps)
-            self._episode_end_times.append(time.time() - self._time_offset)
-        # reset episode state
-        self._current_reward = 0
-        self._num_steps = 0
-        return obs
-
-    def _step(self, action):
-        obs, rew, done, info = self.env.step(action)
-        self._current_reward += rew
-        self._num_steps += 1
-        self._total_steps += 1
-        info['steps'] = self._total_steps
-        info['rewards'] = self._episode_rewards
-        return (obs, rew, done, info)
-
-    def get_state(self):
-        return {
-            'env_id': self.env.unwrapped.spec.id,
-            'episode_data': {
-                'episode_rewards': self._episode_rewards,
-                'episode_lengths': self._episode_lengths,
-                'episode_end_times': self._episode_end_times,
-                'initial_reset_time': 0,
-            }
-        }
-
-    def set_state(self, state):
-        assert state['env_id'] == self.env.unwrapped.spec.id
-        ed = state['episode_data']
-        self._episode_rewards = ed['episode_rewards']
-        self._episode_lengths = ed['episode_lengths']
-        self._episode_end_times = ed['episode_end_times']
-
 def boolean_flag(parser, name, default=False, help=None):
     """Add a boolean flag to argparse parser.

View File

@@ -6,51 +6,41 @@ import copy
import os import os
import collections import collections
# ================================================================ # ================================================================
# Make consistent with numpy # Make consistent with numpy
# ================================================================ # ================================================================
clip = tf.clip_by_value clip = tf.clip_by_value
def sum(x, axis=None, keepdims=False): def sum(x, axis=None, keepdims=False):
axis = None if axis is None else [axis] axis = None if axis is None else [axis]
return tf.reduce_sum(x, axis=axis, keep_dims=keepdims) return tf.reduce_sum(x, axis=axis, keep_dims=keepdims)
def mean(x, axis=None, keepdims=False): def mean(x, axis=None, keepdims=False):
axis = None if axis is None else [axis] axis = None if axis is None else [axis]
return tf.reduce_mean(x, axis=axis, keep_dims=keepdims) return tf.reduce_mean(x, axis=axis, keep_dims=keepdims)
def var(x, axis=None, keepdims=False): def var(x, axis=None, keepdims=False):
meanx = mean(x, axis=axis, keepdims=keepdims) meanx = mean(x, axis=axis, keepdims=keepdims)
return mean(tf.square(x - meanx), axis=axis, keepdims=keepdims) return mean(tf.square(x - meanx), axis=axis, keepdims=keepdims)
def std(x, axis=None, keepdims=False): def std(x, axis=None, keepdims=False):
return tf.sqrt(var(x, axis=axis, keepdims=keepdims)) return tf.sqrt(var(x, axis=axis, keepdims=keepdims))
def max(x, axis=None, keepdims=False): def max(x, axis=None, keepdims=False):
axis = None if axis is None else [axis] axis = None if axis is None else [axis]
return tf.reduce_max(x, axis=axis, keep_dims=keepdims) return tf.reduce_max(x, axis=axis, keep_dims=keepdims)
def min(x, axis=None, keepdims=False): def min(x, axis=None, keepdims=False):
axis = None if axis is None else [axis] axis = None if axis is None else [axis]
return tf.reduce_min(x, axis=axis, keep_dims=keepdims) return tf.reduce_min(x, axis=axis, keep_dims=keepdims)
def concatenate(arrs, axis=0): def concatenate(arrs, axis=0):
return tf.concat(axis=axis, values=arrs) return tf.concat(axis=axis, values=arrs)
def argmax(x, axis=None): def argmax(x, axis=None):
return tf.argmax(x, axis=axis) return tf.argmax(x, axis=axis)
def switch(condition, then_expression, else_expression): def switch(condition, then_expression, else_expression):
"""Switches between two operations depending on a scalar value (int or bool). """Switches between two operations depending on a scalar value (int or bool).
Note that both `then_expression` and `else_expression` Note that both `then_expression` and `else_expression`
@@ -72,35 +62,29 @@ def switch(condition, then_expression, else_expression):
# Extras # Extras
# ================================================================ # ================================================================
def l2loss(params): def l2loss(params):
if len(params) == 0: if len(params) == 0:
return tf.constant(0.0) return tf.constant(0.0)
else: else:
return tf.add_n([sum(tf.square(p)) for p in params]) return tf.add_n([sum(tf.square(p)) for p in params])
def lrelu(x, leak=0.2): def lrelu(x, leak=0.2):
f1 = 0.5 * (1 + leak) f1 = 0.5 * (1 + leak)
f2 = 0.5 * (1 - leak) f2 = 0.5 * (1 - leak)
return f1 * x + f2 * abs(x) return f1 * x + f2 * abs(x)
def categorical_sample_logits(X): def categorical_sample_logits(X):
# https://github.com/tensorflow/tensorflow/issues/456 # https://github.com/tensorflow/tensorflow/issues/456
U = tf.random_uniform(tf.shape(X)) U = tf.random_uniform(tf.shape(X))
return argmax(X - tf.log(-tf.log(U)), axis=1) return argmax(X - tf.log(-tf.log(U)), axis=1)
# ================================================================ # ================================================================
# Inputs # Inputs
# ================================================================ # ================================================================
def is_placeholder(x): def is_placeholder(x):
return type(x) is tf.Tensor and len(x.op.inputs) == 0 return type(x) is tf.Tensor and len(x.op.inputs) == 0
class TfInput(object): class TfInput(object):
def __init__(self, name="(unnamed)"): def __init__(self, name="(unnamed)"):
"""Generalized Tensorflow placeholder. The main differences are: """Generalized Tensorflow placeholder. The main differences are:
@@ -119,7 +103,6 @@ class TfInput(object):
"""Given data input it to the placeholder(s).""" """Given data input it to the placeholder(s)."""
raise NotImplemented() raise NotImplemented()
class PlacholderTfInput(TfInput): class PlacholderTfInput(TfInput):
def __init__(self, placeholder): def __init__(self, placeholder):
"""Wrapper for regular tensorflow placeholder.""" """Wrapper for regular tensorflow placeholder."""
@@ -132,7 +115,6 @@ class PlacholderTfInput(TfInput):
def make_feed_dict(self, data): def make_feed_dict(self, data):
return {self._placeholder: data} return {self._placeholder: data}
class BatchInput(PlacholderTfInput): class BatchInput(PlacholderTfInput):
def __init__(self, shape, dtype=tf.float32, name=None): def __init__(self, shape, dtype=tf.float32, name=None):
"""Creates a placeholder for a batch of tensors of a given shape and dtype """Creates a placeholder for a batch of tensors of a given shape and dtype
@@ -148,7 +130,6 @@ class BatchInput(PlacholderTfInput):
""" """
super().__init__(tf.placeholder(dtype, [None] + list(shape), name=name)) super().__init__(tf.placeholder(dtype, [None] + list(shape), name=name))
class Uint8Input(PlacholderTfInput): class Uint8Input(PlacholderTfInput):
def __init__(self, shape, name=None): def __init__(self, shape, name=None):
"""Takes input in uint8 format which is cast to float32 and divided by 255 """Takes input in uint8 format which is cast to float32 and divided by 255
@@ -171,7 +152,6 @@ class Uint8Input(PlacholderTfInput):
def get(self): def get(self):
return self._output return self._output
def ensure_tf_input(thing): def ensure_tf_input(thing):
"""Takes either tf.placeholder of TfInput and outputs equivalent TfInput""" """Takes either tf.placeholder of TfInput and outputs equivalent TfInput"""
if isinstance(thing, TfInput): if isinstance(thing, TfInput):
@@ -185,7 +165,6 @@ def ensure_tf_input(thing):
# Mathematical utils # Mathematical utils
# ================================================================ # ================================================================
def huber_loss(x, delta=1.0): def huber_loss(x, delta=1.0):
"""Reference: https://en.wikipedia.org/wiki/Huber_loss""" """Reference: https://en.wikipedia.org/wiki/Huber_loss"""
return tf.where( return tf.where(
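The piecewise form behind huber_loss, written out as a hedged numpy sketch (quadratic inside the delta band, linear outside, matching the two tf.where branches):

import numpy as np

def huber_loss_np(x, delta=1.0):
    # 0.5*x^2 where |x| <= delta, otherwise delta*(|x| - 0.5*delta)
    return np.where(np.abs(x) <= delta,
                    0.5 * np.square(x),
                    delta * (np.abs(x) - 0.5 * delta))

print(huber_loss_np(np.array([-2.0, -0.5, 0.0, 0.5, 2.0])))  # [1.5 0.125 0. 0.125 1.5]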
@@ -198,7 +177,6 @@ def huber_loss(x, delta=1.0):
# Optimizer utils # Optimizer utils
# ================================================================ # ================================================================
def minimize_and_clip(optimizer, objective, var_list, clip_val=10): def minimize_and_clip(optimizer, objective, var_list, clip_val=10):
"""Minimized `objective` using `optimizer` w.r.t. variables in """Minimized `objective` using `optimizer` w.r.t. variables in
`var_list` while ensure the norm of the gradients for each `var_list` while ensure the norm of the gradients for each
@@ -210,7 +188,6 @@ def minimize_and_clip(optimizer, objective, var_list, clip_val=10):
gradients[i] = (tf.clip_by_norm(grad, clip_val), var) gradients[i] = (tf.clip_by_norm(grad, clip_val), var)
return optimizer.apply_gradients(gradients) return optimizer.apply_gradients(gradients)
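A minimal sketch of the per-variable clipping pattern that minimize_and_clip wraps (TF 1.x optimizer API assumed; the variable, loss, and learning rate are purely illustrative):

import tensorflow as tf  # TF 1.x assumed

w = tf.Variable([10.0, -10.0])
loss = tf.reduce_sum(tf.square(w))
opt = tf.train.AdamOptimizer(1e-3)
grads_and_vars = opt.compute_gradients(loss, var_list=[w])
# clip each gradient's norm before applying, as minimize_and_clip does with clip_val
clipped = [(tf.clip_by_norm(g, 10.0), v) for g, v in grads_and_vars if g is not None]
train_op = opt.apply_gradients(clipped)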
# ================================================================ # ================================================================
# Global session # Global session
# ================================================================ # ================================================================
@@ -219,7 +196,6 @@ def get_session():
"""Returns recently made Tensorflow session""" """Returns recently made Tensorflow session"""
return tf.get_default_session() return tf.get_default_session()
def make_session(num_cpu): def make_session(num_cpu):
"""Returns a session that will use <num_cpu> CPU's only""" """Returns a session that will use <num_cpu> CPU's only"""
tf_config = tf.ConfigProto( tf_config = tf.ConfigProto(
@@ -227,31 +203,25 @@ def make_session(num_cpu):
intra_op_parallelism_threads=num_cpu) intra_op_parallelism_threads=num_cpu)
return tf.Session(config=tf_config) return tf.Session(config=tf_config)
def single_threaded_session(): def single_threaded_session():
"""Returns a session which will only use a single CPU""" """Returns a session which will only use a single CPU"""
return make_session(1) return make_session(1)
ALREADY_INITIALIZED = set() ALREADY_INITIALIZED = set()
def initialize(): def initialize():
"""Initialize all the uninitialized variables in the global scope.""" """Initialize all the uninitialized variables in the global scope."""
new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED
get_session().run(tf.variables_initializer(new_variables)) get_session().run(tf.variables_initializer(new_variables))
ALREADY_INITIALIZED.update(new_variables) ALREADY_INITIALIZED.update(new_variables)
def eval(expr, feed_dict=None): def eval(expr, feed_dict=None):
if feed_dict is None: if feed_dict is None:
feed_dict = {} feed_dict = {}
return get_session().run(expr, feed_dict=feed_dict) return get_session().run(expr, feed_dict=feed_dict)
VALUE_SETTERS = collections.OrderedDict() VALUE_SETTERS = collections.OrderedDict()
def set_value(v, val): def set_value(v, val):
global VALUE_SETTERS global VALUE_SETTERS
if v in VALUE_SETTERS: if v in VALUE_SETTERS:
@@ -262,17 +232,14 @@ def set_value(v, val):
VALUE_SETTERS[v] = (set_op, set_endpoint) VALUE_SETTERS[v] = (set_op, set_endpoint)
get_session().run(set_op, feed_dict={set_endpoint: val}) get_session().run(set_op, feed_dict={set_endpoint: val})
# ================================================================ # ================================================================
# Saving variables # Saving variables
# ================================================================ # ================================================================
def load_state(fname): def load_state(fname):
saver = tf.train.Saver() saver = tf.train.Saver()
saver.restore(get_session(), fname) saver.restore(get_session(), fname)
def save_state(fname): def save_state(fname):
os.makedirs(os.path.dirname(fname), exist_ok=True) os.makedirs(os.path.dirname(fname), exist_ok=True)
saver = tf.train.Saver() saver = tf.train.Saver()
@@ -282,7 +249,6 @@ def save_state(fname):
# Model components # Model components
# ================================================================ # ================================================================
def normc_initializer(std=1.0): def normc_initializer(std=1.0):
def _initializer(shape, dtype=None, partition_info=None): # pylint: disable=W0613 def _initializer(shape, dtype=None, partition_info=None): # pylint: disable=W0613
out = np.random.randn(*shape).astype(np.float32) out = np.random.randn(*shape).astype(np.float32)
@@ -290,7 +256,6 @@ def normc_initializer(std=1.0):
return tf.constant(out) return tf.constant(out)
return _initializer return _initializer
def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME", dtype=tf.float32, collections=None, def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME", dtype=tf.float32, collections=None,
summary_tag=None): summary_tag=None):
with tf.variable_scope(name): with tf.variable_scope(name):
@@ -320,7 +285,6 @@ def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME",
return tf.nn.conv2d(x, w, stride_shape, pad) + b return tf.nn.conv2d(x, w, stride_shape, pad) + b
def dense(x, size, name, weight_init=None, bias=True): def dense(x, size, name, weight_init=None, bias=True):
w = tf.get_variable(name + "/w", [x.get_shape()[1], size], initializer=weight_init) w = tf.get_variable(name + "/w", [x.get_shape()[1], size], initializer=weight_init)
ret = tf.matmul(x, w) ret = tf.matmul(x, w)
@@ -330,7 +294,6 @@ def dense(x, size, name, weight_init=None, bias=True):
else: else:
return ret return ret
def wndense(x, size, name, init_scale=1.0): def wndense(x, size, name, init_scale=1.0):
v = tf.get_variable(name + "/V", [int(x.get_shape()[1]), size], v = tf.get_variable(name + "/V", [int(x.get_shape()[1]), size],
initializer=tf.random_normal_initializer(0, 0.05)) initializer=tf.random_normal_initializer(0, 0.05))
@@ -342,11 +305,9 @@ def wndense(x, size, name, init_scale=1.0):
scaler = g / tf.sqrt(sum(tf.square(v), axis=0, keepdims=True)) scaler = g / tf.sqrt(sum(tf.square(v), axis=0, keepdims=True))
return tf.reshape(scaler, [1, size]) * x + tf.reshape(b, [1, size]) return tf.reshape(scaler, [1, size]) * x + tf.reshape(b, [1, size])
def densenobias(x, size, name, weight_init=None): def densenobias(x, size, name, weight_init=None):
return dense(x, size, name, weight_init=weight_init, bias=False) return dense(x, size, name, weight_init=weight_init, bias=False)
def dropout(x, pkeep, phase=None, mask=None): def dropout(x, pkeep, phase=None, mask=None):
mask = tf.floor(pkeep + tf.random_uniform(tf.shape(x))) if mask is None else mask mask = tf.floor(pkeep + tf.random_uniform(tf.shape(x))) if mask is None else mask
if phase is None: if phase is None:
@@ -354,13 +315,10 @@ def dropout(x, pkeep, phase=None, mask=None):
else: else:
return switch(phase, mask * x, pkeep * x) return switch(phase, mask * x, pkeep * x)
# ================================================================ # ================================================================
# Theano-like Function # Theano-like Function
# ================================================================ # ================================================================
def function(inputs, outputs, updates=None, givens=None): def function(inputs, outputs, updates=None, givens=None):
"""Just like Theano function. Take a bunch of tensorflow placeholders and expressions """Just like Theano function. Take a bunch of tensorflow placeholders and expressions
computed based on those placeholders and produces f(inputs) -> outputs. Function f takes computed based on those placeholders and produces f(inputs) -> outputs. Function f takes
@@ -401,7 +359,6 @@ def function(inputs, outputs, updates=None, givens=None):
f = _Function(inputs, [outputs], updates, givens=givens) f = _Function(inputs, [outputs], updates, givens=givens)
return lambda *args, **kwargs: f(*args, **kwargs)[0] return lambda *args, **kwargs: f(*args, **kwargs)[0]
class _Function(object): class _Function(object):
def __init__(self, inputs, outputs, updates, givens, check_nan=False): def __init__(self, inputs, outputs, updates, givens, check_nan=False):
for inpt in inputs: for inpt in inputs:
@@ -448,7 +405,6 @@ class _Function(object):
raise RuntimeError("Nan detected") raise RuntimeError("Nan detected")
return results return results
def mem_friendly_function(nondata_inputs, data_inputs, outputs, batch_size): def mem_friendly_function(nondata_inputs, data_inputs, outputs, batch_size):
if isinstance(outputs, list): if isinstance(outputs, list):
return _MemFriendlyFunction(nondata_inputs, data_inputs, outputs, batch_size) return _MemFriendlyFunction(nondata_inputs, data_inputs, outputs, batch_size)
@@ -456,7 +412,6 @@ def mem_friendly_function(nondata_inputs, data_inputs, outputs, batch_size):
f = _MemFriendlyFunction(nondata_inputs, data_inputs, [outputs], batch_size) f = _MemFriendlyFunction(nondata_inputs, data_inputs, [outputs], batch_size)
return lambda *inputs: f(*inputs)[0] return lambda *inputs: f(*inputs)[0]
class _MemFriendlyFunction(object): class _MemFriendlyFunction(object):
def __init__(self, nondata_inputs, data_inputs, outputs, batch_size): def __init__(self, nondata_inputs, data_inputs, outputs, batch_size):
self.nondata_inputs = nondata_inputs self.nondata_inputs = nondata_inputs
@@ -490,7 +445,6 @@ class _MemFriendlyFunction(object):
# Modules # Modules
# ================================================================ # ================================================================
class Module(object): class Module(object):
def __init__(self, name): def __init__(self, name):
self.name = name self.name = name
@@ -528,7 +482,6 @@ class Module(object):
assert self.scope is not None, "need to call module once before getting variables" assert self.scope is not None, "need to call module once before getting variables"
return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)
def module(name): def module(name):
@functools.wraps @functools.wraps
def wrapper(f): def wrapper(f):
@@ -542,14 +495,11 @@ def module(name):
# Graph traversal # Graph traversal
# ================================================================ # ================================================================
VARIABLES = {} VARIABLES = {}
def get_parents(node): def get_parents(node):
return node.op.inputs return node.op.inputs
def topsorted(outputs): def topsorted(outputs):
""" """
Topological sort via non-recursive depth-first search Topological sort via non-recursive depth-first search
@@ -586,7 +536,6 @@ def topsorted(outputs):
stack.append((j, 0)) stack.append((j, 0))
return out return out
# ================================================================ # ================================================================
# Flat vectors # Flat vectors
# ================================================================ # ================================================================
@@ -597,15 +546,12 @@ def var_shape(x):
"shape function assumes that shape is fully known" "shape function assumes that shape is fully known"
return out return out
def numel(x): def numel(x):
return intprod(var_shape(x)) return intprod(var_shape(x))
def intprod(x): def intprod(x):
return int(np.prod(x)) return int(np.prod(x))
def flatgrad(loss, var_list, clip_norm=None): def flatgrad(loss, var_list, clip_norm=None):
grads = tf.gradients(loss, var_list) grads = tf.gradients(loss, var_list)
if clip_norm is not None: if clip_norm is not None:
@@ -615,7 +561,6 @@ def flatgrad(loss, var_list, clip_norm=None):
for (v, grad) in zip(var_list, grads) for (v, grad) in zip(var_list, grads)
]) ])
class SetFromFlat(object): class SetFromFlat(object):
def __init__(self, var_list, dtype=tf.float32): def __init__(self, var_list, dtype=tf.float32):
assigns = [] assigns = []
@@ -634,7 +579,6 @@ class SetFromFlat(object):
def __call__(self, theta): def __call__(self, theta):
get_session().run(self.op, feed_dict={self.theta: theta}) get_session().run(self.op, feed_dict={self.theta: theta})
class GetFlat(object): class GetFlat(object):
def __init__(self, var_list): def __init__(self, var_list):
self.op = tf.concat(axis=0, values=[tf.reshape(v, [numel(v)]) for v in var_list]) self.op = tf.concat(axis=0, values=[tf.reshape(v, [numel(v)]) for v in var_list])
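GetFlat and SetFromFlat are the usual flatten/unflatten round trip over a variable list; a hedged numpy sketch of the same bookkeeping:

import numpy as np

shapes = [(2, 3), (4,)]
params = [np.ones(s) for s in shapes]
theta = np.concatenate([p.ravel() for p in params])    # what GetFlat's concat computes

restored, start = [], 0
for s in shapes:                                        # what SetFromFlat's assigns undo
    size = int(np.prod(s))
    restored.append(theta[start:start + size].reshape(s))
    start += size
assert all((a == b).all() for a, b in zip(params, restored))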
@@ -646,7 +590,6 @@ class GetFlat(object):
# Misc # Misc
# ================================================================ # ================================================================
def fancy_slice_2d(X, inds0, inds1): def fancy_slice_2d(X, inds0, inds1):
""" """
like numpy X[inds0, inds1] like numpy X[inds0, inds1]
@@ -659,12 +602,10 @@ def fancy_slice_2d(X, inds0, inds1):
Xflat = tf.reshape(X, [-1]) Xflat = tf.reshape(X, [-1])
return tf.gather(Xflat, inds0 * ncols + inds1) return tf.gather(Xflat, inds0 * ncols + inds1)
# ================================================================ # ================================================================
# Scopes # Scopes
# ================================================================ # ================================================================
def scope_vars(scope, trainable_only=False): def scope_vars(scope, trainable_only=False):
""" """
Get variables inside a scope Get variables inside a scope
@@ -687,17 +628,14 @@ def scope_vars(scope, trainable_only=False):
scope=scope if isinstance(scope, str) else scope.name scope=scope if isinstance(scope, str) else scope.name
) )
def scope_name(): def scope_name():
"""Returns the name of current scope as a string, e.g. deepq/q_func""" """Returns the name of current scope as a string, e.g. deepq/q_func"""
return tf.get_variable_scope().name return tf.get_variable_scope().name
def absolute_scope_name(relative_scope_name): def absolute_scope_name(relative_scope_name):
"""Appends parent scope name to `relative_scope_name`""" """Appends parent scope name to `relative_scope_name`"""
return scope_name() + "/" + relative_scope_name return scope_name() + "/" + relative_scope_name
def lengths_to_mask(lengths_b, max_length): def lengths_to_mask(lengths_b, max_length):
""" """
Turns a vector of lengths into a boolean mask Turns a vector of lengths into a boolean mask
@@ -715,7 +653,6 @@ def lengths_to_mask(lengths_b, max_length):
mask_bt = tf.expand_dims(tf.range(max_length), 0) < tf.expand_dims(lengths_b, 1) mask_bt = tf.expand_dims(tf.range(max_length), 0) < tf.expand_dims(lengths_b, 1)
return mask_bt return mask_bt
def in_session(f): def in_session(f):
@functools.wraps(f) @functools.wraps(f)
def newfunc(*args, **kwargs): def newfunc(*args, **kwargs):
@@ -723,10 +660,8 @@ def in_session(f):
f(*args, **kwargs) f(*args, **kwargs)
return newfunc return newfunc
_PLACEHOLDER_CACHE = {} # name -> (placeholder, dtype, shape) _PLACEHOLDER_CACHE = {} # name -> (placeholder, dtype, shape)
def get_placeholder(name, dtype, shape): def get_placeholder(name, dtype, shape):
if name in _PLACEHOLDER_CACHE: if name in _PLACEHOLDER_CACHE:
out, dtype1, shape1 = _PLACEHOLDER_CACHE[name] out, dtype1, shape1 = _PLACEHOLDER_CACHE[name]
@@ -737,15 +672,12 @@ def get_placeholder(name, dtype, shape):
_PLACEHOLDER_CACHE[name] = (out, dtype, shape) _PLACEHOLDER_CACHE[name] = (out, dtype, shape)
return out return out
def get_placeholder_cached(name): def get_placeholder_cached(name):
return _PLACEHOLDER_CACHE[name][0] return _PLACEHOLDER_CACHE[name][0]
def flattenallbut0(x): def flattenallbut0(x):
return tf.reshape(x, [-1, intprod(x.get_shape().as_list()[1:])]) return tf.reshape(x, [-1, intprod(x.get_shape().as_list()[1:])])
def reset(): def reset():
global _PLACEHOLDER_CACHE global _PLACEHOLDER_CACHE
global VARIABLES global VARIABLES
View File
@@ -2,7 +2,9 @@ import numpy as np
from multiprocessing import Process, Pipe from multiprocessing import Process, Pipe
from baselines.common.vec_env import VecEnv from baselines.common.vec_env import VecEnv
def worker(remote, env_fn_wrapper):
def worker(remote, parent_remote, env_fn_wrapper):
parent_remote.close()
env = env_fn_wrapper.x() env = env_fn_wrapper.x()
while True: while True:
cmd, data = remote.recv() cmd, data = remote.recv()
@@ -14,6 +16,9 @@ def worker(remote, env_fn_wrapper):
elif cmd == 'reset': elif cmd == 'reset':
ob = env.reset() ob = env.reset()
remote.send(ob) remote.send(ob)
elif cmd == 'reset_task':
ob = env.reset_task()
remote.send(ob)
elif cmd == 'close': elif cmd == 'close':
remote.close() remote.close()
break break
@@ -22,6 +27,7 @@ def worker(remote, env_fn_wrapper):
else: else:
raise NotImplementedError raise NotImplementedError
class CloudpickleWrapper(object): class CloudpickleWrapper(object):
""" """
Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle) Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle)
@@ -35,17 +41,22 @@ class CloudpickleWrapper(object):
import pickle import pickle
self.x = pickle.loads(ob) self.x = pickle.loads(ob)
class SubprocVecEnv(VecEnv): class SubprocVecEnv(VecEnv):
def __init__(self, env_fns): def __init__(self, env_fns):
""" """
envs: list of gym environments to run in subprocesses envs: list of gym environments to run in subprocesses
""" """
self.closed = False
nenvs = len(env_fns) nenvs = len(env_fns)
self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)])
self.ps = [Process(target=worker, args=(work_remote, CloudpickleWrapper(env_fn))) self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn)))
for (work_remote, env_fn) in zip(self.work_remotes, env_fns)] for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)]
for p in self.ps: for p in self.ps:
p.daemon = True # if the main process crashes, we should not cause things to hang
p.start() p.start()
for remote in self.work_remotes:
remote.close()
self.remotes[0].send(('get_spaces', None)) self.remotes[0].send(('get_spaces', None))
self.action_space, self.observation_space = self.remotes[0].recv() self.action_space, self.observation_space = self.remotes[0].recv()
@@ -63,11 +74,20 @@ class SubprocVecEnv(VecEnv):
remote.send(('reset', None)) remote.send(('reset', None))
return np.stack([remote.recv() for remote in self.remotes]) return np.stack([remote.recv() for remote in self.remotes])
def reset_task(self):
for remote in self.remotes:
remote.send(('reset_task', None))
return np.stack([remote.recv() for remote in self.remotes])
def close(self): def close(self):
if self.closed:
return
for remote in self.remotes: for remote in self.remotes:
remote.send(('close', None)) remote.send(('close', None))
for p in self.ps: for p in self.ps:
p.join() p.join()
self.closed = True
@property @property
def num_envs(self): def num_envs(self):
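The worker and close changes follow standard multiprocessing pipe hygiene: each process closes the pipe end it does not own, and workers run as daemons so a crashed main process does not leave them hanging. A self-contained sketch of that pattern (not the SubprocVecEnv API itself):

from multiprocessing import Process, Pipe

def worker(remote, parent_remote):
    parent_remote.close()              # child drops its copy of the parent's end
    while True:
        cmd = remote.recv()
        if cmd == 'close':
            remote.close()
            break
        remote.send(cmd.upper())

if __name__ == '__main__':
    parent, child = Pipe()
    p = Process(target=worker, args=(child, parent))
    p.daemon = True                    # mirrors the p.daemon = True added above
    p.start()
    child.close()                      # parent drops its copy of the child's end
    parent.send('step')
    print(parent.recv())               # STEP
    parent.send('close')
    p.join()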
View File
@@ -19,11 +19,12 @@ from mpi4py import MPI
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs): def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
# Configure things. # Configure things.
rank = MPI.COMM_WORLD.Get_rank() rank = MPI.COMM_WORLD.Get_rank()
if rank != 0: logger.set_level(logger.DISABLED) if rank != 0:
logger.set_level(logger.DISABLED)
# Create envs. # Create envs.
env = gym.make(env_id) env = gym.make(env_id)
env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), "%i.monitor.json"%rank)) env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
gym.logger.setLevel(logging.WARN) gym.logger.setLevel(logging.WARN)
if evaluation and rank==0: if evaluation and rank==0:
@@ -81,7 +82,7 @@ def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
def parse_args(): def parse_args():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--env-id', type=str, default='HalfCheetah-v1') parser.add_argument('--env-id', type=str, default='HalfCheetah-v1')
boolean_flag(parser, 'render-eval', default=False) boolean_flag(parser, 'render-eval', default=False)
boolean_flag(parser, 'layer-norm', default=True) boolean_flag(parser, 'layer-norm', default=True)
@@ -103,11 +104,21 @@ def parse_args():
parser.add_argument('--nb-eval-steps', type=int, default=100) # per epoch cycle and MPI worker parser.add_argument('--nb-eval-steps', type=int, default=100) # per epoch cycle and MPI worker
parser.add_argument('--nb-rollout-steps', type=int, default=100) # per epoch cycle and MPI worker parser.add_argument('--nb-rollout-steps', type=int, default=100) # per epoch cycle and MPI worker
parser.add_argument('--noise-type', type=str, default='adaptive-param_0.2') # choices are adaptive-param_xx, ou_xx, normal_xx, none parser.add_argument('--noise-type', type=str, default='adaptive-param_0.2') # choices are adaptive-param_xx, ou_xx, normal_xx, none
parser.add_argument('--num-timesteps', type=int, default=None)
boolean_flag(parser, 'evaluation', default=False) boolean_flag(parser, 'evaluation', default=False)
return vars(parser.parse_args()) args = parser.parse_args()
# we don't directly specify timesteps for this script, so make sure that if we do specify them
# they agree with the other parameters
if args.num_timesteps is not None:
assert(args.num_timesteps == args.nb_epochs * args.nb_epoch_cycles * args.nb_rollout_steps)
dict_args = vars(args)
del dict_args['num_timesteps']
return dict_args
if __name__ == '__main__': if __name__ == '__main__':
args = parse_args() args = parse_args()
if MPI.COMM_WORLD.Get_rank() == 0:
logger.configure()
# Run actual script. # Run actual script.
run(**args) run(**args)
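A short usage sketch of the rank-gating convention this script now follows (configure the logger on rank 0 only, silence the others), assuming an MPI launch such as mpirun -np 4:

from mpi4py import MPI
from baselines import logger

if MPI.COMM_WORLD.Get_rank() == 0:
    logger.configure()                 # rank 0 owns the run directory
else:
    logger.set_level(logger.DISABLED)
logger.info('only rank 0 prints this')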
View File
@@ -1,6 +1,3 @@
import time
import gym
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
from mpi4py import MPI from mpi4py import MPI
View File
@@ -1,5 +1,8 @@
from baselines.deepq import models # noqa from baselines.deepq import models # noqa
from baselines.deepq.build_graph import build_act, build_train # noqa from baselines.deepq.build_graph import build_act, build_train # noqa
from baselines.deepq.simple import learn, load # noqa from baselines.deepq.simple import learn, load # noqa
from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer # noqa from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer # noqa
def wrap_atari_dqn(env):
from baselines.common.atari_wrappers import wrap_deepmind
return wrap_deepmind(env, frame_stack=True, scale=True)
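How the new helper is meant to be composed with make_atari (mirroring the run scripts further down in this commit):

from baselines import deepq
from baselines.common.atari_wrappers import make_atari

env = make_atari('PongNoFrameskip-v4')   # NoFrameskip base env with the repo's standard Atari wrappers
env = deepq.wrap_atari_dqn(env)          # deepmind preprocessing with frame_stack=True, scale=True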
View File
@@ -10,8 +10,8 @@ import baselines.common.tf_util as U
from baselines import deepq from baselines import deepq
from baselines.common.misc_util import ( from baselines.common.misc_util import (
boolean_flag, boolean_flag,
SimpleMonitor,
) )
from baselines import bench
from baselines.common.atari_wrappers_deprecated import wrap_dqn from baselines.common.atari_wrappers_deprecated import wrap_dqn
from baselines.deepq.experiments.atari.model import model, dueling_model from baselines.deepq.experiments.atari.model import model, dueling_model
@@ -30,7 +30,7 @@ def parse_args():
def make_env(game_name): def make_env(game_name):
env = gym.make(game_name + "NoFrameskip-v4") env = gym.make(game_name + "NoFrameskip-v4")
env = SimpleMonitor(env) env = bench.Monitor(env, None)
env = wrap_dqn(env) env = wrap_dqn(env)
return env return env
View File
@@ -19,11 +19,9 @@ from baselines.common.misc_util import (
relatively_safe_pickle_dump, relatively_safe_pickle_dump,
set_global_seeds, set_global_seeds,
RunningAvg, RunningAvg,
SimpleMonitor
) )
from baselines.common.schedules import LinearSchedule, PiecewiseSchedule from baselines.common.schedules import LinearSchedule, PiecewiseSchedule
# when updating this to non-deperecated ones, it is important to from baselines import bench
# copy over LazyFrames
from baselines.common.atari_wrappers_deprecated import wrap_dqn from baselines.common.atari_wrappers_deprecated import wrap_dqn
from baselines.common.azure_utils import Container from baselines.common.azure_utils import Container
from .model import model, dueling_model from .model import model, dueling_model
@@ -64,7 +62,7 @@ def parse_args():
def make_env(game_name): def make_env(game_name):
env = gym.make(game_name + "NoFrameskip-v4") env = gym.make(game_name + "NoFrameskip-v4")
monitored_env = SimpleMonitor(env) # puts rewards and number of steps in info, before environment is wrapped monitored_env = bench.Monitor(env, logger.get_dir()) # puts rewards and number of steps in info, before environment is wrapped
env = wrap_dqn(monitored_env) # applies a bunch of modification to simplify the observation space (downsample, make b/w) env = wrap_dqn(monitored_env) # applies a bunch of modification to simplify the observation space (downsample, make b/w)
return env, monitored_env return env, monitored_env
View File
@@ -5,15 +5,15 @@ import os
import baselines.common.tf_util as U import baselines.common.tf_util as U
from baselines import deepq from baselines import deepq, bench
from baselines.common.misc_util import get_wrapper_by_name, SimpleMonitor, boolean_flag, set_global_seeds from baselines.common.misc_util import get_wrapper_by_name, boolean_flag, set_global_seeds
from baselines.common.atari_wrappers_deprecated import wrap_dqn from baselines.common.atari_wrappers_deprecated import wrap_dqn
from baselines.deepq.experiments.atari.model import model, dueling_model from baselines.deepq.experiments.atari.model import model, dueling_model
def make_env(game_name): def make_env(game_name):
env = gym.make(game_name + "NoFrameskip-v4") env = gym.make(game_name + "NoFrameskip-v4")
env_monitored = SimpleMonitor(env) env_monitored = bench.Monitor(env, None)
env = wrap_dqn(env_monitored) env = wrap_dqn(env_monitored)
return env_monitored, env return env_monitored, env
@@ -47,14 +47,14 @@ def wang2015_eval(game_name, act, stochastic):
eval_episode_steps += 1 eval_episode_steps += 1
action = act(np.array(obs)[None], stochastic=stochastic)[0] action = act(np.array(obs)[None], stochastic=stochastic)[0]
obs, reward, done, info = eval_env.step(action) obs, _reward, done, info = eval_env.step(action)
if done: if done:
obs = eval_env.reset() obs = eval_env.reset()
if len(info["rewards"]) > 0: if len(info["rewards"]) > 0:
episode_rewards.append(info["rewards"][0]) episode_rewards.append(info["rewards"][0])
break break
if info["steps"] > 108000: # 5 minutes of gameplay if info["steps"] > 108000: # 5 minutes of gameplay
episode_rewards.append(env_monitored._current_reward) episode_rewards.append(sum(env_monitored.rewards))
break break
print("Num steps in episode {} was {} yielding {} reward".format( print("Num steps in episode {} was {} yielding {} reward".format(
num_noops, eval_episode_steps, episode_rewards[-1]), flush=True) num_noops, eval_episode_steps, episode_rewards[-1]), flush=True)
@@ -66,7 +66,7 @@ def wang2015_eval(game_name, act, stochastic):
def main(): def main():
set_global_seeds(1) set_global_seeds(1)
args = parse_args() args = parse_args()
with U.make_session(4) as sess: # noqa with U.make_session(4): # noqa
_, env = make_env(args.env) _, env = make_env(args.env)
act = deepq.build_act( act = deepq.build_act(
make_obs_ph=lambda name: U.Uint8Input(env.observation_space.shape, name=name), make_obs_ph=lambda name: U.Uint8Input(env.observation_space.shape, name=name),
View File
@@ -1,12 +1,10 @@
import gym import gym
from baselines import deepq from baselines import deepq
from baselines.common.atari_wrappers_deprecated import wrap_dqn, ScaledFloatFrame
def main(): def main():
env = gym.make("PongNoFrameskip-v4") env = gym.make("PongNoFrameskip-v4")
env = ScaledFloatFrame(wrap_dqn(env)) env = deepq.wrap_atari_dqn(env)
act = deepq.load("pong_model.pkl") act = deepq.load("pong_model.pkl")
while True: while True:
View File
@@ -0,0 +1,47 @@
import gym
from baselines import deepq
from baselines.common import set_global_seeds
from baselines import bench
import argparse
from baselines import logger
from baselines.common.atari_wrappers import make_atari
def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
parser.add_argument('--prioritized', type=int, default=1)
parser.add_argument('--dueling', type=int, default=1)
parser.add_argument('--num-timesteps', type=int, default=int(10e6))
args = parser.parse_args()
logger.configure()
set_global_seeds(args.seed)
env = make_atari(args.env)
env = bench.Monitor(env, logger.get_dir())
env = deepq.wrap_atari_dqn(env)
model = deepq.models.cnn_to_mlp(
convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
hiddens=[256],
dueling=bool(args.dueling),
)
act = deepq.learn(
env,
q_func=model,
lr=1e-4,
max_timesteps=args.num_timesteps,
buffer_size=10000,
exploration_fraction=0.1,
exploration_final_eps=0.01,
train_freq=4,
learning_starts=10000,
target_network_update_freq=1000,
gamma=0.99,
prioritized_replay=bool(args.prioritized)
)
# act.save("pong_model.pkl") XXX
env.close()
if __name__ == '__main__':
main()
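After training, the returned act callable (or one reloaded with deepq.load) can be rolled out the same way the enjoy scripts do. A hedged sketch; the pickle path is hypothetical, since the save call above is commented out:

import gym
import numpy as np
from baselines import deepq

env = gym.make('BreakoutNoFrameskip-v4')
env = deepq.wrap_atari_dqn(env)
act = deepq.load('breakout_model.pkl')          # hypothetical path written by act.save(...)
obs, done, episode_rew = env.reset(), False, 0.0
while not done:
    obs, rew, done, _ = env.step(act(np.array(obs)[None])[0])
    episode_rew += rew
print('Episode reward', episode_rew)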
View File
@@ -1,34 +0,0 @@
import gym
from baselines import deepq
from baselines.common.atari_wrappers_deprecated import wrap_dqn, ScaledFloatFrame
def main():
env = gym.make("PongNoFrameskip-v4")
env = ScaledFloatFrame(wrap_dqn(env))
model = deepq.models.cnn_to_mlp(
convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
hiddens=[256],
dueling=True
)
act = deepq.learn(
env,
q_func=model,
lr=1e-4,
max_timesteps=2000000,
buffer_size=10000,
exploration_fraction=0.1,
exploration_final_eps=0.01,
train_freq=4,
learning_starts=10000,
target_network_update_freq=1000,
gamma=0.99,
prioritized_replay=True
)
act.save("pong_model.pkl")
env.close()
if __name__ == '__main__':
main()
View File
@@ -1,12 +1,13 @@
import numpy as np
import os import os
import dill
import tempfile import tempfile
import tensorflow as tf import tensorflow as tf
import zipfile import zipfile
import cloudpickle
import numpy as np
import gym
import baselines.common.tf_util as U import baselines.common.tf_util as U
from baselines import logger from baselines import logger
from baselines.common.schedules import LinearSchedule from baselines.common.schedules import LinearSchedule
from baselines import deepq from baselines import deepq
@@ -19,11 +20,11 @@ class ActWrapper(object):
self._act_params = act_params self._act_params = act_params
@staticmethod @staticmethod
def load(path, num_cpu=16): def load(path):
with open(path, "rb") as f: with open(path, "rb") as f:
model_data, act_params = dill.load(f) model_data, act_params = cloudpickle.load(f)
act = deepq.build_act(**act_params) act = deepq.build_act(**act_params)
sess = U.make_session(num_cpu=num_cpu) sess = tf.Session()
sess.__enter__() sess.__enter__()
with tempfile.TemporaryDirectory() as td: with tempfile.TemporaryDirectory() as td:
arc_path = os.path.join(td, "packed.zip") arc_path = os.path.join(td, "packed.zip")
@@ -38,8 +39,11 @@ class ActWrapper(object):
def __call__(self, *args, **kwargs): def __call__(self, *args, **kwargs):
return self._act(*args, **kwargs) return self._act(*args, **kwargs)
def save(self, path): def save(self, path=None):
"""Save model to a pickle located at `path`""" """Save model to a pickle located at `path`"""
if path is None:
path = os.path.join(logger.get_dir(), "model.pkl")
with tempfile.TemporaryDirectory() as td: with tempfile.TemporaryDirectory() as td:
U.save_state(os.path.join(td, "model")) U.save_state(os.path.join(td, "model"))
arc_name = os.path.join(td, "packed.zip") arc_name = os.path.join(td, "packed.zip")
@@ -52,18 +56,16 @@ class ActWrapper(object):
with open(arc_name, "rb") as f: with open(arc_name, "rb") as f:
model_data = f.read() model_data = f.read()
with open(path, "wb") as f: with open(path, "wb") as f:
dill.dump((model_data, self._act_params), f) cloudpickle.dump((model_data, self._act_params), f)
def load(path, num_cpu=16): def load(path):
"""Load act function that was returned by learn function. """Load act function that was returned by learn function.
Parameters Parameters
---------- ----------
path: str path: str
path to the act function pickle path to the act function pickle
num_cpu: int
number of cpus to use for executing the policy
Returns Returns
------- -------
@@ -71,7 +73,7 @@ def load(path, num_cpu=16):
function that takes a batch of observations function that takes a batch of observations
and returns actions. and returns actions.
""" """
return ActWrapper.load(path, num_cpu=num_cpu) return ActWrapper.load(path)
def learn(env, def learn(env,
@@ -83,7 +85,7 @@ def learn(env,
exploration_final_eps=0.02, exploration_final_eps=0.02,
train_freq=1, train_freq=1,
batch_size=32, batch_size=32,
print_freq=1, print_freq=100,
checkpoint_freq=10000, checkpoint_freq=10000,
learning_starts=1000, learning_starts=1000,
gamma=1.0, gamma=1.0,
@@ -93,7 +95,6 @@ def learn(env,
prioritized_replay_beta0=0.4, prioritized_replay_beta0=0.4,
prioritized_replay_beta_iters=None, prioritized_replay_beta_iters=None,
prioritized_replay_eps=1e-6, prioritized_replay_eps=1e-6,
num_cpu=16,
param_noise=False, param_noise=False,
callback=None): callback=None):
"""Train a deepq model. """Train a deepq model.
@@ -151,8 +152,6 @@ def learn(env,
to 1.0. If set to None equals to max_timesteps. to 1.0. If set to None equals to max_timesteps.
prioritized_replay_eps: float prioritized_replay_eps: float
epsilon to add to the TD errors when updating priorities. epsilon to add to the TD errors when updating priorities.
num_cpu: int
number of cpus to use for training
callback: (locals, globals) -> None callback: (locals, globals) -> None
function called at every steps with state of the algorithm. function called at every steps with state of the algorithm.
If callback returns true training stops. If callback returns true training stops.
@@ -165,11 +164,14 @@ def learn(env,
""" """
# Create all the functions necessary to train the model # Create all the functions necessary to train the model
sess = U.make_session(num_cpu=num_cpu) sess = tf.Session()
sess.__enter__() sess.__enter__()
# capture the shape outside the closure so that the env object is not serialized
# by cloudpickle when serializing make_obs_ph
observation_space_shape = env.observation_space.shape
def make_obs_ph(name): def make_obs_ph(name):
return U.BatchInput(env.observation_space.shape, name=name) return U.BatchInput(observation_space_shape, name=name)
act, train, update_target, debug = deepq.build_train( act, train, update_target, debug = deepq.build_train(
make_obs_ph=make_obs_ph, make_obs_ph=make_obs_ph,
@@ -180,12 +182,15 @@ def learn(env,
grad_norm_clipping=10, grad_norm_clipping=10,
param_noise=param_noise param_noise=param_noise
) )
act_params = { act_params = {
'make_obs_ph': make_obs_ph, 'make_obs_ph': make_obs_ph,
'q_func': q_func, 'q_func': q_func,
'num_actions': env.action_space.n, 'num_actions': env.action_space.n,
} }
act = ActWrapper(act, act_params)
# Create the replay buffer # Create the replay buffer
if prioritized_replay: if prioritized_replay:
replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
@@ -233,8 +238,13 @@ def learn(env,
kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_threshold'] = update_param_noise_threshold
kwargs['update_param_noise_scale'] = True kwargs['update_param_noise_scale'] = True
action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
if isinstance(env.action_space, gym.spaces.MultiBinary):
env_action = np.zeros(env.action_space.n)
env_action[action] = 1
else:
env_action = action
reset = False reset = False
new_obs, rew, done, _ = env.step(action) new_obs, rew, done, _ = env.step(env_action)
# Store transition in the replay buffer. # Store transition in the replay buffer.
replay_buffer.add(obs, action, rew, new_obs, float(done)) replay_buffer.add(obs, action, rew, new_obs, float(done))
obs = new_obs obs = new_obs
@@ -285,4 +295,4 @@ def learn(env,
logger.log("Restored model with mean reward: {}".format(saved_mean_reward)) logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
U.load_state(model_file) U.load_state(model_file)
return ActWrapper(act, act_params) return act
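Why the observation shape is captured outside make_obs_ph: cloudpickle serializes whatever a function refers to, so a reference to the env would drag the whole environment into the pickled act_params. A small hedged demonstration of the effect with a stand-in object (run it as a standalone script so cloudpickle pickles the functions by value):

import cloudpickle
import numpy as np

class Heavy:                                    # stand-in for an env we do not want pickled
    def __init__(self):
        self.payload = np.zeros(10**5)

heavy = Heavy()
shape = (84, 84, 4)                             # capture just the plain value

def refers_to_env(name):
    return (name, heavy.payload.shape)          # drags `heavy` into the pickle

def refers_to_shape(name):
    return (name, shape)                        # pickles only the small tuple

print(len(cloudpickle.dumps(refers_to_env)) > 10 * len(cloudpickle.dumps(refers_to_shape)))  # True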
View File
@@ -6,8 +6,10 @@ import json
import time import time
import datetime import datetime
import tempfile import tempfile
from mpi4py import MPI
LOG_OUTPUT_FORMATS = ['stdout', 'log', 'json'] LOG_OUTPUT_FORMATS = ['stdout', 'log', 'csv']
# Also valid: json, tensorboard
DEBUG = 10 DEBUG = 10
INFO = 20 INFO = 20
@@ -16,26 +18,23 @@ ERROR = 40
DISABLED = 50 DISABLED = 50
class OutputFormat(object): class KVWriter(object):
def writekvs(self, kvs): def writekvs(self, kvs):
"""
Write key-value pairs
"""
raise NotImplementedError raise NotImplementedError
def writeseq(self, args): class SeqWriter(object):
""" def writeseq(self, seq):
Write a sequence of other data (e.g. a logging message) raise NotImplementedError
"""
pass
def close(self): class HumanOutputFormat(KVWriter, SeqWriter):
return def __init__(self, filename_or_file):
if isinstance(filename_or_file, str):
self.file = open(filename_or_file, 'at')
class HumanOutputFormat(OutputFormat): self.own_file = True
def __init__(self, file): else:
self.file = file assert hasattr(filename_or_file, 'read'), 'expected file or str, got %s'%filename_or_file
self.file = filename_or_file
self.own_file = False
def writekvs(self, kvs): def writekvs(self, kvs):
# Create strings for printing # Create strings for printing
@@ -48,8 +47,12 @@ class HumanOutputFormat(OutputFormat):
key2str[self._truncate(key)] = self._truncate(valstr) key2str[self._truncate(key)] = self._truncate(valstr)
# Find max widths # Find max widths
keywidth = max(map(len, key2str.keys())) if len(key2str) == 0:
valwidth = max(map(len, key2str.values())) print('WARNING: tried to write empty key-value dict')
return
else:
keywidth = max(map(len, key2str.keys()))
valwidth = max(map(len, key2str.values()))
# Write out the data # Write out the data
dashes = '-' * (keywidth + valwidth + 7) dashes = '-' * (keywidth + valwidth + 7)
@@ -70,15 +73,19 @@ class HumanOutputFormat(OutputFormat):
def _truncate(self, s): def _truncate(self, s):
return s[:20] + '...' if len(s) > 23 else s return s[:20] + '...' if len(s) > 23 else s
def writeseq(self, args): def writeseq(self, seq):
for arg in args: for arg in seq:
self.file.write(arg) self.file.write(arg)
self.file.write('\n') self.file.write('\n')
self.file.flush() self.file.flush()
class JSONOutputFormat(OutputFormat): def close(self):
def __init__(self, file): if self.own_file:
self.file = file self.file.close()
class JSONOutputFormat(KVWriter):
def __init__(self, filename):
self.file = open(filename, 'at')
def writekvs(self, kvs): def writekvs(self, kvs):
for k, v in sorted(kvs.items()): for k, v in sorted(kvs.items()):
@@ -88,7 +95,46 @@ class JSONOutputFormat(OutputFormat):
self.file.write(json.dumps(kvs) + '\n') self.file.write(json.dumps(kvs) + '\n')
self.file.flush() self.file.flush()
class TensorBoardOutputFormat(OutputFormat): def close(self):
self.file.close()
class CSVOutputFormat(KVWriter):
def __init__(self, filename):
self.file = open(filename, 'a+t')
self.keys = []
self.sep = ','
def writekvs(self, kvs):
# Add our current row to the history
extra_keys = kvs.keys() - self.keys
if extra_keys:
self.keys.extend(extra_keys)
self.file.seek(0)
lines = self.file.readlines()
self.file.seek(0)
for (i, k) in enumerate(self.keys):
if i > 0:
self.file.write(',')
self.file.write(k)
self.file.write('\n')
for line in lines[1:]:
self.file.write(line[:-1])
self.file.write(self.sep * len(extra_keys))
self.file.write('\n')
for (i, k) in enumerate(self.keys):
if i > 0:
self.file.write(',')
v = kvs.get(k)
if v:
self.file.write(str(v))
self.file.write('\n')
self.file.flush()
def close(self):
self.file.close()
class TensorBoardOutputFormat(KVWriter):
""" """
Dumps key/value pairs into TensorBoard's numeric format. Dumps key/value pairs into TensorBoard's numeric format.
""" """
@@ -99,7 +145,7 @@ class TensorBoardOutputFormat(OutputFormat):
prefix = 'events' prefix = 'events'
path = osp.join(osp.abspath(dir), prefix) path = osp.join(osp.abspath(dir), prefix)
import tensorflow as tf import tensorflow as tf
from tensorflow.python import pywrap_tensorflow from tensorflow.python import pywrap_tensorflow
from tensorflow.core.util import event_pb2 from tensorflow.core.util import event_pb2
from tensorflow.python.util import compat from tensorflow.python.util import compat
self.tf = tf self.tf = tf
@@ -123,18 +169,22 @@ class TensorBoardOutputFormat(OutputFormat):
self.writer.Close() self.writer.Close()
self.writer = None self.writer = None
def make_output_format(format, ev_dir): def make_output_format(format, ev_dir):
os.makedirs(ev_dir, exist_ok=True) os.makedirs(ev_dir, exist_ok=True)
rank = MPI.COMM_WORLD.Get_rank()
if format == 'stdout': if format == 'stdout':
return HumanOutputFormat(sys.stdout) return HumanOutputFormat(sys.stdout)
elif format == 'log': elif format == 'log':
log_file = open(osp.join(ev_dir, 'log.txt'), 'wt') suffix = "" if rank==0 else ("-mpi%03i"%rank)
return HumanOutputFormat(log_file) return HumanOutputFormat(osp.join(ev_dir, 'log%s.txt' % suffix))
elif format == 'json': elif format == 'json':
json_file = open(osp.join(ev_dir, 'progress.json'), 'wt') assert rank==0
return JSONOutputFormat(json_file) return JSONOutputFormat(osp.join(ev_dir, 'progress.json'))
elif format == 'csv':
assert rank==0
return CSVOutputFormat(osp.join(ev_dir, 'progress.csv'))
elif format == 'tensorboard': elif format == 'tensorboard':
assert rank==0
return TensorBoardOutputFormat(osp.join(ev_dir, 'tb')) return TensorBoardOutputFormat(osp.join(ev_dir, 'tb'))
else: else:
raise ValueError('Unknown format specified: %s' % (format,)) raise ValueError('Unknown format specified: %s' % (format,))
@@ -167,7 +217,7 @@ def dumpkvs():
Logger.CURRENT.dumpkvs() Logger.CURRENT.dumpkvs()
def getkvs(): def getkvs():
return Logger.CURRENT.name2val return Logger.CURRENT.name2val
def log(*args, level=INFO): def log(*args, level=INFO):
@@ -176,19 +226,15 @@ def log(*args, level=INFO):
""" """
Logger.CURRENT.log(*args, level=level) Logger.CURRENT.log(*args, level=level)
def debug(*args): def debug(*args):
log(*args, level=DEBUG) log(*args, level=DEBUG)
def info(*args): def info(*args):
log(*args, level=INFO) log(*args, level=INFO)
def warn(*args): def warn(*args):
log(*args, level=WARN) log(*args, level=WARN)
def error(*args): def error(*args):
log(*args, level=ERROR) log(*args, level=ERROR)
@@ -232,7 +278,8 @@ class Logger(object):
def dumpkvs(self): def dumpkvs(self):
if self.level == DISABLED: return if self.level == DISABLED: return
for fmt in self.output_formats: for fmt in self.output_formats:
fmt.writekvs(self.name2val) if isinstance(fmt, KVWriter):
fmt.writekvs(self.name2val)
self.name2val.clear() self.name2val.clear()
def log(self, *args, level=INFO): def log(self, *args, level=INFO):
@@ -255,34 +302,45 @@ class Logger(object):
# ---------------------------------------- # ----------------------------------------
def _do_log(self, args): def _do_log(self, args):
for fmt in self.output_formats: for fmt in self.output_formats:
fmt.writeseq(args) if isinstance(fmt, SeqWriter):
fmt.writeseq(map(str, args))
Logger.DEFAULT = Logger.CURRENT = Logger(dir=None, output_formats=[HumanOutputFormat(sys.stdout)]) Logger.DEFAULT = Logger.CURRENT = Logger(dir=None, output_formats=[HumanOutputFormat(sys.stdout)])
def configure(dir=None, format_strs=None): def configure(dir=None, format_strs=None):
assert Logger.CURRENT is Logger.DEFAULT,\
"Only call logger.configure() when it's in the default state. Try calling logger.reset() first."
prevlogger = Logger.CURRENT
if dir is None: if dir is None:
dir = os.getenv('OPENAI_LOGDIR') dir = os.getenv('OPENAI_LOGDIR')
if dir is None: if dir is None:
dir = osp.join(tempfile.gettempdir(), dir = osp.join(tempfile.gettempdir(),
datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f")) datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"))
assert isinstance(dir, str)
os.makedirs(dir, exist_ok=True)
if format_strs is None: if format_strs is None:
format_strs = LOG_OUTPUT_FORMATS strs = os.getenv('OPENAI_LOG_FORMAT')
format_strs = strs.split(',') if strs else LOG_OUTPUT_FORMATS
output_formats = [make_output_format(f, dir) for f in format_strs] output_formats = [make_output_format(f, dir) for f in format_strs]
Logger.CURRENT = Logger(dir=dir, output_formats=output_formats) Logger.CURRENT = Logger(dir=dir, output_formats=output_formats)
log('Logging to %s'%dir) log('Logging to %s'%dir)
if os.getenv('OPENAI_LOGDIR'):
# if OPENAI_LOGDIR is set, configure the logger on import
# this is kind of nasty (unexpected to the user), but I don't know how else to inject the logger
# to a script that's getting run in a subprocess
configure(dir=os.getenv('OPENAI_LOGDIR'))
def reset(): def reset():
Logger.CURRENT = Logger.DEFAULT if Logger.CURRENT is not Logger.DEFAULT:
log('Reset logger') Logger.CURRENT.close()
Logger.CURRENT = Logger.DEFAULT
log('Reset logger')
class scoped_configure(object):
def __init__(self, dir=None, format_strs=None):
self.dir = dir
self.format_strs = format_strs
self.prevlogger = None
def __enter__(self):
self.prevlogger = Logger.CURRENT
configure(dir=self.dir, format_strs=self.format_strs)
def __exit__(self, *args):
Logger.CURRENT.close()
Logger.CURRENT = self.prevlogger
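A minimal usage sketch of the new context manager (directory and keys are illustrative; this assumes the logger is still in its default state when entering):

from baselines import logger

with logger.scoped_configure(dir='/tmp/eval_logs', format_strs=['stdout', 'csv']):
    logger.logkv('eval_episodes', 10)
    logger.logkv('eval_reward', 21.0)
    logger.dumpkvs()
# on exit the previous Logger.CURRENT is restored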
# ================================================================ # ================================================================
@@ -294,14 +352,14 @@ def _demo():
dir = "/tmp/testlogging" dir = "/tmp/testlogging"
if os.path.exists(dir): if os.path.exists(dir):
shutil.rmtree(dir) shutil.rmtree(dir)
with session(dir=dir): configure(dir=dir)
logkv("a", 3) logkv("a", 3)
logkv("b", 2.5) logkv("b", 2.5)
dumpkvs() dumpkvs()
logkv("b", -2.5) logkv("b", -2.5)
logkv("a", 5.5) logkv("a", 5.5)
dumpkvs() dumpkvs()
info("^^^ should see a = 5.5") info("^^^ should see a = 5.5")
logkv("b", -2.5) logkv("b", -2.5)
dumpkvs() dumpkvs()
@@ -310,5 +368,55 @@ def _demo():
dumpkvs() dumpkvs()
# ================================================================
# Readers
# ================================================================
def read_json(fname):
import pandas
ds = []
with open(fname, 'rt') as fh:
for line in fh:
ds.append(json.loads(line))
return pandas.DataFrame(ds)
def read_csv(fname):
import pandas
return pandas.read_csv(fname, index_col=None, comment='#')
def read_tb(path):
"""
path : a tensorboard file OR a directory, where we will find all TB files
of the form events.*
"""
import pandas
import numpy as np
from glob import glob
from collections import defaultdict
import tensorflow as tf
if osp.isdir(path):
fnames = glob(osp.join(path, "events.*"))
elif osp.basename(path).startswith("events."):
fnames = [path]
else:
raise NotImplementedError("Expected tensorboard file or directory containing them. Got %s"%path)
tag2pairs = defaultdict(list)
maxstep = 0
for fname in fnames:
for summary in tf.train.summary_iterator(fname):
if summary.step > 0:
for v in summary.summary.value:
pair = (summary.step, v.simple_value)
tag2pairs[v.tag].append(pair)
maxstep = max(summary.step, maxstep)
data = np.empty((maxstep, len(tag2pairs)))
data[:] = np.nan
tags = sorted(tag2pairs.keys())
for (colidx,tag) in enumerate(tags):
pairs = tag2pairs[tag]
for (step, value) in pairs:
data[step-1, colidx] = value
return pandas.DataFrame(data, columns=tags)
if __name__ == "__main__": if __name__ == "__main__":
_demo() _demo()
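The new reader helpers return pandas DataFrames. For example, after running the _demo above (which logs to /tmp/testlogging with the default stdout/log/csv formats):

from baselines.logger import read_csv

df = read_csv('/tmp/testlogging/progress.csv')
print(df.columns.tolist())   # e.g. ['a', 'b']
print(len(df))               # one row per dumpkvs() call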
View File
@@ -1,4 +1,3 @@
from baselines.common.mpi_running_mean_std import RunningMeanStd
import baselines.common.tf_util as U import baselines.common.tf_util as U
import tensorflow as tf import tensorflow as tf
import gym import gym
@@ -18,7 +17,7 @@ class CnnPolicy(object):
sequence_length = None sequence_length = None
ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
x = ob / 255.0 x = ob / 255.0
if kind == 'small': # from A3C paper if kind == 'small': # from A3C paper
x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID")) x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
View File
@@ -18,7 +18,7 @@ class MlpPolicy(object):
sequence_length = None sequence_length = None
ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
with tf.variable_scope("obfilter"): with tf.variable_scope("obfilter"):
self.ob_rms = RunningMeanStd(shape=ob_space.shape) self.ob_rms = RunningMeanStd(shape=ob_space.shape)
@@ -27,12 +27,12 @@ class MlpPolicy(object):
for i in range(num_hid_layers): for i in range(num_hid_layers):
last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i"%(i+1), weight_init=U.normc_initializer(1.0))) last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:,0] self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:,0]
last_out = obz last_out = obz
for i in range(num_hid_layers): for i in range(num_hid_layers):
last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i"%(i+1), weight_init=U.normc_initializer(1.0))) last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
mean = U.dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01)) mean = U.dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01))
logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer()) logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1) pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
else: else:
View File
@@ -78,7 +78,7 @@ def add_vtarg_and_adv(seg, gamma, lam):
seg["tdlamret"] = seg["adv"] + seg["vpred"] seg["tdlamret"] = seg["adv"] + seg["vpred"]
def learn(env, policy_func, *, def learn(env, policy_func, *,
timesteps_per_batch, # timesteps per actor per update timesteps_per_actorbatch, # timesteps per actor per update
clip_param, entcoeff, # clipping parameter epsilon, entropy coeff clip_param, entcoeff, # clipping parameter epsilon, entropy coeff
optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers
gamma, lam, # advantage estimation gamma, lam, # advantage estimation
@@ -130,7 +130,7 @@ def learn(env, policy_func, *,
# Prepare for rollouts # Prepare for rollouts
# ---------------------------------------- # ----------------------------------------
seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True)
episodes_so_far = 0 episodes_so_far = 0
timesteps_so_far = 0 timesteps_so_far = 0
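For context on the seg fields used above (seg["adv"], seg["vpred"], seg["tdlamret"]), a hedged numpy sketch of the GAE(lambda) recursion that add_vtarg_and_adv implements, for a single rollout and ignoring episode-boundary masking:

import numpy as np

def gae(rewards, values, gamma=0.99, lam=0.95):
    T = len(rewards)
    adv = np.zeros(T)
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nextvalue = values[t + 1] if t + 1 < T else 0.0   # treat the rollout end as terminal
        delta = rewards[t] + gamma * nextvalue - values[t]
        lastgaelam = delta + gamma * lam * lastgaelam
        adv[t] = lastgaelam
    return adv, adv + values          # (adv, tdlamret)

adv, tdlamret = gae(np.ones(5), np.zeros(5))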
View File
@@ -6,37 +6,34 @@ from baselines import bench
import os.path as osp import os.path as osp
import gym, logging import gym, logging
from baselines import logger from baselines import logger
from baselines.common.atari_wrappers import make_atari, wrap_deepmind
def wrap_train(env): def train(env_id, num_timesteps, seed):
from baselines.common.atari_wrappers import (wrap_deepmind, FrameStack)
env = wrap_deepmind(env, clip_rewards=True)
env = FrameStack(env, 4)
return env
def train(env_id, num_frames, seed):
from baselines.ppo1 import pposgd_simple, cnn_policy from baselines.ppo1 import pposgd_simple, cnn_policy
import baselines.common.tf_util as U import baselines.common.tf_util as U
rank = MPI.COMM_WORLD.Get_rank() rank = MPI.COMM_WORLD.Get_rank()
sess = U.single_threaded_session() sess = U.single_threaded_session()
sess.__enter__() sess.__enter__()
if rank != 0: logger.set_level(logger.DISABLED) if rank == 0:
logger.configure()
else:
logger.configure(format_strs=[])
workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
set_global_seeds(workerseed) set_global_seeds(workerseed)
env = gym.make(env_id) env = make_atari(env_id)
def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613 def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613
return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space) return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)
env = bench.Monitor(env, logger.get_dir() and env = bench.Monitor(env, logger.get_dir() and
osp.join(logger.get_dir(), "%i.monitor.json" % rank)) osp.join(logger.get_dir(), str(rank)))
env.seed(workerseed) env.seed(workerseed)
gym.logger.setLevel(logging.WARN) gym.logger.setLevel(logging.WARN)
env = wrap_train(env) env = wrap_deepmind(env)
num_timesteps = int(num_frames / 4 * 1.1)
env.seed(workerseed) env.seed(workerseed)
pposgd_simple.learn(env, policy_fn, pposgd_simple.learn(env, policy_fn,
max_timesteps=num_timesteps, max_timesteps=int(num_timesteps * 1.1),
timesteps_per_batch=256, timesteps_per_actorbatch=256,
clip_param=0.2, entcoeff=0.01, clip_param=0.2, entcoeff=0.01,
optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64, optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
gamma=0.99, lam=0.95, gamma=0.99, lam=0.95,
@@ -49,8 +46,9 @@ def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--env', help='environment ID', default='PongNoFrameskip-v4') parser.add_argument('--env', help='environment ID', default='PongNoFrameskip-v4')
parser.add_argument('--seed', help='RNG seed', type=int, default=0) parser.add_argument('--seed', help='RNG seed', type=int, default=0)
parser.add_argument('--num-timesteps', type=int, default=int(10e6))
args = parser.parse_args() args = parser.parse_args()
train(args.env, num_frames=40e6, seed=args.seed) train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
if __name__ == '__main__': if __name__ == '__main__':
main() main()
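
The headline change of this commit is OpenCV-based Atari preprocessing behind make_atari/wrap_deepmind, which replace the old wrap_train helper above. The actual wrapper stack lives in baselines/common/atari_wrappers.py and is not shown here; below is a minimal sketch of the grayscale-and-resize step such a pipeline typically performs with cv2 (the 84x84 target size is the conventional DeepMind setting and an assumption on my part).

import cv2
import numpy as np

def warp_frame(frame, width=84, height=84):
    # Convert an RGB Atari frame of shape (H, W, 3) into a single-channel
    # 84x84 observation using OpenCV, the faster path this commit switches to.
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    resized = cv2.resize(gray, (width, height), interpolation=cv2.INTER_AREA)
    return resized[:, :, None].astype(np.uint8)
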

View File

@@ -1,10 +1,8 @@
#!/usr/bin/env python #!/usr/bin/env python
from baselines.common import set_global_seeds, tf_util as U from baselines.common import set_global_seeds, tf_util as U
from baselines import bench from baselines import bench
import os.path as osp
import gym, logging import gym, logging
from baselines import logger from baselines import logger
import sys
def train(env_id, num_timesteps, seed): def train(env_id, num_timesteps, seed):
from baselines.ppo1 import mlp_policy, pposgd_simple from baselines.ppo1 import mlp_policy, pposgd_simple
@@ -14,13 +12,12 @@ def train(env_id, num_timesteps, seed):
def policy_fn(name, ob_space, ac_space): def policy_fn(name, ob_space, ac_space):
return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
hid_size=64, num_hid_layers=2) hid_size=64, num_hid_layers=2)
env = bench.Monitor(env, logger.get_dir() and env = bench.Monitor(env, logger.get_dir())
osp.join(logger.get_dir(), "monitor.json"))
env.seed(seed) env.seed(seed)
gym.logger.setLevel(logging.WARN) gym.logger.setLevel(logging.WARN)
pposgd_simple.learn(env, policy_fn, pposgd_simple.learn(env, policy_fn,
max_timesteps=num_timesteps, max_timesteps=num_timesteps,
timesteps_per_batch=2048, timesteps_per_actorbatch=2048,
clip_param=0.2, entcoeff=0.0, clip_param=0.2, entcoeff=0.0,
optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64, optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
gamma=0.99, lam=0.95, schedule='linear', gamma=0.99, lam=0.95, schedule='linear',
@@ -32,8 +29,10 @@ def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--env', help='environment ID', default='Hopper-v1') parser.add_argument('--env', help='environment ID', default='Hopper-v1')
parser.add_argument('--seed', help='RNG seed', type=int, default=0) parser.add_argument('--seed', help='RNG seed', type=int, default=0)
parser.add_argument('--num-timesteps', type=int, default=int(1e6))
args = parser.parse_args() args = parser.parse_args()
train(args.env, num_timesteps=1e6, seed=args.seed) logger.configure()
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
if __name__ == '__main__': if __name__ == '__main__':
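
The clip_param and entcoeff arguments passed to pposgd_simple.learn above parameterize PPO's clipped surrogate objective. As a reference, a standalone NumPy sketch of that loss; the repo builds the TensorFlow-graph equivalent inside pposgd_simple, which is not part of this diff.

import numpy as np

def ppo_clip_loss(logp_new, logp_old, adv, clip_param=0.2):
    # L^CLIP = -E[ min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t) ]
    ratio = np.exp(logp_new - logp_old)
    surr1 = ratio * adv
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv
    return -np.mean(np.minimum(surr1, surr2))
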

View File

@@ -1,4 +1,3 @@
from baselines.common.mpi_running_mean_std import RunningMeanStd
import baselines.common.tf_util as U import baselines.common.tf_util as U
import tensorflow as tf import tensorflow as tf
import gym import gym
@@ -18,7 +17,7 @@ class CnnPolicy(object):
sequence_length = None sequence_length = None
ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
obscaled = ob / 255.0 obscaled = ob / 255.0
with tf.variable_scope("pol"): with tf.variable_scope("pol"):
@@ -42,7 +41,7 @@ class CnnPolicy(object):
self.state_out = [] self.state_out = []
stochastic = tf.placeholder(dtype=tf.bool, shape=()) stochastic = tf.placeholder(dtype=tf.bool, shape=())
ac = self.pd.sample() # XXX ac = self.pd.sample()
self._act = U.function([stochastic, ob], [ac, self.vpred]) self._act = U.function([stochastic, ob], [ac, self.vpred])
def act(self, stochastic, ob): def act(self, stochastic, ob):
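
The act() method wired up above (via U.function over the stochastic placeholder and the sampled action) is what the rollout generator calls each step. A minimal sketch of that usage against a gym-style environment follows; pi stands for any policy object exposing act(stochastic, ob) -> (action, value), and the bookkeeping is illustrative rather than copied from the repo's traj_segment_generator.

def rollout(pi, env, horizon, stochastic=True):
    # Collect one fixed-length segment of experience from a policy with an
    # act(stochastic, ob) -> (action, value_prediction) interface.
    obs, acs, rews, vpreds, news = [], [], [], [], []
    ob = env.reset()
    new = True
    for _ in range(horizon):
        ac, vpred = pi.act(stochastic, ob)
        obs.append(ob); acs.append(ac); vpreds.append(vpred); news.append(new)
        ob, rew, done, _ = env.step(ac)
        rews.append(rew)
        new = done
        if done:
            ob = env.reset()
    return {"ob": obs, "ac": acs, "rew": rews, "vpred": vpreds, "new": news}
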

View File

@@ -1,45 +1,38 @@
#!/usr/bin/env python #!/usr/bin/env python
from mpi4py import MPI from mpi4py import MPI
from baselines.common import set_global_seeds from baselines.common import set_global_seeds
import os.path as osp import os.path as osp
import gym, logging import gym, logging
from baselines import logger from baselines import logger
from baselines import bench from baselines import bench
import sys from baselines.common.atari_wrappers import make_atari, wrap_deepmind
def wrap_train(env): def train(env_id, num_timesteps, seed):
from baselines.common.atari_wrappers import (wrap_deepmind, FrameStack)
env = wrap_deepmind(env, clip_rewards=False)
env = FrameStack(env, 3)
return env
def train(env_id, num_frames, seed):
from baselines.trpo_mpi.nosharing_cnn_policy import CnnPolicy from baselines.trpo_mpi.nosharing_cnn_policy import CnnPolicy
from baselines.trpo_mpi import trpo_mpi from baselines.trpo_mpi import trpo_mpi
import baselines.common.tf_util as U import baselines.common.tf_util as U
rank = MPI.COMM_WORLD.Get_rank() rank = MPI.COMM_WORLD.Get_rank()
sess = U.single_threaded_session() sess = U.single_threaded_session()
sess.__enter__() sess.__enter__()
if rank != 0: if rank == 0:
logger.set_level(logger.DISABLED) logger.configure()
else:
logger.configure(format_strs=[])
workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
set_global_seeds(workerseed) set_global_seeds(workerseed)
env = gym.make(env_id) env = make_atari(env_id)
def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613 def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613
return CnnPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space) return CnnPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space)
env = bench.Monitor(env, logger.get_dir() and env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
osp.join(logger.get_dir(), "%i.monitor.json"%rank))
env.seed(workerseed) env.seed(workerseed)
gym.logger.setLevel(logging.WARN) gym.logger.setLevel(logging.WARN)
env = wrap_train(env) env = wrap_deepmind(env)
num_timesteps = int(num_frames / 4 * 1.1)
env.seed(workerseed) env.seed(workerseed)
trpo_mpi.learn(env, policy_fn, timesteps_per_batch=512, max_kl=0.001, cg_iters=10, cg_damping=1e-3, trpo_mpi.learn(env, policy_fn, timesteps_per_batch=512, max_kl=0.001, cg_iters=10, cg_damping=1e-3,
max_timesteps=num_timesteps, gamma=0.98, lam=1.0, vf_iters=3, vf_stepsize=1e-4, entcoeff=0.00) max_timesteps=int(num_timesteps * 1.1), gamma=0.98, lam=1.0, vf_iters=3, vf_stepsize=1e-4, entcoeff=0.00)
env.close() env.close()
def main(): def main():
@@ -47,8 +40,9 @@ def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--env', help='environment ID', default='PongNoFrameskip-v4') parser.add_argument('--env', help='environment ID', default='PongNoFrameskip-v4')
parser.add_argument('--seed', help='RNG seed', type=int, default=0) parser.add_argument('--seed', help='RNG seed', type=int, default=0)
parser.add_argument('--num-timesteps', type=int, default=int(10e6))
args = parser.parse_args() args = parser.parse_args()
train(args.env, num_frames=40e6, seed=args.seed) train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
if __name__ == "__main__": if __name__ == "__main__":
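
The logger change repeated across these run scripts (rank 0 calls logger.configure(), every other MPI worker calls logger.configure(format_strs=[])) keeps exactly one process writing output. Pulled out as a small standalone sketch, using only the calls that appear in the diff:

from mpi4py import MPI
from baselines import logger

def configure_mpi_logging():
    # Only the rank-0 worker writes logs; the others get an output-less logger,
    # so MPI runs do not produce duplicated or interleaved log files.
    if MPI.COMM_WORLD.Get_rank() == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
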

View File

@@ -27,8 +27,8 @@ def train(env_id, num_timesteps, seed):
def policy_fn(name, ob_space, ac_space): def policy_fn(name, ob_space, ac_space):
return MlpPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space, return MlpPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space,
hid_size=32, num_hid_layers=2) hid_size=32, num_hid_layers=2)
env = bench.Monitor(env, logger.get_dir() and env = bench.Monitor(env, logger.get_dir() and
osp.join(logger.get_dir(), "%i.monitor.json" % rank)) osp.join(logger.get_dir(), str(rank)))
env.seed(workerseed) env.seed(workerseed)
gym.logger.setLevel(logging.WARN) gym.logger.setLevel(logging.WARN)
@@ -41,8 +41,10 @@ def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--env', help='environment ID', default='Hopper-v1') parser.add_argument('--env', help='environment ID', default='Hopper-v1')
parser.add_argument('--seed', help='RNG seed', type=int, default=0) parser.add_argument('--seed', help='RNG seed', type=int, default=0)
parser.add_argument('--num-timesteps', type=int, default=int(1e6))
args = parser.parse_args() args = parser.parse_args()
train(args.env, num_timesteps=1e6, seed=args.seed) logger.configure()
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
if __name__ == '__main__': if __name__ == '__main__':

View File

@@ -288,4 +288,4 @@ def learn(env, policy_func, *,
logger.dump_tabular() logger.dump_tabular()
def flatten_lists(listoflists): def flatten_lists(listoflists):
return [el for list_ in listoflists for el in list_] return [el for list_ in listoflists for el in list_]
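
A small usage example of the flatten_lists helper shown above; the per-worker episode lists are made-up values, standing in for what trpo_mpi gathers over MPI (for example, episode returns from each worker) before logging.

def flatten_lists(listoflists):
    return [el for list_ in listoflists for el in list_]

# e.g. episode returns gathered from three MPI workers
per_worker_returns = [[21.0, 19.0], [20.0], [18.0, 22.0]]
assert flatten_lists(per_worker_returns) == [21.0, 19.0, 20.0, 18.0, 22.0]
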