Change Atari preprocessing to use faster OpenCV

Some logger changes
John Schulman
2017-10-25 09:21:29 -04:00
parent 4993286230
commit bb40378118
40 changed files with 600 additions and 823 deletions
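The core of the change is visible in the new WarpFrame wrapper further down: the PIL-based grayscale-and-resize is replaced by OpenCV calls. A minimal standalone sketch of that preprocessing step, assuming a standard 210x160x3 RGB Atari frame (the function name warp_frame is just for illustration):

import cv2
import numpy as np

def warp_frame(frame, width=84, height=84):
    # grayscale, then area-interpolated resize, as in the new cv2-based WarpFrame
    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    frame = cv2.resize(frame, (width, height), interpolation=cv2.INTER_AREA)
    return frame[:, :, None]  # keep a trailing channel axis

dummy = np.zeros((210, 160, 3), dtype=np.uint8)  # placeholder Atari frame
assert warp_frame(dummy).shape == (84, 84, 1)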

.gitignore

@@ -30,3 +30,6 @@ src
*.egg-info
.cache
MUJOCO_LOG.TXT


@@ -20,3 +20,14 @@ pip install -e .
- [DQN](baselines/deepq)
- [PPO](baselines/ppo1)
- [TRPO](baselines/trpo_mpi)
To cite this repository in publications:
@misc{baselines,
author = {Hesse, Christopher and Plappert, Matthias and Radford, Alec and Schulman, John and Sidor, Szymon and Wu, Yuhuai},
title = {OpenAI Baselines},
year = {2017},
publisher = {GitHub},
journal = {GitHub repository},
howpublished = {\url{https://github.com/openai/baselines}},
}


@@ -1,9 +1,6 @@
import numpy as np
import tensorflow as tf
from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm, sample, check_shape
from baselines.common.distributions import make_pdtype
import baselines.common.tf_util as U
import gym
from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm, sample
class LnLstmPolicy(object):
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=256, reuse=False):


@@ -5,18 +5,15 @@ from baselines.common import set_global_seeds
from baselines import bench
from baselines.a2c.a2c import learn
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
from baselines.common.atari_wrappers import wrap_deepmind
from baselines.common.atari_wrappers import make_atari, wrap_deepmind
from baselines.a2c.policies import CnnPolicy, LstmPolicy, LnLstmPolicy
def train(env_id, num_frames, seed, policy, lrschedule, num_cpu):
num_timesteps = int(num_frames / 4 * 1.1)
# divide by 4 due to frameskip, then add a little extra so episodes end
def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu):
def make_env(rank):
def _thunk():
env = gym.make(env_id)
env = make_atari(env_id)
env.seed(seed + rank)
env = bench.Monitor(env, logger.get_dir() and
os.path.join(logger.get_dir(), "{}.monitor.json".format(rank)))
env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
gym.logger.setLevel(logging.WARN)
return wrap_deepmind(env)
return _thunk
@@ -28,7 +25,7 @@ def train(env_id, num_frames, seed, policy, lrschedule, num_cpu):
policy_fn = LstmPolicy
elif policy == 'lnlstm':
policy_fn = LnLstmPolicy
learn(policy_fn, env, seed, total_timesteps=num_timesteps, lrschedule=lrschedule)
learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule)
env.close()
def main():
@@ -38,10 +35,10 @@ def main():
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm'], default='cnn')
parser.add_argument('--lrschedule', help='Learning rate schedule', choices=['constant', 'linear'], default='constant')
parser.add_argument('--million_frames', help='How many frames to train (/ 1e6). '
'This number gets divided by 4 due to frameskip', type=int, default=40)
parser.add_argument('--num-timesteps', type=int, default=int(10e6))
args = parser.parse_args()
train(args.env, num_frames=1e6 * args.million_frames, seed=args.seed,
logger.configure()
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed,
policy=args.policy, lrschedule=args.lrschedule, num_cpu=16)
if __name__ == '__main__':


@@ -110,7 +110,6 @@ def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps,
ob_no = np.concatenate([path["observation"] for path in paths])
action_na = np.concatenate([path["action"] for path in paths])
oldac_dist = np.concatenate([path["action_dist"] for path in paths])
logp_n = np.concatenate([path["logp"] for path in paths])
adv_n = np.concatenate(advs)
standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)
@@ -138,3 +137,6 @@ def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps,
callback()
logger.dump_tabular()
i += 1
coord.request_stop()
coord.join(enqueue_threads)


@@ -113,7 +113,6 @@ class Runner(object):
nenv = env.num_envs
self.batch_ob_shape = (nenv*nsteps, nh, nw, nc*nstack)
self.obs = np.zeros((nenv, nh, nw, nc*nstack), dtype=np.uint8)
self.nc = nc
obs = env.reset()
self.update_obs(obs)
self.gamma = gamma
@@ -122,8 +121,8 @@ class Runner(object):
self.dones = [False for _ in range(nenv)]
def update_obs(self, obs):
self.obs = np.roll(self.obs, shift=-self.nc, axis=3)
self.obs[:, :, :, -self.nc:] = obs
self.obs = np.roll(self.obs, shift=-1, axis=3)
self.obs[:, :, :, -1] = obs[:, :, :, 0]
def run(self):
mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
@@ -189,7 +188,8 @@ def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval
runner = Runner(env, model, nsteps=nsteps, nstack=nstack, gamma=gamma)
nbatch = nenvs*nsteps
tstart = time.time()
enqueue_threads = model.q_runner.create_threads(model.sess, coord=tf.train.Coordinator(), start=True)
coord = tf.train.Coordinator()
enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True)
for update in range(1, total_timesteps//nbatch+1):
obs, states, rewards, masks, actions, values = runner.run()
policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
@@ -211,5 +211,6 @@ def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval
savepath = osp.join(logger.get_dir(), 'checkpoint%.5i'%update)
print('Saving to', savepath)
model.save(savepath)
coord.request_stop()
coord.join(enqueue_threads)
env.close()


@@ -1,9 +1,7 @@
import numpy as np
import tensorflow as tf
from baselines.acktr.utils import conv, fc, dense, conv_to_fc, sample, kl_div
from baselines.common.distributions import make_pdtype
import baselines.common.tf_util as U
import gym
class CnnPolicy(object):
@@ -51,7 +49,6 @@ class GaussianMlpPolicy(object):
oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac") # batch of previous actions
oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim*2], name="oldac_dist") # batch of previous action distributions
adv_n = tf.placeholder(tf.float32, shape=[None], name="adv") # advantage function estimate
oldlogprob_n = tf.placeholder(tf.float32, shape=[None], name='oldlogprob') # log probability of previous actions
wd_dict = {}
h1 = tf.nn.tanh(dense(ob_no, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
h2 = tf.nn.tanh(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))


@@ -5,24 +5,22 @@ from baselines.common import set_global_seeds
from baselines import bench
from baselines.acktr.acktr_disc import learn
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
from baselines.common.atari_wrappers import wrap_deepmind
from baselines.common.atari_wrappers import make_atari, wrap_deepmind
from baselines.acktr.policies import CnnPolicy
def train(env_id, num_frames, seed, num_cpu):
num_timesteps = int(num_frames / 4 * 1.1)
def train(env_id, num_timesteps, seed, num_cpu):
def make_env(rank):
def _thunk():
env = gym.make(env_id)
env = make_atari(env_id)
env.seed(seed + rank)
if logger.get_dir():
env = bench.Monitor(env, os.path.join(logger.get_dir(), "{}.monitor.json".format(rank)))
env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
gym.logger.setLevel(logging.WARN)
return wrap_deepmind(env)
return _thunk
set_global_seeds(seed)
env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
policy_fn = CnnPolicy
learn(policy_fn, env, seed, total_timesteps=num_timesteps, nprocs=num_cpu)
learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu)
env.close()
def main():
@@ -30,10 +28,10 @@ def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
parser.add_argument('--million_frames', help='How many frames to train (/ 1e6). '
'This number gets divided by 4 due to frameskip', type=int, default=40)
parser.add_argument('--num-timesteps', type=int, default=int(10e6))
args = parser.parse_args()
train(args.env, num_frames=1e6 * args.million_frames, seed=args.seed, num_cpu=32)
logger.configure()
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, num_cpu=32)
if __name__ == '__main__':


@@ -13,13 +13,12 @@ from baselines.acktr.value_functions import NeuralNetValueFunction
def train(env_id, num_timesteps, seed):
env=gym.make(env_id)
if logger.get_dir():
env = bench.Monitor(env, os.path.join(logger.get_dir(), "monitor.json"))
env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
set_global_seeds(seed)
env.seed(seed)
gym.logger.setLevel(logging.WARN)
with tf.Session(config=tf.ConfigProto()) as session:
with tf.Session(config=tf.ConfigProto()):
ob_dim = env.observation_space.shape[0]
ac_dim = env.action_space.shape[0]
with tf.variable_scope("vf"):
@@ -38,5 +37,7 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Run Mujoco benchmark.')
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
parser.add_argument('--env', help='environment ID', type=str, default="Reacher-v1")
parser.add_argument('--num-timesteps', type=int, default=int(1e6))
args = parser.parse_args()
train(args.env, num_timesteps=1e6, seed=args.seed)
logger.configure()
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)


@@ -13,7 +13,7 @@ class NeuralNetValueFunction(object):
wd_dict = {}
h1 = tf.nn.elu(dense(X, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
h2 = tf.nn.elu(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
vpred_n = dense(h2, 1, "hfinal", weight_init=None, bias_init=0, weight_loss_dict=wd_dict)[:,0]
vpred_n = dense(h2, 1, "hfinal", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)[:,0]
sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
wd_loss = tf.get_collection("vf_losses", None)
loss = U.mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
@@ -22,7 +22,7 @@ class NeuralNetValueFunction(object):
optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9, \
clip_kl=0.3, epsilon=0.1, stats_decay=0.95, \
async=1, kfac_update=2, cold_iter=50, \
weight_decay_dict=wd_dict, max_grad_norm=1.0)
weight_decay_dict=wd_dict, max_grad_norm=None)
vf_var_list = []
for var in tf.trainable_variables():
if "vf" in var.name:


@@ -1,3 +1,3 @@
from baselines.bench.benchmarks import *
from baselines.bench.monitor import *
from baselines.bench.simple_bench import simple_bench


@@ -1,61 +1,71 @@
import os.path as osp
_atari7 = ['BeamRider', 'Breakout', 'Enduro', 'Pong', 'Qbert', 'Seaquest', 'SpaceInvaders']
_atariexpl7 = ['Freeway', 'Gravitar', 'MontezumaRevenge', 'Pitfall', 'PrivateEye', 'Solaris', 'Venture']
_BENCHMARKS = []
def register_benchmark(benchmark):
for b in _BENCHMARKS:
if b['name'] == benchmark['name']:
raise ValueError('Benchmark with name %s already registered!'%b['name'])
raise ValueError('Benchmark with name %s already registered!' % b['name'])
_BENCHMARKS.append(benchmark)
def list_benchmarks():
return [b['name'] for b in _BENCHMARKS]
def get_benchmark(benchmark_name):
for b in _BENCHMARKS:
if b['name'] == benchmark_name:
return b
raise ValueError('%s not found! Known benchmarks: %s' % (benchmark_name, list_benchmarks()))
def get_task(benchmark, env_id):
"""Get a task by env_id. Return None if the benchmark doesn't have the env"""
return next(filter(lambda task: task['env_id'] == env_id, benchmark['tasks']), None)
def find_task_for_env_id_in_any_benchmark(env_id):
for bm in _BENCHMARKS:
for task in bm["tasks"]:
if task["env_id"]==env_id:
if task["env_id"] == env_id:
return bm, task
return None, None
_ATARI_SUFFIX = 'NoFrameskip-v4'
register_benchmark({
'name' : 'Atari200M',
'description' :'7 Atari games from Mnih et al. (2013), with pixel observations, 200M frames',
'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 2, 'num_timesteps' : int(200e6)} for _game in _atari7]
'name': 'Atari50M',
'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 50M timesteps',
'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(50e6)} for _game in _atari7]
})
register_benchmark({
'name' : 'Atari40M',
'description' :'7 Atari games from Mnih et al. (2013), with pixel observations, 40M frames',
'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 2, 'num_timesteps' : int(40e6)} for _game in _atari7]
'name': 'Atari10M',
'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 10M timesteps',
'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atari7]
})
register_benchmark({
'name' : 'Atari1Hr',
'description' :'7 Atari games from Mnih et al. (2013), with pixel observations, 1 hour of walltime',
'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 2, 'num_seconds' : 60*60} for _game in _atari7]
'name': 'Atari1Hr',
'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 1 hour of walltime',
'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_seconds': 60 * 60} for _game in _atari7]
})
register_benchmark({
'name' : 'AtariExploration40M',
'description' :'7 Atari games emphasizing exploration, with pixel observations, 40M frames',
'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 2, 'num_timesteps' : int(40e6)} for _game in _atariexpl7]
'name': 'AtariExploration10M',
'description': '7 Atari games emphasizing exploration, with pixel observations, 10M timesteps',
'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atariexpl7]
})
# MuJoCo
_mujocosmall = [
@@ -63,78 +73,60 @@ _mujocosmall = [
'HalfCheetah-v1', 'Hopper-v1', 'Walker2d-v1',
'Reacher-v1', 'Swimmer-v1']
register_benchmark({
'name' : 'Mujoco1M',
'description' : 'Some small 2D MuJoCo tasks, run for 1M timesteps',
'tasks' : [{'env_id' : _envid, 'trials' : 3, 'num_timesteps' : int(1e6)} for _envid in _mujocosmall]
'name': 'Mujoco1M',
'description': 'Some small 2D MuJoCo tasks, run for 1M timesteps',
'tasks': [{'env_id': _envid, 'trials': 3, 'num_timesteps': int(1e6)} for _envid in _mujocosmall]
})
register_benchmark({
'name' : 'MujocoWalkers',
'description' : 'MuJoCo forward walkers, run for 8M, humanoid 100M',
'tasks' : [
{'env_id' : "Hopper-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
{'env_id' : "Walker2d-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
{'env_id' : "Humanoid-v1", 'trials' : 4, 'num_timesteps' : 100*1000000 },
'name': 'MujocoWalkers',
'description': 'MuJoCo forward walkers, run for 8M, humanoid 100M',
'tasks': [
{'env_id': "Hopper-v1", 'trials': 4, 'num_timesteps': 8 * 1000000},
{'env_id': "Walker2d-v1", 'trials': 4, 'num_timesteps': 8 * 1000000},
{'env_id': "Humanoid-v1", 'trials': 4, 'num_timesteps': 100 * 1000000},
]
})
# To reproduce:
# python3 baselines/baselines/ppo2/ppo2_run_benchmark.py gce MujocoWalkers myrun_ppo2_whiteobs1_cpu8
# (observation input filters necessary)
# Roboschool
register_benchmark({
'name' : 'Roboschool8M',
'description' : 'Small 2D tasks, up to 30 minutes to complete on 8 cores',
'tasks' : [
{'env_id' : "RoboschoolReacher-v1", 'trials' : 4, 'num_timesteps' : 2*1000000 },
{'env_id' : "RoboschoolAnt-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
{'env_id' : "RoboschoolHalfCheetah-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
{'env_id' : "RoboschoolHopper-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
{'env_id' : "RoboschoolWalker2d-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
'name': 'Roboschool8M',
'description': 'Small 2D tasks, up to 30 minutes to complete on 8 cores',
'tasks': [
{'env_id': "RoboschoolReacher-v1", 'trials': 4, 'num_timesteps': 2 * 1000000},
{'env_id': "RoboschoolAnt-v1", 'trials': 4, 'num_timesteps': 8 * 1000000},
{'env_id': "RoboschoolHalfCheetah-v1", 'trials': 4, 'num_timesteps': 8 * 1000000},
{'env_id': "RoboschoolHopper-v1", 'trials': 4, 'num_timesteps': 8 * 1000000},
{'env_id': "RoboschoolWalker2d-v1", 'trials': 4, 'num_timesteps': 8 * 1000000},
]
})
register_benchmark({
'name' : 'RoboschoolHarder',
'description' : 'Test your might!!! Up to 12 hours on 32 cores',
'tasks' : [
{'env_id' : "RoboschoolHumanoid-v1", 'trials' : 4, 'num_timesteps' : 100*1000000 },
{'env_id' : "RoboschoolHumanoidFlagrun-v1", 'trials' : 4, 'num_timesteps' : 200*1000000 },
{'env_id' : "RoboschoolHumanoidFlagrunHarder-v1", 'trials' : 4, 'num_timesteps' : 400*1000000 },
'name': 'RoboschoolHarder',
'description': 'Test your might!!! Up to 12 hours on 32 cores',
'tasks': [
{'env_id': "RoboschoolHumanoid-v1", 'trials': 4, 'num_timesteps': 100 * 1000000},
{'env_id': "RoboschoolHumanoidFlagrun-v1", 'trials': 4, 'num_timesteps': 200 * 1000000},
{'env_id': "RoboschoolHumanoidFlagrunHarder-v1", 'trials': 4, 'num_timesteps': 400 * 1000000},
]
})
# To reproduce:
# python3 baselines/baselines/ppo2/ppo2_run_benchmark.py gce Roboschool8M myrun_ppo2_cpu8
# python3 baselines/baselines/ppo2/ppo2_run_benchmark.py gce RoboschoolHarder myrun_ppo2_cpu32_large_samples65536
# (Large network, train on 65536 samples each iteration. Also, _large is really necessary only for Harder)
# Other
_atari50 = [ # actually 49
_atari50 = [ # actually 47
'Alien', 'Amidar', 'Assault', 'Asterix', 'Asteroids',
'Atlantis', 'BankHeist', 'BattleZone', 'BeamRider', 'Bowling',
'Boxing', 'Breakout', 'Centipede', 'ChopperCommand', 'CrazyClimber',
'Breakout', 'Centipede', 'ChopperCommand', 'CrazyClimber',
'DemonAttack', 'DoubleDunk', 'Enduro', 'FishingDerby', 'Freeway',
'Frostbite', 'Gopher', 'Gravitar', 'IceHockey', 'Jamesbond',
'Kangaroo', 'Krull', 'KungFuMaster', 'MontezumaRevenge', 'MsPacman',
'NameThisGame', 'Pitfall', 'Pong', 'PrivateEye', 'Qbert',
'Riverraid', 'RoadRunner', 'Robotank', 'Seaquest', 'SpaceInvaders',
'StarGunner', 'Tennis', 'TimePilot', 'Tutankham', 'UpNDown',
'Venture', 'VideoPinball', 'WizardOfWor', 'Zaxxon',
'RoadRunner', 'Robotank', 'Seaquest', 'SpaceInvaders', 'StarGunner',
'Tennis', 'TimePilot', 'Tutankham', 'UpNDown', 'Venture',
'VideoPinball', 'WizardOfWor', 'Zaxxon',
]
register_benchmark({
'name' : 'Atari50_40M',
'description' :'7 Atari games from Mnih et al. (2013), with pixel observations, 40M frames',
'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 3, 'num_timesteps' : int(40e6)} for _game in _atari50]
'name': 'Atari50_10M',
'description': '47 Atari games from Mnih et al. (2013), with pixel observations, 10M timesteps',
'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 3, 'num_timesteps': int(10e6)} for _game in _atari50]
})
def env_shortname(s):
"Make typical names above shorter, while keeping recognizable"
s = s.replace("NoFrameskip", "")
if s[:10]=="Roboschool": s = s[10:]
i = s.rfind("-v")
if i!=-1: s = s[:i]
return s.lower()
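For context, a small usage sketch of the registry defined in this file; register_benchmark, list_benchmarks, get_benchmark and get_task are the functions above, and the printed value follows from the Mujoco1M entry (num_timesteps = int(1e6)):

from baselines.bench import benchmarks

print(benchmarks.list_benchmarks())           # includes 'Atari10M', 'Mujoco1M', ...
bm = benchmarks.get_benchmark('Mujoco1M')
task = benchmarks.get_task(bm, 'Hopper-v1')   # None if the env is not part of the benchmark
print(task['num_timesteps'])                  # 1000000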


@@ -2,20 +2,17 @@ __all__ = ['Monitor', 'get_monitor_files', 'load_results']
import gym
from gym.core import Wrapper
from os import path
import time
from glob import glob
try:
import ujson as json # Not necessary for monitor writing, but very useful for monitor loading
except ImportError:
import json
import csv
import os.path as osp
import json
class Monitor(Wrapper):
EXT = "monitor.json"
EXT = "monitor.csv"
f = None
def __init__(self, env, filename, allow_early_resets=False):
def __init__(self, env, filename, allow_early_resets=False, reset_keywords=()):
Wrapper.__init__(self, env=env)
self.tstart = time.time()
if filename is None:
@@ -23,50 +20,38 @@ class Monitor(Wrapper):
self.logger = None
else:
if not filename.endswith(Monitor.EXT):
if osp.isdir(filename):
filename = osp.join(filename, Monitor.EXT)
else:
filename = filename + "." + Monitor.EXT
self.f = open(filename, "wt")
self.logger = JSONLogger(self.f)
self.logger.writekvs({"t_start": self.tstart, "gym_version": gym.__version__,
"env_id": env.spec.id if env.spec else 'Unknown'})
self.f.write('#%s\n'%json.dumps({"t_start": self.tstart, "gym_version": gym.__version__,
"env_id": env.spec.id if env.spec else 'Unknown'}))
self.logger = csv.DictWriter(self.f, fieldnames=('r', 'l', 't')+reset_keywords)
self.logger.writeheader()
self.reset_keywords = reset_keywords
self.allow_early_resets = allow_early_resets
self.rewards = None
self.needs_reset = True
self.episode_rewards = []
self.episode_lengths = []
self.total_steps = 0
self.current_metadata = {} # extra info that gets injected into each log entry
# Useful for metalearning where we're modifying the environment externally
# But want our logs to know about these modifications
self.current_reset_info = {} # extra info about the current episode, that was passed in during reset()
def __getstate__(self): # XXX
d = self.__dict__.copy()
if self.f:
del d['f'], d['logger']
d['_filename'] = self.f.name
d['_num_episodes'] = len(self.episode_rewards)
else:
d['_filename'] = None
return d
def __setstate__(self, d):
filename = d.pop('_filename')
self.__dict__ = d
if filename is not None:
nlines = d.pop('_num_episodes') + 1
self.f = open(filename, "r+t")
for _ in range(nlines):
self.f.readline()
self.f.truncate()
self.logger = JSONLogger(self.f)
def reset(self):
def _reset(self, **kwargs):
if not self.allow_early_resets and not self.needs_reset:
raise RuntimeError("Tried to reset an environment before done. If you want to allow early resets, wrap your env with Monitor(env, path, allow_early_resets=True)")
self.rewards = []
self.needs_reset = False
return self.env.reset()
for k in self.reset_keywords:
v = kwargs.get(k)
if v is None:
raise ValueError('Expected you to pass kwarg %s into reset'%k)
self.current_reset_info[k] = v
return self.env.reset(**kwargs)
def step(self, action):
def _step(self, action):
if self.needs_reset:
raise RuntimeError("Tried to step environment that needs reset")
ob, rew, done, info = self.env.step(action)
@@ -75,10 +60,11 @@ class Monitor(Wrapper):
self.needs_reset = True
eprew = sum(self.rewards)
eplen = len(self.rewards)
epinfo = {"r": eprew, "l": eplen, "t": round(time.time() - self.tstart, 6)}
epinfo.update(self.current_metadata)
epinfo = {"r": round(eprew, 6), "l": eplen, "t": round(time.time() - self.tstart, 6)}
epinfo.update(self.current_reset_info)
if self.logger:
self.logger.writekvs(epinfo)
self.logger.writerow(epinfo)
self.f.flush()
self.episode_rewards.append(eprew)
self.episode_lengths.append(eplen)
info['episode'] = epinfo
@@ -98,52 +84,40 @@ class Monitor(Wrapper):
def get_episode_lengths(self):
return self.episode_lengths
class JSONLogger(object):
def __init__(self, file):
self.file = file
def writekvs(self, kvs):
for k,v in kvs.items():
if hasattr(v, 'dtype'):
v = v.tolist()
kvs[k] = float(v)
self.file.write(json.dumps(kvs) + '\n')
self.file.flush()
class LoadMonitorResultsError(Exception):
pass
def get_monitor_files(dir):
return glob(path.join(dir, "*" + Monitor.EXT))
return glob(osp.join(dir, "*" + Monitor.EXT))
def load_results(dir, raw_episodes=False):
fnames = get_monitor_files(dir)
if not fnames:
def load_results(dir):
import pandas
monitor_files = glob(osp.join(dir, "*monitor.*")) # get both csv and (old) json files
if not monitor_files:
raise LoadMonitorResultsError("no monitor files of the form *%s found in %s" % (Monitor.EXT, dir))
episodes = []
dfs = []
headers = []
for fname in fnames:
for fname in monitor_files:
with open(fname, 'rt') as fh:
if fname.endswith('csv'):
firstline = fh.readline()
assert firstline[0] == '#'
header = json.loads(firstline[1:])
df = pandas.read_csv(fh, index_col=None)
headers.append(header)
elif fname.endswith('json'): # Deprecated json format
episodes = []
lines = fh.readlines()
header = json.loads(lines[0])
headers.append(header)
for line in lines[1:]:
episode = json.loads(line)
episode['abstime'] = header['t_start'] + episode['t']
del episode['t']
episodes.append(episode)
header0 = headers[0]
for header in headers[1:]:
assert header['env_id'] == header0['env_id'], "mixing data from two envs"
episodes = sorted(episodes, key=lambda e: e['abstime'])
if raw_episodes:
return episodes
else:
return {
'env_info': {'env_id': header0['env_id'], 'gym_version': header0['gym_version']},
'episode_end_times': [e['abstime'] for e in episodes],
'episode_lengths': [e['l'] for e in episodes],
'episode_rewards': [e['r'] for e in episodes],
'initial_reset_time': min([min(header['t_start'] for header in headers)])
}
df = pandas.DataFrame(episodes)
df['t'] += header['t_start']
dfs.append(df)
df = pandas.concat(dfs)
df.sort_values('t', inplace=True)
df['t'] -= min(header['t_start'] for header in headers)
df.headers = headers # HACK to preserve backwards compatibility
return df
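With this change each Monitor writes a *.monitor.csv file: one '#'-prefixed JSON metadata line, then a CSV header r,l,t with one row per episode, and load_results now returns a pandas DataFrame. A hedged sketch of the round trip, assuming any small Gym env (CartPole-v0 here) and the gym version this code targets, where wrappers dispatch to _step/_reset:

import os, tempfile
import gym
from baselines.bench import Monitor, load_results

logdir = tempfile.mkdtemp()
env = Monitor(gym.make('CartPole-v0'), os.path.join(logdir, '0'))  # writes <logdir>/0.monitor.csv
ob = env.reset()
done = False
while not done:
    ob, rew, done, info = env.step(env.action_space.sample())
print(info['episode'])         # {'r': ..., 'l': ..., 't': ...} injected on episode end
print(load_results(logdir))    # DataFrame with columns r, l, t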


@@ -1,9 +1,8 @@
import numpy as np
from collections import deque
from PIL import Image
import gym
from gym import spaces
import cv2
class NoopResetEnv(gym.Wrapper):
def __init__(self, env, noop_max=30):
@@ -13,11 +12,16 @@ class NoopResetEnv(gym.Wrapper):
gym.Wrapper.__init__(self, env)
self.noop_max = noop_max
self.override_num_noops = None
if isinstance(env.action_space, gym.spaces.MultiBinary):
self.noop_action = np.zeros(self.env.action_space.n, dtype=np.int64)
else:
# used for atari environments
self.noop_action = 0
assert env.unwrapped.get_action_meanings()[0] == 'NOOP'
def _reset(self):
def _reset(self, **kwargs):
""" Do no-op action for a number of steps in [1, noop_max]."""
self.env.reset()
self.env.reset(**kwargs)
if self.override_num_noops is not None:
noops = self.override_num_noops
else:
@@ -25,9 +29,9 @@ class NoopResetEnv(gym.Wrapper):
assert noops > 0
obs = None
for _ in range(noops):
obs, _, done, _ = self.env.step(0)
obs, _, done, _ = self.env.step(self.noop_action)
if done:
obs = self.env.reset()
obs = self.env.reset(**kwargs)
return obs
class FireResetEnv(gym.Wrapper):
@@ -37,14 +41,14 @@ class FireResetEnv(gym.Wrapper):
assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
assert len(env.unwrapped.get_action_meanings()) >= 3
def _reset(self):
self.env.reset()
def _reset(self, **kwargs):
self.env.reset(**kwargs)
obs, _, done, _ = self.env.step(1)
if done:
self.env.reset()
self.env.reset(**kwargs)
obs, _, done, _ = self.env.step(2)
if done:
self.env.reset()
self.env.reset(**kwargs)
return obs
class EpisodicLifeEnv(gym.Wrapper):
@@ -70,13 +74,13 @@ class EpisodicLifeEnv(gym.Wrapper):
self.lives = lives
return obs, reward, done, info
def _reset(self):
def _reset(self, **kwargs):
"""Reset only when lives are exhausted.
This way all states are still reachable even though lives are episodic,
and the learner need not know about any of this behind-the-scenes.
"""
if self.was_real_done:
obs = self.env.reset()
obs = self.env.reset(**kwargs)
else:
# no-op step to advance from terminal/lost life state
obs, _, _, _ = self.env.step(0)
@@ -88,30 +92,26 @@ class MaxAndSkipEnv(gym.Wrapper):
"""Return only every `skip`-th frame"""
gym.Wrapper.__init__(self, env)
# most recent raw observations (for max pooling across time steps)
self._obs_buffer = deque(maxlen=2)
self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype='uint8')
self._skip = skip
def _step(self, action):
"""Repeat action, sum reward, and max over last observations."""
total_reward = 0.0
done = None
for _ in range(self._skip):
for i in range(self._skip):
obs, reward, done, info = self.env.step(action)
self._obs_buffer.append(obs)
if i == self._skip - 2: self._obs_buffer[0] = obs
if i == self._skip - 1: self._obs_buffer[1] = obs
total_reward += reward
if done:
break
max_frame = np.max(np.stack(self._obs_buffer), axis=0)
# Note that the observation on the done=True frame
# doesn't matter
max_frame = self._obs_buffer.max(axis=0)
return max_frame, total_reward, done, info
def _reset(self):
"""Clear past frame buffer and init. to first obs. from inner env."""
self._obs_buffer.clear()
obs = self.env.reset()
self._obs_buffer.append(obs)
return obs
class ClipRewardEnv(gym.RewardWrapper):
def _reward(self, reward):
"""Bin reward to {+1, 0, -1} by its sign."""
@@ -121,52 +121,89 @@ class WarpFrame(gym.ObservationWrapper):
def __init__(self, env):
"""Warp frames to 84x84 as done in the Nature paper and later work."""
gym.ObservationWrapper.__init__(self, env)
self.res = 84
self.observation_space = spaces.Box(low=0, high=255, shape=(self.res, self.res, 1))
self.width = 84
self.height = 84
self.observation_space = spaces.Box(low=0, high=255, shape=(self.height, self.width, 1))
def _observation(self, obs):
frame = np.dot(obs.astype('float32'), np.array([0.299, 0.587, 0.114], 'float32'))
frame = np.array(Image.fromarray(frame).resize((self.res, self.res),
resample=Image.BILINEAR), dtype=np.uint8)
return frame.reshape((self.res, self.res, 1))
def _observation(self, frame):
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA)
return frame[:, :, None]
class FrameStack(gym.Wrapper):
def __init__(self, env, k):
"""Buffer observations and stack across channels (last axis)."""
"""Stack k last frames.
Returns lazy array, which is much more memory efficient.
See Also
--------
baselines.common.atari_wrappers.LazyFrames
"""
gym.Wrapper.__init__(self, env)
self.k = k
self.frames = deque([], maxlen=k)
shp = env.observation_space.shape
assert shp[2] == 1 # can only stack 1-channel frames
self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], k))
self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k))
def _reset(self):
"""Clear buffer and re-fill by duplicating the first observation."""
ob = self.env.reset()
for _ in range(self.k): self.frames.append(ob)
return self._observation()
for _ in range(self.k):
self.frames.append(ob)
return self._get_ob()
def _step(self, action):
ob, reward, done, info = self.env.step(action)
self.frames.append(ob)
return self._observation(), reward, done, info
return self._get_ob(), reward, done, info
def _observation(self):
def _get_ob(self):
assert len(self.frames) == self.k
return np.concatenate(self.frames, axis=2)
return LazyFrames(list(self.frames))
def wrap_deepmind(env, episode_life=True, clip_rewards=True):
"""Configure environment for DeepMind-style Atari.
class ScaledFloatFrame(gym.ObservationWrapper):
def _observation(self, observation):
# careful! This undoes the memory optimization, use
# with smaller replay buffers only.
return np.array(observation).astype(np.float32) / 255.0
Note: this does not include frame stacking!"""
assert 'NoFrameskip' in env.spec.id # required for DeepMind-style skip
if episode_life:
env = EpisodicLifeEnv(env)
class LazyFrames(object):
def __init__(self, frames):
"""This object ensures that common frames between the observations are only stored once.
It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay
buffers.
This object should only be converted to numpy array before being passed to the model.
You'd not believe how complex the previous solution was."""
self._frames = frames
def __array__(self, dtype=None):
out = np.concatenate(self._frames, axis=2)
if dtype is not None:
out = out.astype(dtype)
return out
def make_atari(env_id):
env = gym.make(env_id)
assert 'NoFrameskip' in env.spec.id
env = NoopResetEnv(env, noop_max=30)
env = MaxAndSkipEnv(env, skip=4)
return env
def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False):
"""Configure environment for DeepMind-style Atari.
"""
if episode_life:
env = EpisodicLifeEnv(env)
if 'FIRE' in env.unwrapped.get_action_meanings():
env = FireResetEnv(env)
env = WarpFrame(env)
if scale:
env = ScaledFloatFrame(env)
if clip_rewards:
env = ClipRewardEnv(env)
if frame_stack:
env = FrameStack(env, 4)
return env
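A short usage sketch of the two new entry points; make_atari applies NoopResetEnv and MaxAndSkipEnv, wrap_deepmind adds the warp/clip/stack wrappers, and with frame_stack=True observations come back as LazyFrames that should be materialized with np.array only when fed to the model:

import numpy as np
from baselines.common.atari_wrappers import make_atari, wrap_deepmind

env = wrap_deepmind(make_atari('BreakoutNoFrameskip-v4'), frame_stack=True)
ob = env.reset()          # a LazyFrames object, not an ndarray
frames = np.array(ob)     # materialize: shape (84, 84, 4), dtype uint8
assert frames.shape == (84, 84, 4)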


@@ -1,239 +0,0 @@
import cv2
import gym
import numpy as np
from collections import deque
from gym import spaces
class NoopResetEnv(gym.Wrapper):
def __init__(self, env=None, noop_max=30):
"""Sample initial states by taking random number of no-ops on reset.
No-op is assumed to be action 0.
"""
super(NoopResetEnv, self).__init__(env)
self.noop_max = noop_max
self.override_num_noops = None
assert env.unwrapped.get_action_meanings()[0] == 'NOOP'
def _reset(self):
""" Do no-op action for a number of steps in [1, noop_max]."""
self.env.reset()
if self.override_num_noops is not None:
noops = self.override_num_noops
else:
noops = np.random.randint(1, self.noop_max + 1)
assert noops > 0
obs = None
for _ in range(noops):
obs, _, done, _ = self.env.step(0)
if done:
obs = self.env.reset()
return obs
class FireResetEnv(gym.Wrapper):
def __init__(self, env=None):
"""For environments where the user need to press FIRE for the game to start."""
super(FireResetEnv, self).__init__(env)
assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
assert len(env.unwrapped.get_action_meanings()) >= 3
def _reset(self):
self.env.reset()
obs, _, done, _ = self.env.step(1)
if done:
self.env.reset()
obs, _, done, _ = self.env.step(2)
if done:
self.env.reset()
return obs
class EpisodicLifeEnv(gym.Wrapper):
def __init__(self, env=None):
"""Make end-of-life == end-of-episode, but only reset on true game over.
Done by DeepMind for the DQN and co. since it helps value estimation.
"""
super(EpisodicLifeEnv, self).__init__(env)
self.lives = 0
self.was_real_done = True
self.was_real_reset = False
def _step(self, action):
obs, reward, done, info = self.env.step(action)
self.was_real_done = done
# check current lives, make loss of life terminal,
# then update lives to handle bonus lives
lives = self.env.unwrapped.ale.lives()
if lives < self.lives and lives > 0:
# for Qbert sometimes we stay in lives == 0 condition for a few frames
# so it's important to keep lives > 0, so that we only reset once
# the environment advertises done.
done = True
self.lives = lives
return obs, reward, done, info
def _reset(self):
"""Reset only when lives are exhausted.
This way all states are still reachable even though lives are episodic,
and the learner need not know about any of this behind-the-scenes.
"""
if self.was_real_done:
obs = self.env.reset()
self.was_real_reset = True
else:
# no-op step to advance from terminal/lost life state
obs, _, _, _ = self.env.step(0)
self.was_real_reset = False
self.lives = self.env.unwrapped.ale.lives()
return obs
class MaxAndSkipEnv(gym.Wrapper):
def __init__(self, env=None, skip=4):
"""Return only every `skip`-th frame"""
super(MaxAndSkipEnv, self).__init__(env)
# most recent raw observations (for max pooling across time steps)
self._obs_buffer = deque(maxlen=2)
self._skip = skip
def _step(self, action):
total_reward = 0.0
done = None
for _ in range(self._skip):
obs, reward, done, info = self.env.step(action)
self._obs_buffer.append(obs)
total_reward += reward
if done:
break
max_frame = np.max(np.stack(self._obs_buffer), axis=0)
return max_frame, total_reward, done, info
def _reset(self):
"""Clear past frame buffer and init. to first obs. from inner env."""
self._obs_buffer.clear()
obs = self.env.reset()
self._obs_buffer.append(obs)
return obs
class ProcessFrame84(gym.ObservationWrapper):
def __init__(self, env=None):
super(ProcessFrame84, self).__init__(env)
self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1))
def _observation(self, obs):
return ProcessFrame84.process(obs)
@staticmethod
def process(frame):
if frame.size == 210 * 160 * 3:
img = np.reshape(frame, [210, 160, 3]).astype(np.float32)
elif frame.size == 250 * 160 * 3:
img = np.reshape(frame, [250, 160, 3]).astype(np.float32)
else:
assert False, "Unknown resolution."
img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114
resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_AREA)
x_t = resized_screen[18:102, :]
x_t = np.reshape(x_t, [84, 84, 1])
return x_t.astype(np.uint8)
class ClippedRewardsWrapper(gym.RewardWrapper):
def _reward(self, reward):
"""Change all the positive rewards to 1, negative to -1 and keep zero."""
return np.sign(reward)
class LazyFrames(object):
def __init__(self, frames):
"""This object ensures that common frames between the observations are only stored once.
It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay
buffers.
This object should only be converted to numpy array before being passed to the model.
You'd not believe how complex the previous solution was."""
self._frames = frames
def __array__(self, dtype=None):
out = np.concatenate(self._frames, axis=2)
if dtype is not None:
out = out.astype(dtype)
return out
class FrameStack(gym.Wrapper):
def __init__(self, env, k):
"""Stack k last frames.
Returns lazy array, which is much more memory efficient.
See Also
--------
baselines.common.atari_wrappers.LazyFrames
"""
gym.Wrapper.__init__(self, env)
self.k = k
self.frames = deque([], maxlen=k)
shp = env.observation_space.shape
self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k))
def _reset(self):
ob = self.env.reset()
for _ in range(self.k):
self.frames.append(ob)
return self._get_ob()
def _step(self, action):
ob, reward, done, info = self.env.step(action)
self.frames.append(ob)
return self._get_ob(), reward, done, info
def _get_ob(self):
assert len(self.frames) == self.k
return LazyFrames(list(self.frames))
class ScaledFloatFrame(gym.ObservationWrapper):
def _observation(self, obs):
# careful! This undoes the memory optimization, use
# with smaller replay buffers only.
return np.array(obs).astype(np.float32) / 255.0
def wrap_dqn(env):
"""Apply a common set of wrappers for Atari games."""
assert 'NoFrameskip' in env.spec.id
env = EpisodicLifeEnv(env)
env = NoopResetEnv(env, noop_max=30)
env = MaxAndSkipEnv(env, skip=4)
if 'FIRE' in env.unwrapped.get_action_meanings():
env = FireResetEnv(env)
env = ProcessFrame84(env)
env = FrameStack(env, 4)
env = ClippedRewardsWrapper(env)
return env
class A2cProcessFrame(gym.Wrapper):
def __init__(self, env):
gym.Wrapper.__init__(self, env)
self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1))
def _step(self, action):
ob, reward, done, info = self.env.step(action)
return A2cProcessFrame.process(ob), reward, done, info
def _reset(self):
return A2cProcessFrame.process(self.env.reset())
@staticmethod
def process(frame):
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
frame = cv2.resize(frame, (84, 84), interpolation=cv2.INTER_AREA)
return frame.reshape(84, 84, 1)


@@ -10,10 +10,7 @@ except ImportError:
from shutil import unpack_archive
from threading import Event
"""TODOS:
- use Azure snapshots instead of hacky backups
"""
# TODOS: use Azure snapshots instead of hacky backups
def fixed_list_blobs(service, *args, **kwargs):
"""By defualt list_containers only returns a subset of results.
@@ -37,7 +34,7 @@ def make_archive(source_path, dest_path):
prefix_path = os.path.dirname(source_path)
with zipfile.ZipFile(dest_path, "w", compression=zipfile.ZIP_STORED) as zf:
if os.path.isdir(source_path):
for dirname, subdirs, files in os.walk(source_path):
for dirname, _subdirs, files in os.walk(source_path):
zf.write(dirname, os.path.relpath(dirname, prefix_path))
for filename in files:
filepath = os.path.join(dirname, filename)


@@ -2,7 +2,6 @@ import tensorflow as tf
import numpy as np
import baselines.common.tf_util as U
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn
class Pd(object):
"""


@@ -4,7 +4,6 @@ import os
import pickle
import random
import tempfile
import time
import zipfile
@@ -153,76 +152,6 @@ class RunningAvg(object):
"""Get the current estimate"""
return self._value
class SimpleMonitor(gym.Wrapper):
def __init__(self, env):
"""Adds two qunatities to info returned by every step:
num_steps: int
Number of steps takes so far
rewards: [float]
All the cumulative rewards for the episodes completed so far.
"""
super().__init__(env)
# current episode state
self._current_reward = None
self._num_steps = None
# temporary monitor state that we do not save
self._time_offset = None
self._total_steps = None
# monitor state
self._episode_rewards = []
self._episode_lengths = []
self._episode_end_times = []
def _reset(self):
obs = self.env.reset()
# recompute temporary state if needed
if self._time_offset is None:
self._time_offset = time.time()
if len(self._episode_end_times) > 0:
self._time_offset -= self._episode_end_times[-1]
if self._total_steps is None:
self._total_steps = sum(self._episode_lengths)
# update monitor state
if self._current_reward is not None:
self._episode_rewards.append(self._current_reward)
self._episode_lengths.append(self._num_steps)
self._episode_end_times.append(time.time() - self._time_offset)
# reset episode state
self._current_reward = 0
self._num_steps = 0
return obs
def _step(self, action):
obs, rew, done, info = self.env.step(action)
self._current_reward += rew
self._num_steps += 1
self._total_steps += 1
info['steps'] = self._total_steps
info['rewards'] = self._episode_rewards
return (obs, rew, done, info)
def get_state(self):
return {
'env_id': self.env.unwrapped.spec.id,
'episode_data': {
'episode_rewards': self._episode_rewards,
'episode_lengths': self._episode_lengths,
'episode_end_times': self._episode_end_times,
'initial_reset_time': 0,
}
}
def set_state(self, state):
assert state['env_id'] == self.env.unwrapped.spec.id
ed = state['episode_data']
self._episode_rewards = ed['episode_rewards']
self._episode_lengths = ed['episode_lengths']
self._episode_end_times = ed['episode_end_times']
def boolean_flag(parser, name, default=False, help=None):
"""Add a boolean flag to argparse parser.


@@ -6,51 +6,41 @@ import copy
import os
import collections
# ================================================================
# Make consistent with numpy
# ================================================================
clip = tf.clip_by_value
def sum(x, axis=None, keepdims=False):
axis = None if axis is None else [axis]
return tf.reduce_sum(x, axis=axis, keep_dims=keepdims)
def mean(x, axis=None, keepdims=False):
axis = None if axis is None else [axis]
return tf.reduce_mean(x, axis=axis, keep_dims=keepdims)
def var(x, axis=None, keepdims=False):
meanx = mean(x, axis=axis, keepdims=keepdims)
return mean(tf.square(x - meanx), axis=axis, keepdims=keepdims)
def std(x, axis=None, keepdims=False):
return tf.sqrt(var(x, axis=axis, keepdims=keepdims))
def max(x, axis=None, keepdims=False):
axis = None if axis is None else [axis]
return tf.reduce_max(x, axis=axis, keep_dims=keepdims)
def min(x, axis=None, keepdims=False):
axis = None if axis is None else [axis]
return tf.reduce_min(x, axis=axis, keep_dims=keepdims)
def concatenate(arrs, axis=0):
return tf.concat(axis=axis, values=arrs)
def argmax(x, axis=None):
return tf.argmax(x, axis=axis)
def switch(condition, then_expression, else_expression):
"""Switches between two operations depending on a scalar value (int or bool).
Note that both `then_expression` and `else_expression`
@@ -72,35 +62,29 @@ def switch(condition, then_expression, else_expression):
# Extras
# ================================================================
def l2loss(params):
if len(params) == 0:
return tf.constant(0.0)
else:
return tf.add_n([sum(tf.square(p)) for p in params])
def lrelu(x, leak=0.2):
f1 = 0.5 * (1 + leak)
f2 = 0.5 * (1 - leak)
return f1 * x + f2 * abs(x)
def categorical_sample_logits(X):
# https://github.com/tensorflow/tensorflow/issues/456
U = tf.random_uniform(tf.shape(X))
return argmax(X - tf.log(-tf.log(U)), axis=1)
# ================================================================
# Inputs
# ================================================================
def is_placeholder(x):
return type(x) is tf.Tensor and len(x.op.inputs) == 0
class TfInput(object):
def __init__(self, name="(unnamed)"):
"""Generalized Tensorflow placeholder. The main differences are:
@@ -119,7 +103,6 @@ class TfInput(object):
"""Given data input it to the placeholder(s)."""
raise NotImplementedError()
class PlacholderTfInput(TfInput):
def __init__(self, placeholder):
"""Wrapper for regular tensorflow placeholder."""
@@ -132,7 +115,6 @@ class PlacholderTfInput(TfInput):
def make_feed_dict(self, data):
return {self._placeholder: data}
class BatchInput(PlacholderTfInput):
def __init__(self, shape, dtype=tf.float32, name=None):
"""Creates a placeholder for a batch of tensors of a given shape and dtype
@@ -148,7 +130,6 @@ class BatchInput(PlacholderTfInput):
"""
super().__init__(tf.placeholder(dtype, [None] + list(shape), name=name))
class Uint8Input(PlacholderTfInput):
def __init__(self, shape, name=None):
"""Takes input in uint8 format which is cast to float32 and divided by 255
@@ -171,7 +152,6 @@ class Uint8Input(PlacholderTfInput):
def get(self):
return self._output
def ensure_tf_input(thing):
"""Takes either tf.placeholder of TfInput and outputs equivalent TfInput"""
if isinstance(thing, TfInput):
@@ -185,7 +165,6 @@ def ensure_tf_input(thing):
# Mathematical utils
# ================================================================
def huber_loss(x, delta=1.0):
"""Reference: https://en.wikipedia.org/wiki/Huber_loss"""
return tf.where(
@@ -198,7 +177,6 @@ def huber_loss(x, delta=1.0):
# Optimizer utils
# ================================================================
def minimize_and_clip(optimizer, objective, var_list, clip_val=10):
"""Minimized `objective` using `optimizer` w.r.t. variables in
`var_list` while ensure the norm of the gradients for each
@@ -210,7 +188,6 @@ def minimize_and_clip(optimizer, objective, var_list, clip_val=10):
gradients[i] = (tf.clip_by_norm(grad, clip_val), var)
return optimizer.apply_gradients(gradients)
# ================================================================
# Global session
# ================================================================
@@ -219,7 +196,6 @@ def get_session():
"""Returns recently made Tensorflow session"""
return tf.get_default_session()
def make_session(num_cpu):
"""Returns a session that will use <num_cpu> CPU's only"""
tf_config = tf.ConfigProto(
@@ -227,31 +203,25 @@ def make_session(num_cpu):
intra_op_parallelism_threads=num_cpu)
return tf.Session(config=tf_config)
def single_threaded_session():
"""Returns a session which will only use a single CPU"""
return make_session(1)
ALREADY_INITIALIZED = set()
def initialize():
"""Initialize all the uninitialized variables in the global scope."""
new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED
get_session().run(tf.variables_initializer(new_variables))
ALREADY_INITIALIZED.update(new_variables)
def eval(expr, feed_dict=None):
if feed_dict is None:
feed_dict = {}
return get_session().run(expr, feed_dict=feed_dict)
VALUE_SETTERS = collections.OrderedDict()
def set_value(v, val):
global VALUE_SETTERS
if v in VALUE_SETTERS:
@@ -262,17 +232,14 @@ def set_value(v, val):
VALUE_SETTERS[v] = (set_op, set_endpoint)
get_session().run(set_op, feed_dict={set_endpoint: val})
# ================================================================
# Saving variables
# ================================================================
def load_state(fname):
saver = tf.train.Saver()
saver.restore(get_session(), fname)
def save_state(fname):
os.makedirs(os.path.dirname(fname), exist_ok=True)
saver = tf.train.Saver()
@@ -282,7 +249,6 @@ def save_state(fname):
# Model components
# ================================================================
def normc_initializer(std=1.0):
def _initializer(shape, dtype=None, partition_info=None): # pylint: disable=W0613
out = np.random.randn(*shape).astype(np.float32)
@@ -290,7 +256,6 @@ def normc_initializer(std=1.0):
return tf.constant(out)
return _initializer
def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME", dtype=tf.float32, collections=None,
summary_tag=None):
with tf.variable_scope(name):
@@ -320,7 +285,6 @@ def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME",
return tf.nn.conv2d(x, w, stride_shape, pad) + b
def dense(x, size, name, weight_init=None, bias=True):
w = tf.get_variable(name + "/w", [x.get_shape()[1], size], initializer=weight_init)
ret = tf.matmul(x, w)
@@ -330,7 +294,6 @@ def dense(x, size, name, weight_init=None, bias=True):
else:
return ret
def wndense(x, size, name, init_scale=1.0):
v = tf.get_variable(name + "/V", [int(x.get_shape()[1]), size],
initializer=tf.random_normal_initializer(0, 0.05))
@@ -342,11 +305,9 @@ def wndense(x, size, name, init_scale=1.0):
scaler = g / tf.sqrt(sum(tf.square(v), axis=0, keepdims=True))
return tf.reshape(scaler, [1, size]) * x + tf.reshape(b, [1, size])
def densenobias(x, size, name, weight_init=None):
return dense(x, size, name, weight_init=weight_init, bias=False)
def dropout(x, pkeep, phase=None, mask=None):
mask = tf.floor(pkeep + tf.random_uniform(tf.shape(x))) if mask is None else mask
if phase is None:
@@ -354,13 +315,10 @@ def dropout(x, pkeep, phase=None, mask=None):
else:
return switch(phase, mask * x, pkeep * x)
# ================================================================
# Theano-like Function
# ================================================================
def function(inputs, outputs, updates=None, givens=None):
"""Just like Theano function. Take a bunch of tensorflow placeholders and expressions
computed based on those placeholders and produces f(inputs) -> outputs. Function f takes
@@ -401,7 +359,6 @@ def function(inputs, outputs, updates=None, givens=None):
f = _Function(inputs, [outputs], updates, givens=givens)
return lambda *args, **kwargs: f(*args, **kwargs)[0]
class _Function(object):
def __init__(self, inputs, outputs, updates, givens, check_nan=False):
for inpt in inputs:
@@ -448,7 +405,6 @@ class _Function(object):
raise RuntimeError("Nan detected")
return results
def mem_friendly_function(nondata_inputs, data_inputs, outputs, batch_size):
if isinstance(outputs, list):
return _MemFriendlyFunction(nondata_inputs, data_inputs, outputs, batch_size)
@@ -456,7 +412,6 @@ def mem_friendly_function(nondata_inputs, data_inputs, outputs, batch_size):
f = _MemFriendlyFunction(nondata_inputs, data_inputs, [outputs], batch_size)
return lambda *inputs: f(*inputs)[0]
class _MemFriendlyFunction(object):
def __init__(self, nondata_inputs, data_inputs, outputs, batch_size):
self.nondata_inputs = nondata_inputs
@@ -490,7 +445,6 @@ class _MemFriendlyFunction(object):
# Modules
# ================================================================
class Module(object):
def __init__(self, name):
self.name = name
@@ -528,7 +482,6 @@ class Module(object):
assert self.scope is not None, "need to call module once before getting variables"
return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)
def module(name):
@functools.wraps
def wrapper(f):
@@ -542,14 +495,11 @@ def module(name):
# Graph traversal
# ================================================================
VARIABLES = {}
def get_parents(node):
return node.op.inputs
def topsorted(outputs):
"""
Topological sort via non-recursive depth-first search
@@ -586,7 +536,6 @@ def topsorted(outputs):
stack.append((j, 0))
return out
# ================================================================
# Flat vectors
# ================================================================
@@ -597,15 +546,12 @@ def var_shape(x):
"shape function assumes that shape is fully known"
return out
def numel(x):
return intprod(var_shape(x))
def intprod(x):
return int(np.prod(x))
def flatgrad(loss, var_list, clip_norm=None):
grads = tf.gradients(loss, var_list)
if clip_norm is not None:
@@ -615,7 +561,6 @@ def flatgrad(loss, var_list, clip_norm=None):
for (v, grad) in zip(var_list, grads)
])
class SetFromFlat(object):
def __init__(self, var_list, dtype=tf.float32):
assigns = []
@@ -634,7 +579,6 @@ class SetFromFlat(object):
def __call__(self, theta):
get_session().run(self.op, feed_dict={self.theta: theta})
class GetFlat(object):
def __init__(self, var_list):
self.op = tf.concat(axis=0, values=[tf.reshape(v, [numel(v)]) for v in var_list])
@@ -646,7 +590,6 @@ class GetFlat(object):
# Misc
# ================================================================
def fancy_slice_2d(X, inds0, inds1):
"""
like numpy X[inds0, inds1]
@@ -659,12 +602,10 @@ def fancy_slice_2d(X, inds0, inds1):
Xflat = tf.reshape(X, [-1])
return tf.gather(Xflat, inds0 * ncols + inds1)
# ================================================================
# Scopes
# ================================================================
def scope_vars(scope, trainable_only=False):
"""
Get variables inside a scope
@@ -687,17 +628,14 @@ def scope_vars(scope, trainable_only=False):
scope=scope if isinstance(scope, str) else scope.name
)
def scope_name():
"""Returns the name of current scope as a string, e.g. deepq/q_func"""
return tf.get_variable_scope().name
def absolute_scope_name(relative_scope_name):
"""Appends parent scope name to `relative_scope_name`"""
return scope_name() + "/" + relative_scope_name
def lengths_to_mask(lengths_b, max_length):
"""
Turns a vector of lengths into a boolean mask
@@ -715,7 +653,6 @@ def lengths_to_mask(lengths_b, max_length):
mask_bt = tf.expand_dims(tf.range(max_length), 0) < tf.expand_dims(lengths_b, 1)
return mask_bt
def in_session(f):
@functools.wraps(f)
def newfunc(*args, **kwargs):
@@ -723,10 +660,8 @@ def in_session(f):
f(*args, **kwargs)
return newfunc
_PLACEHOLDER_CACHE = {} # name -> (placeholder, dtype, shape)
def get_placeholder(name, dtype, shape):
if name in _PLACEHOLDER_CACHE:
out, dtype1, shape1 = _PLACEHOLDER_CACHE[name]
@@ -737,15 +672,12 @@ def get_placeholder(name, dtype, shape):
_PLACEHOLDER_CACHE[name] = (out, dtype, shape)
return out
def get_placeholder_cached(name):
return _PLACEHOLDER_CACHE[name][0]
def flattenallbut0(x):
return tf.reshape(x, [-1, intprod(x.get_shape().as_list()[1:])])
def reset():
global _PLACEHOLDER_CACHE
global VARIABLES

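Most of the tf_util edits above only remove blank lines; for orientation, a small sketch of the Theano-style helpers the module keeps (function, mean, huber_loss, single_threaded_session, initialize). The printed value is the mean of the Huber losses of the errors 0 and 2, i.e. (0 + 1.5) / 2 = 0.75:

import numpy as np
import tensorflow as tf
import baselines.common.tf_util as U

x = tf.placeholder(tf.float32, [None])
y = tf.placeholder(tf.float32, [None])
compute = U.function([x, y], U.mean(U.huber_loss(x - y)))  # Theano-style callable

with U.single_threaded_session():
    U.initialize()
    print(compute(np.array([1., 2.]), np.array([1., 4.])))  # 0.75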

@@ -2,7 +2,9 @@ import numpy as np
from multiprocessing import Process, Pipe
from baselines.common.vec_env import VecEnv
def worker(remote, env_fn_wrapper):
def worker(remote, parent_remote, env_fn_wrapper):
parent_remote.close()
env = env_fn_wrapper.x()
while True:
cmd, data = remote.recv()
@@ -14,6 +16,9 @@ def worker(remote, env_fn_wrapper):
elif cmd == 'reset':
ob = env.reset()
remote.send(ob)
elif cmd == 'reset_task':
ob = env.reset_task()
remote.send(ob)
elif cmd == 'close':
remote.close()
break
@@ -22,6 +27,7 @@ def worker(remote, env_fn_wrapper):
else:
raise NotImplementedError
class CloudpickleWrapper(object):
"""
Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle)
@@ -35,17 +41,22 @@ class CloudpickleWrapper(object):
import pickle
self.x = pickle.loads(ob)
class SubprocVecEnv(VecEnv):
def __init__(self, env_fns):
"""
envs: list of gym environments to run in subprocesses
"""
self.closed = False
nenvs = len(env_fns)
self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)])
self.ps = [Process(target=worker, args=(work_remote, CloudpickleWrapper(env_fn)))
for (work_remote, env_fn) in zip(self.work_remotes, env_fns)]
self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn)))
for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)]
for p in self.ps:
p.daemon = True # if the main process crashes, we should not cause things to hang
p.start()
for remote in self.work_remotes:
remote.close()
self.remotes[0].send(('get_spaces', None))
self.action_space, self.observation_space = self.remotes[0].recv()
@@ -63,11 +74,20 @@ class SubprocVecEnv(VecEnv):
remote.send(('reset', None))
return np.stack([remote.recv() for remote in self.remotes])
def reset_task(self):
for remote in self.remotes:
remote.send(('reset_task', None))
return np.stack([remote.recv() for remote in self.remotes])
def close(self):
if self.closed:
return
for remote in self.remotes:
remote.send(('close', None))
for p in self.ps:
p.join()
self.closed = True
@property
def num_envs(self):

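A minimal sketch of driving the updated SubprocVecEnv with the same thunk pattern the run_atari scripts use; the workers now close the parent end of their pipe and understand the extra reset_task command. Four Pong workers and the synchronous step() call are illustrative assumptions:

from baselines.common.atari_wrappers import make_atari, wrap_deepmind
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

def make_env(rank):
    def _thunk():
        env = make_atari('PongNoFrameskip-v4')
        env.seed(rank)
        return wrap_deepmind(env)
    return _thunk

env = SubprocVecEnv([make_env(i) for i in range(4)])
obs = env.reset()                                   # stacked over workers: (4, 84, 84, 1)
obs, rews, dones, infos = env.step([env.action_space.sample()] * 4)
env.close()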

@@ -19,11 +19,12 @@ from mpi4py import MPI
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
# Configure things.
rank = MPI.COMM_WORLD.Get_rank()
if rank != 0: logger.set_level(logger.DISABLED)
if rank != 0:
logger.set_level(logger.DISABLED)
# Create envs.
env = gym.make(env_id)
env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), "%i.monitor.json"%rank))
env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
gym.logger.setLevel(logging.WARN)
if evaluation and rank==0:
@@ -103,11 +104,21 @@ def parse_args():
parser.add_argument('--nb-eval-steps', type=int, default=100) # per epoch cycle and MPI worker
parser.add_argument('--nb-rollout-steps', type=int, default=100) # per epoch cycle and MPI worker
parser.add_argument('--noise-type', type=str, default='adaptive-param_0.2') # choices are adaptive-param_xx, ou_xx, normal_xx, none
parser.add_argument('--num-timesteps', type=int, default=None)
boolean_flag(parser, 'evaluation', default=False)
return vars(parser.parse_args())
args = parser.parse_args()
# this script does not consume num_timesteps directly, so if it is specified,
# make sure it agrees with the other rollout parameters
if args.num_timesteps is not None:
assert args.num_timesteps == args.nb_epochs * args.nb_epoch_cycles * args.nb_rollout_steps
dict_args = vars(args)
del dict_args['num_timesteps']
return dict_args
if __name__ == '__main__':
args = parse_args()
if MPI.COMM_WORLD.Get_rank() == 0:
logger.configure()
# Run actual script.
run(**args)
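
The new --num-timesteps flag is only a consistency check against the rollout schedule; for illustration, with assumed epoch settings (the nb-epochs and nb-epoch-cycles defaults are not visible in this hunk):

# hypothetical values: nb_epochs=500, nb_epoch_cycles=20, nb_rollout_steps=100
nb_epochs, nb_epoch_cycles, nb_rollout_steps = 500, 20, 100
num_timesteps = nb_epochs * nb_epoch_cycles * nb_rollout_steps
assert num_timesteps == 1000000   # --num-timesteps must equal this product or the assert fires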

View File

@@ -1,6 +1,3 @@
import time
import gym
import numpy as np
import tensorflow as tf
from mpi4py import MPI

View File

@@ -1,5 +1,8 @@
from baselines.deepq import models # noqa
from baselines.deepq.build_graph import build_act, build_train # noqa
from baselines.deepq.simple import learn, load # noqa
from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer # noqa
def wrap_atari_dqn(env):
from baselines.common.atari_wrappers import wrap_deepmind
return wrap_deepmind(env, frame_stack=True, scale=True)
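
A short sketch of what the DQN wrapper stack does to the observation space, assuming the wrap_deepmind defaults plus the frame_stack/scale flags used here:

from baselines import deepq
from baselines.common.atari_wrappers import make_atari

env = make_atari('PongNoFrameskip-v4')   # NoFrameskip env + noop reset + max-and-skip
env = deepq.wrap_atari_dqn(env)          # 84x84 grayscale warp, clipped rewards, 4-frame stack, floats in [0, 1]
print(env.observation_space.shape)       # expected: (84, 84, 4)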

View File

@@ -10,8 +10,8 @@ import baselines.common.tf_util as U
from baselines import deepq
from baselines.common.misc_util import (
boolean_flag,
SimpleMonitor,
)
from baselines import bench
from baselines.common.atari_wrappers_deprecated import wrap_dqn
from baselines.deepq.experiments.atari.model import model, dueling_model
@@ -30,7 +30,7 @@ def parse_args():
def make_env(game_name):
env = gym.make(game_name + "NoFrameskip-v4")
env = SimpleMonitor(env)
env = bench.Monitor(env, None)
env = wrap_dqn(env)
return env

View File

@@ -19,11 +19,9 @@ from baselines.common.misc_util import (
relatively_safe_pickle_dump,
set_global_seeds,
RunningAvg,
SimpleMonitor
)
from baselines.common.schedules import LinearSchedule, PiecewiseSchedule
# when updating this to the non-deprecated wrappers, it is important to
# copy over LazyFrames
from baselines import bench
from baselines.common.atari_wrappers_deprecated import wrap_dqn
from baselines.common.azure_utils import Container
from .model import model, dueling_model
@@ -64,7 +62,7 @@ def parse_args():
def make_env(game_name):
env = gym.make(game_name + "NoFrameskip-v4")
monitored_env = SimpleMonitor(env) # puts rewards and number of steps in info, before environment is wrapped
monitored_env = bench.Monitor(env, logger.get_dir()) # puts rewards and number of steps in info, before environment is wrapped
env = wrap_dqn(monitored_env) # applies a set of modifications that simplify the observation space (downsample, convert to grayscale)
return env, monitored_env

View File

@@ -5,15 +5,15 @@ import os
import baselines.common.tf_util as U
from baselines import deepq
from baselines.common.misc_util import get_wrapper_by_name, SimpleMonitor, boolean_flag, set_global_seeds
from baselines import deepq, bench
from baselines.common.misc_util import get_wrapper_by_name, boolean_flag, set_global_seeds
from baselines.common.atari_wrappers_deprecated import wrap_dqn
from baselines.deepq.experiments.atari.model import model, dueling_model
def make_env(game_name):
env = gym.make(game_name + "NoFrameskip-v4")
env_monitored = SimpleMonitor(env)
env_monitored = bench.Monitor(env, None)
env = wrap_dqn(env_monitored)
return env_monitored, env
@@ -47,14 +47,14 @@ def wang2015_eval(game_name, act, stochastic):
eval_episode_steps += 1
action = act(np.array(obs)[None], stochastic=stochastic)[0]
obs, reward, done, info = eval_env.step(action)
obs, _reward, done, info = eval_env.step(action)
if done:
obs = eval_env.reset()
if len(info["rewards"]) > 0:
episode_rewards.append(info["rewards"][0])
break
if info["steps"] > 108000: # 5 minutes of gameplay
episode_rewards.append(env_monitored._current_reward)
episode_rewards.append(sum(env_monitored.rewards))
break
print("Num steps in episode {} was {} yielding {} reward".format(
num_noops, eval_episode_steps, episode_rewards[-1]), flush=True)
@@ -66,7 +66,7 @@ def wang2015_eval(game_name, act, stochastic):
def main():
set_global_seeds(1)
args = parse_args()
with U.make_session(4) as sess: # noqa
with U.make_session(4): # noqa
_, env = make_env(args.env)
act = deepq.build_act(
make_obs_ph=lambda name: U.Uint8Input(env.observation_space.shape, name=name),

View File

@@ -1,12 +1,10 @@
import gym
from baselines import deepq
from baselines.common.atari_wrappers_deprecated import wrap_dqn, ScaledFloatFrame
def main():
env = gym.make("PongNoFrameskip-v4")
env = ScaledFloatFrame(wrap_dqn(env))
env = deepq.wrap_atari_dqn(env)
act = deepq.load("pong_model.pkl")
while True:

View File

@@ -0,0 +1,47 @@
import gym
from baselines import deepq
from baselines.common import set_global_seeds
from baselines import bench
import argparse
from baselines import logger
from baselines.common.atari_wrappers import make_atari
def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
parser.add_argument('--prioritized', type=int, default=1)
parser.add_argument('--dueling', type=int, default=1)
parser.add_argument('--num-timesteps', type=int, default=int(10e6))
args = parser.parse_args()
logger.configure()
set_global_seeds(args.seed)
env = make_atari(args.env)
env = bench.Monitor(env, logger.get_dir())
env = deepq.wrap_atari_dqn(env)
model = deepq.models.cnn_to_mlp(
convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
hiddens=[256],
dueling=bool(args.dueling),
)
act = deepq.learn(
env,
q_func=model,
lr=1e-4,
max_timesteps=args.num_timesteps,
buffer_size=10000,
exploration_fraction=0.1,
exploration_final_eps=0.01,
train_freq=4,
learning_starts=10000,
target_network_update_freq=1000,
gamma=0.99,
prioritized_replay=bool(args.prioritized)
)
# act.save("pong_model.pkl") XXX
env.close()
if __name__ == '__main__':
main()

View File

@@ -1,34 +0,0 @@
import gym
from baselines import deepq
from baselines.common.atari_wrappers_deprecated import wrap_dqn, ScaledFloatFrame
def main():
env = gym.make("PongNoFrameskip-v4")
env = ScaledFloatFrame(wrap_dqn(env))
model = deepq.models.cnn_to_mlp(
convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
hiddens=[256],
dueling=True
)
act = deepq.learn(
env,
q_func=model,
lr=1e-4,
max_timesteps=2000000,
buffer_size=10000,
exploration_fraction=0.1,
exploration_final_eps=0.01,
train_freq=4,
learning_starts=10000,
target_network_update_freq=1000,
gamma=0.99,
prioritized_replay=True
)
act.save("pong_model.pkl")
env.close()
if __name__ == '__main__':
main()

View File

@@ -1,12 +1,13 @@
import numpy as np
import os
import dill
import tempfile
import tensorflow as tf
import zipfile
import cloudpickle
import numpy as np
import gym
import baselines.common.tf_util as U
from baselines import logger
from baselines.common.schedules import LinearSchedule
from baselines import deepq
@@ -19,11 +20,11 @@ class ActWrapper(object):
self._act_params = act_params
@staticmethod
def load(path, num_cpu=16):
def load(path):
with open(path, "rb") as f:
model_data, act_params = dill.load(f)
model_data, act_params = cloudpickle.load(f)
act = deepq.build_act(**act_params)
sess = U.make_session(num_cpu=num_cpu)
sess = tf.Session()
sess.__enter__()
with tempfile.TemporaryDirectory() as td:
arc_path = os.path.join(td, "packed.zip")
@@ -38,8 +39,11 @@ class ActWrapper(object):
def __call__(self, *args, **kwargs):
return self._act(*args, **kwargs)
def save(self, path):
def save(self, path=None):
"""Save model to a pickle located at `path`"""
if path is None:
path = os.path.join(logger.get_dir(), "model.pkl")
with tempfile.TemporaryDirectory() as td:
U.save_state(os.path.join(td, "model"))
arc_name = os.path.join(td, "packed.zip")
@@ -52,18 +56,16 @@ class ActWrapper(object):
with open(arc_name, "rb") as f:
model_data = f.read()
with open(path, "wb") as f:
dill.dump((model_data, self._act_params), f)
cloudpickle.dump((model_data, self._act_params), f)
def load(path, num_cpu=16):
def load(path):
"""Load act function that was returned by learn function.
Parameters
----------
path: str
path to the act function pickle
num_cpu: int
number of cpus to use for executing the policy
Returns
-------
@@ -71,7 +73,7 @@ def load(path, num_cpu=16):
function that takes a batch of observations
and returns actions.
"""
return ActWrapper.load(path, num_cpu=num_cpu)
return ActWrapper.load(path)
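
With num_cpu gone, loading a saved act function now just opens a plain tf.Session; a minimal, hedged round-trip sketch (file name illustrative, mirroring the enjoy script above):

import numpy as np
from baselines import deepq
from baselines.common.atari_wrappers import make_atari

env = deepq.wrap_atari_dqn(make_atari("PongNoFrameskip-v4"))
act = deepq.load("pong_model.pkl")        # unpickles (weights, act_params) via cloudpickle
obs, done = env.reset(), False
while not done:
    obs, _rew, done, _ = env.step(act(np.array(obs)[None])[0])
# the ActWrapper returned by learn() can also be saved without a path,
# in which case it defaults to os.path.join(logger.get_dir(), "model.pkl")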
def learn(env,
@@ -83,7 +85,7 @@ def learn(env,
exploration_final_eps=0.02,
train_freq=1,
batch_size=32,
print_freq=1,
print_freq=100,
checkpoint_freq=10000,
learning_starts=1000,
gamma=1.0,
@@ -93,7 +95,6 @@ def learn(env,
prioritized_replay_beta0=0.4,
prioritized_replay_beta_iters=None,
prioritized_replay_eps=1e-6,
num_cpu=16,
param_noise=False,
callback=None):
"""Train a deepq model.
@@ -151,8 +152,6 @@ def learn(env,
to 1.0. If set to None, it defaults to max_timesteps.
prioritized_replay_eps: float
epsilon to add to the TD errors when updating priorities.
num_cpu: int
number of cpus to use for training
callback: (locals, globals) -> None
function called at every step with the state of the algorithm.
If the callback returns true, training stops.
@@ -165,11 +164,14 @@ def learn(env,
"""
# Create all the functions necessary to train the model
sess = U.make_session(num_cpu=num_cpu)
sess = tf.Session()
sess.__enter__()
# capture the shape outside the closure so that the env object is not serialized
# by cloudpickle when serializing make_obs_ph
observation_space_shape = env.observation_space.shape
def make_obs_ph(name):
return U.BatchInput(env.observation_space.shape, name=name)
return U.BatchInput(observation_space_shape, name=name)
act, train, update_target, debug = deepq.build_train(
make_obs_ph=make_obs_ph,
@@ -180,12 +182,15 @@ def learn(env,
grad_norm_clipping=10,
param_noise=param_noise
)
act_params = {
'make_obs_ph': make_obs_ph,
'q_func': q_func,
'num_actions': env.action_space.n,
}
act = ActWrapper(act, act_params)
# Create the replay buffer
if prioritized_replay:
replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
@@ -233,8 +238,13 @@ def learn(env,
kwargs['update_param_noise_threshold'] = update_param_noise_threshold
kwargs['update_param_noise_scale'] = True
action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
if isinstance(env.action_space, gym.spaces.MultiBinary):
env_action = np.zeros(env.action_space.n)
env_action[action] = 1
else:
env_action = action
reset = False
new_obs, rew, done, _ = env.step(action)
new_obs, rew, done, _ = env.step(env_action)
# Store transition in the replay buffer.
replay_buffer.add(obs, action, rew, new_obs, float(done))
obs = new_obs
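
The new branch converts the discrete action index produced by act into a one-hot vector when the env expects a MultiBinary action; a tiny illustration with assumed numbers:

import numpy as np
import gym

action_space = gym.spaces.MultiBinary(6)
action = 2                                # index returned by act(...)
env_action = np.zeros(action_space.n)
env_action[action] = 1
print(env_action)                         # [0. 0. 1. 0. 0. 0.]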
@@ -285,4 +295,4 @@ def learn(env,
logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
U.load_state(model_file)
return ActWrapper(act, act_params)
return act

View File

@@ -6,8 +6,10 @@ import json
import time
import datetime
import tempfile
from mpi4py import MPI
LOG_OUTPUT_FORMATS = ['stdout', 'log', 'json']
LOG_OUTPUT_FORMATS = ['stdout', 'log', 'csv']
# Also valid: json, tensorboard
DEBUG = 10
INFO = 20
@@ -16,26 +18,23 @@ ERROR = 40
DISABLED = 50
class OutputFormat(object):
class KVWriter(object):
def writekvs(self, kvs):
"""
Write key-value pairs
"""
raise NotImplementedError
def writeseq(self, args):
"""
Write a sequence of other data (e.g. a logging message)
"""
pass
class SeqWriter(object):
def writeseq(self, seq):
raise NotImplementedError
def close(self):
return
class HumanOutputFormat(OutputFormat):
def __init__(self, file):
self.file = file
class HumanOutputFormat(KVWriter, SeqWriter):
def __init__(self, filename_or_file):
if isinstance(filename_or_file, str):
self.file = open(filename_or_file, 'at')
self.own_file = True
else:
assert hasattr(filename_or_file, 'read'), 'expected file or str, got %s'%filename_or_file
self.file = filename_or_file
self.own_file = False
def writekvs(self, kvs):
# Create strings for printing
@@ -48,6 +47,10 @@ class HumanOutputFormat(OutputFormat):
key2str[self._truncate(key)] = self._truncate(valstr)
# Find max widths
if len(key2str) == 0:
print('WARNING: tried to write empty key-value dict')
return
else:
keywidth = max(map(len, key2str.keys()))
valwidth = max(map(len, key2str.values()))
@@ -70,15 +73,19 @@ class HumanOutputFormat(OutputFormat):
def _truncate(self, s):
return s[:20] + '...' if len(s) > 23 else s
def writeseq(self, args):
for arg in args:
def writeseq(self, seq):
for arg in seq:
self.file.write(arg)
self.file.write('\n')
self.file.flush()
class JSONOutputFormat(OutputFormat):
def __init__(self, file):
self.file = file
def close(self):
if self.own_file:
self.file.close()
class JSONOutputFormat(KVWriter):
def __init__(self, filename):
self.file = open(filename, 'at')
def writekvs(self, kvs):
for k, v in sorted(kvs.items()):
@@ -88,7 +95,46 @@ class JSONOutputFormat(OutputFormat):
self.file.write(json.dumps(kvs) + '\n')
self.file.flush()
class TensorBoardOutputFormat(OutputFormat):
def close(self):
self.file.close()
class CSVOutputFormat(KVWriter):
def __init__(self, filename):
self.file = open(filename, 'w+t')  # read/write (not append) so the header can be rewritten in place when new keys appear
self.keys = []
self.sep = ','
def writekvs(self, kvs):
# Add our current row to the history
extra_keys = kvs.keys() - self.keys
if extra_keys:
self.keys.extend(extra_keys)
self.file.seek(0)
lines = self.file.readlines()
self.file.seek(0)
for (i, k) in enumerate(self.keys):
if i > 0:
self.file.write(',')
self.file.write(k)
self.file.write('\n')
for line in lines[1:]:
self.file.write(line[:-1])
self.file.write(self.sep * len(extra_keys))
self.file.write('\n')
for (i, k) in enumerate(self.keys):
if i > 0:
self.file.write(',')
v = kvs.get(k)
if v is not None:  # write zeros too; only skip keys missing from this row
self.file.write(str(v))
self.file.write('\n')
self.file.flush()
def close(self):
self.file.close()
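
The seek-and-rewrite logic exists so the header can grow when a key first appears mid-run (earlier rows are padded with empty cells); a hedged sketch of the resulting file for a toy sequence of writes, assuming the rewrite behaves as intended (normally logger.configure creates this writer rather than user code):

from baselines.logger import CSVOutputFormat

w = CSVOutputFormat('/tmp/progress.csv')
w.writekvs({'eprewmean': 1.5})
w.writekvs({'eprewmean': 2.0, 'loss': 0.3})   # 'loss' is new: header rewritten, first row padded
w.close()
# expected contents of /tmp/progress.csv:
# eprewmean,loss
# 1.5,
# 2.0,0.3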
class TensorBoardOutputFormat(KVWriter):
"""
Dumps key/value pairs into TensorBoard's numeric format.
"""
@@ -123,18 +169,22 @@ class TensorBoardOutputFormat(OutputFormat):
self.writer.Close()
self.writer = None
def make_output_format(format, ev_dir):
os.makedirs(ev_dir, exist_ok=True)
rank = MPI.COMM_WORLD.Get_rank()
if format == 'stdout':
return HumanOutputFormat(sys.stdout)
elif format == 'log':
log_file = open(osp.join(ev_dir, 'log.txt'), 'wt')
return HumanOutputFormat(log_file)
suffix = "" if rank==0 else ("-mpi%03i"%rank)
return HumanOutputFormat(osp.join(ev_dir, 'log%s.txt' % suffix))
elif format == 'json':
json_file = open(osp.join(ev_dir, 'progress.json'), 'wt')
return JSONOutputFormat(json_file)
assert rank==0
return JSONOutputFormat(osp.join(ev_dir, 'progress.json'))
elif format == 'csv':
assert rank==0
return CSVOutputFormat(osp.join(ev_dir, 'progress.csv'))
elif format == 'tensorboard':
assert rank==0
return TensorBoardOutputFormat(osp.join(ev_dir, 'tb'))
else:
raise ValueError('Unknown format specified: %s' % (format,))
@@ -176,19 +226,15 @@ def log(*args, level=INFO):
"""
Logger.CURRENT.log(*args, level=level)
def debug(*args):
log(*args, level=DEBUG)
def info(*args):
log(*args, level=INFO)
def warn(*args):
log(*args, level=WARN)
def error(*args):
log(*args, level=ERROR)
@@ -232,6 +278,7 @@ class Logger(object):
def dumpkvs(self):
if self.level == DISABLED: return
for fmt in self.output_formats:
if isinstance(fmt, KVWriter):
fmt.writekvs(self.name2val)
self.name2val.clear()
@@ -255,35 +302,46 @@ class Logger(object):
# ----------------------------------------
def _do_log(self, args):
for fmt in self.output_formats:
fmt.writeseq(args)
if isinstance(fmt, SeqWriter):
fmt.writeseq(map(str, args))
Logger.DEFAULT = Logger.CURRENT = Logger(dir=None, output_formats=[HumanOutputFormat(sys.stdout)])
def configure(dir=None, format_strs=None):
assert Logger.CURRENT is Logger.DEFAULT,\
"Only call logger.configure() when it's in the default state. Try calling logger.reset() first."
prevlogger = Logger.CURRENT
if dir is None:
dir = os.getenv('OPENAI_LOGDIR')
if dir is None:
dir = osp.join(tempfile.gettempdir(),
datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"))
assert isinstance(dir, str)
os.makedirs(dir, exist_ok=True)
if format_strs is None:
format_strs = LOG_OUTPUT_FORMATS
strs = os.getenv('OPENAI_LOG_FORMAT')
format_strs = strs.split(',') if strs else LOG_OUTPUT_FORMATS
output_formats = [make_output_format(f, dir) for f in format_strs]
Logger.CURRENT = Logger(dir=dir, output_formats=output_formats)
log('Logging to %s'%dir)
if os.getenv('OPENAI_LOGDIR'):
# if OPENAI_LOGDIR is set, configure the logger on import
# this is kind of nasty (unexpected to the user), but I don't know how else to inject the logger
# to a script that's getting run in a subprocess
configure(dir=os.getenv('OPENAI_LOGDIR'))
def reset():
if Logger.CURRENT is not Logger.DEFAULT:
Logger.CURRENT.close()
Logger.CURRENT = Logger.DEFAULT
log('Reset logger')
class scoped_configure(object):
def __init__(self, dir=None, format_strs=None):
self.dir = dir
self.format_strs = format_strs
self.prevlogger = None
def __enter__(self):
self.prevlogger = Logger.CURRENT
configure(dir=self.dir, format_strs=self.format_strs)
def __exit__(self, *args):
Logger.CURRENT.close()
Logger.CURRENT = self.prevlogger
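
A minimal sketch of the new configuration surface (paths and values illustrative); OPENAI_LOGDIR and OPENAI_LOG_FORMAT are the two environment-variable hooks added here:

import os
from baselines import logger

os.environ['OPENAI_LOG_FORMAT'] = 'stdout,csv'   # read by configure() when format_strs is None
logger.configure(dir='/tmp/experiment_1')        # or export OPENAI_LOGDIR before importing
logger.logkv('eprewmean', 21.5)
logger.logkv('timesteps', 10000)
logger.dumpkvs()                                 # stdout table + /tmp/experiment_1/progress.csv
logger.reset()

# temporary redirection, e.g. in a test
with logger.scoped_configure(dir='/tmp/scratch', format_strs=['log']):
    logger.log('this line only goes to /tmp/scratch/log.txt')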
# ================================================================
def _demo():
@@ -294,7 +352,7 @@ def _demo():
dir = "/tmp/testlogging"
if os.path.exists(dir):
shutil.rmtree(dir)
with session(dir=dir):
configure(dir=dir)
logkv("a", 3)
logkv("b", 2.5)
dumpkvs()
@@ -310,5 +368,55 @@ def _demo():
dumpkvs()
# ================================================================
# Readers
# ================================================================
def read_json(fname):
import pandas
ds = []
with open(fname, 'rt') as fh:
for line in fh:
ds.append(json.loads(line))
return pandas.DataFrame(ds)
def read_csv(fname):
import pandas
return pandas.read_csv(fname, index_col=None, comment='#')
def read_tb(path):
"""
path : a tensorboard file OR a directory, where we will find all TB files
of the form events.*
"""
import pandas
import numpy as np
from glob import glob
from collections import defaultdict
import tensorflow as tf
if osp.isdir(path):
fnames = glob(osp.join(path, "events.*"))
elif osp.basename(path).startswith("events."):
fnames = [path]
else:
raise NotImplementedError("Expected tensorboard file or directory containing them. Got %s"%path)
tag2pairs = defaultdict(list)
maxstep = 0
for fname in fnames:
for summary in tf.train.summary_iterator(fname):
if summary.step > 0:
for v in summary.summary.value:
pair = (summary.step, v.simple_value)
tag2pairs[v.tag].append(pair)
maxstep = max(summary.step, maxstep)
data = np.empty((maxstep, len(tag2pairs)))
data[:] = np.nan
tags = sorted(tag2pairs.keys())
for (colidx,tag) in enumerate(tags):
pairs = tag2pairs[tag]
for (step, value) in pairs:
data[step-1, colidx] = value
return pandas.DataFrame(data, columns=tags)
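
These helpers load logged progress back into pandas for analysis; a short usage sketch (paths illustrative, pandas assumed to be installed):

from baselines import logger

df = logger.read_csv('/tmp/experiment_1/progress.csv')   # one column per logged key
print(df['eprewmean'].iloc[-1])

tb = logger.read_tb('/tmp/experiment_1/tb')              # rows indexed by TensorBoard step, one column per tag
print(sorted(tb.columns))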
if __name__ == "__main__":
_demo()

View File

@@ -1,4 +1,3 @@
from baselines.common.mpi_running_mean_std import RunningMeanStd
import baselines.common.tf_util as U
import tensorflow as tf
import gym

View File

@@ -78,7 +78,7 @@ def add_vtarg_and_adv(seg, gamma, lam):
seg["tdlamret"] = seg["adv"] + seg["vpred"]
def learn(env, policy_func, *,
timesteps_per_batch, # timesteps per actor per update
timesteps_per_actorbatch, # timesteps per actor per update
clip_param, entcoeff, # clipping parameter epsilon, entropy coeff
optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers
gamma, lam, # advantage estimation
@@ -130,7 +130,7 @@ def learn(env, policy_func, *,
# Prepare for rollouts
# ----------------------------------------
seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True)
seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True)
episodes_so_far = 0
timesteps_so_far = 0

View File

@@ -6,37 +6,34 @@ from baselines import bench
import os.path as osp
import gym, logging
from baselines import logger
from baselines.common.atari_wrappers import make_atari, wrap_deepmind
def wrap_train(env):
from baselines.common.atari_wrappers import (wrap_deepmind, FrameStack)
env = wrap_deepmind(env, clip_rewards=True)
env = FrameStack(env, 4)
return env
def train(env_id, num_frames, seed):
def train(env_id, num_timesteps, seed):
from baselines.ppo1 import pposgd_simple, cnn_policy
import baselines.common.tf_util as U
rank = MPI.COMM_WORLD.Get_rank()
sess = U.single_threaded_session()
sess.__enter__()
if rank != 0: logger.set_level(logger.DISABLED)
if rank == 0:
logger.configure()
else:
logger.configure(format_strs=[])
workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
set_global_seeds(workerseed)
env = gym.make(env_id)
env = make_atari(env_id)
def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613
return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)
env = bench.Monitor(env, logger.get_dir() and
osp.join(logger.get_dir(), "%i.monitor.json" % rank))
osp.join(logger.get_dir(), str(rank)))
env.seed(workerseed)
gym.logger.setLevel(logging.WARN)
env = wrap_train(env)
num_timesteps = int(num_frames / 4 * 1.1)
env = wrap_deepmind(env)
env.seed(workerseed)
pposgd_simple.learn(env, policy_fn,
max_timesteps=num_timesteps,
timesteps_per_batch=256,
max_timesteps=int(num_timesteps * 1.1),
timesteps_per_actorbatch=256,
clip_param=0.2, entcoeff=0.01,
optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
gamma=0.99, lam=0.95,
@@ -49,8 +46,9 @@ def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--env', help='environment ID', default='PongNoFrameskip-v4')
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
parser.add_argument('--num-timesteps', type=int, default=int(10e6))
args = parser.parse_args()
train(args.env, num_frames=40e6, seed=args.seed)
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
if __name__ == '__main__':
main()
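
Unlike the old wrap_train (which stacked 4 frames explicitly), wrap_deepmind with its defaults leaves the observation as a single warped frame; a quick hedged check of the resulting spaces:

from baselines.common.atari_wrappers import make_atari, wrap_deepmind

env = wrap_deepmind(make_atari('PongNoFrameskip-v4'))
print(env.observation_space.shape)   # expected: (84, 84, 1) uint8 frames
print(env.action_space)              # Discrete(6) for Pong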

View File

@@ -1,10 +1,8 @@
#!/usr/bin/env python
from baselines.common import set_global_seeds, tf_util as U
from baselines import bench
import os.path as osp
import gym, logging
from baselines import logger
import sys
def train(env_id, num_timesteps, seed):
from baselines.ppo1 import mlp_policy, pposgd_simple
@@ -14,13 +12,12 @@ def train(env_id, num_timesteps, seed):
def policy_fn(name, ob_space, ac_space):
return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
hid_size=64, num_hid_layers=2)
env = bench.Monitor(env, logger.get_dir() and
osp.join(logger.get_dir(), "monitor.json"))
env = bench.Monitor(env, logger.get_dir())
env.seed(seed)
gym.logger.setLevel(logging.WARN)
pposgd_simple.learn(env, policy_fn,
max_timesteps=num_timesteps,
timesteps_per_batch=2048,
timesteps_per_actorbatch=2048,
clip_param=0.2, entcoeff=0.0,
optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
gamma=0.99, lam=0.95, schedule='linear',
@@ -32,8 +29,10 @@ def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--env', help='environment ID', default='Hopper-v1')
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
parser.add_argument('--num-timesteps', type=int, default=int(1e6))
args = parser.parse_args()
train(args.env, num_timesteps=1e6, seed=args.seed)
logger.configure()
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
if __name__ == '__main__':

View File

@@ -1,4 +1,3 @@
from baselines.common.mpi_running_mean_std import RunningMeanStd
import baselines.common.tf_util as U
import tensorflow as tf
import gym
@@ -42,7 +41,7 @@ class CnnPolicy(object):
self.state_out = []
stochastic = tf.placeholder(dtype=tf.bool, shape=())
ac = self.pd.sample() # XXX
ac = self.pd.sample()
self._act = U.function([stochastic, ob], [ac, self.vpred])
def act(self, stochastic, ob):

View File

@@ -1,45 +1,38 @@
#!/usr/bin/env python
from mpi4py import MPI
from baselines.common import set_global_seeds
import os.path as osp
import gym, logging
from baselines import logger
from baselines import bench
import sys
from baselines.common.atari_wrappers import make_atari, wrap_deepmind
def wrap_train(env):
from baselines.common.atari_wrappers import (wrap_deepmind, FrameStack)
env = wrap_deepmind(env, clip_rewards=False)
env = FrameStack(env, 3)
return env
def train(env_id, num_frames, seed):
def train(env_id, num_timesteps, seed):
from baselines.trpo_mpi.nosharing_cnn_policy import CnnPolicy
from baselines.trpo_mpi import trpo_mpi
import baselines.common.tf_util as U
rank = MPI.COMM_WORLD.Get_rank()
sess = U.single_threaded_session()
sess.__enter__()
if rank != 0:
logger.set_level(logger.DISABLED)
if rank == 0:
logger.configure()
else:
logger.configure(format_strs=[])
workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
set_global_seeds(workerseed)
env = gym.make(env_id)
env = make_atari(env_id)
def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613
return CnnPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space)
env = bench.Monitor(env, logger.get_dir() and
osp.join(logger.get_dir(), "%i.monitor.json"%rank))
env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
env.seed(workerseed)
gym.logger.setLevel(logging.WARN)
env = wrap_train(env)
num_timesteps = int(num_frames / 4 * 1.1)
env = wrap_deepmind(env)
env.seed(workerseed)
trpo_mpi.learn(env, policy_fn, timesteps_per_batch=512, max_kl=0.001, cg_iters=10, cg_damping=1e-3,
max_timesteps=num_timesteps, gamma=0.98, lam=1.0, vf_iters=3, vf_stepsize=1e-4, entcoeff=0.00)
max_timesteps=int(num_timesteps * 1.1), gamma=0.98, lam=1.0, vf_iters=3, vf_stepsize=1e-4, entcoeff=0.00)
env.close()
def main():
@@ -47,8 +40,9 @@ def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--env', help='environment ID', default='PongNoFrameskip-v4')
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
parser.add_argument('--num-timesteps', type=int, default=int(10e6))
args = parser.parse_args()
train(args.env, num_frames=40e6, seed=args.seed)
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
if __name__ == "__main__":

View File

@@ -28,7 +28,7 @@ def train(env_id, num_timesteps, seed):
return MlpPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space,
hid_size=32, num_hid_layers=2)
env = bench.Monitor(env, logger.get_dir() and
osp.join(logger.get_dir(), "%i.monitor.json" % rank))
osp.join(logger.get_dir(), str(rank)))
env.seed(workerseed)
gym.logger.setLevel(logging.WARN)
@@ -41,8 +41,10 @@ def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--env', help='environment ID', default='Hopper-v1')
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
parser.add_argument('--num-timesteps', type=int, default=int(1e6))
args = parser.parse_args()
train(args.env, num_timesteps=1e6, seed=args.seed)
logger.configure()
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
if __name__ == '__main__':