change atari preprocessing to use faster opencv
some logger changes
.gitignore

@@ -30,3 +30,6 @@ src
 
 *.egg-info
 .cache
+
+MUJOCO_LOG.TXT
+
README.md

@@ -20,3 +20,14 @@ pip install -e .
 - [DQN](baselines/deepq)
 - [PPO](baselines/ppo1)
 - [TRPO](baselines/trpo_mpi)
+
+To cite this repository in publications:
+
+    @misc{baselines,
+      author = {Hesse, Christopher and Plappert, Matthias and Radford, Alec and Schulman, John and Sidor, Szymon and Wu, Yuhuai},
+      title = {OpenAI Baselines},
+      year = {2017},
+      publisher = {GitHub},
+      journal = {GitHub repository},
+      howpublished = {\url{https://github.com/openai/baselines}},
+    }
@@ -1,9 +1,6 @@
 import numpy as np
 import tensorflow as tf
-from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm, sample, check_shape
-from baselines.common.distributions import make_pdtype
-import baselines.common.tf_util as U
-import gym
+from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm, sample
 
 class LnLstmPolicy(object):
     def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=256, reuse=False):
@@ -5,18 +5,15 @@ from baselines.common import set_global_seeds
 from baselines import bench
 from baselines.a2c.a2c import learn
 from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
-from baselines.common.atari_wrappers import wrap_deepmind
+from baselines.common.atari_wrappers import make_atari, wrap_deepmind
 from baselines.a2c.policies import CnnPolicy, LstmPolicy, LnLstmPolicy
 
-def train(env_id, num_frames, seed, policy, lrschedule, num_cpu):
-    num_timesteps = int(num_frames / 4 * 1.1)
-    # divide by 4 due to frameskip, then do a little extras so episodes end
+def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu):
     def make_env(rank):
         def _thunk():
-            env = gym.make(env_id)
+            env = make_atari(env_id)
             env.seed(seed + rank)
-            env = bench.Monitor(env, logger.get_dir() and
-                os.path.join(logger.get_dir(), "{}.monitor.json".format(rank)))
+            env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
             gym.logger.setLevel(logging.WARN)
             return wrap_deepmind(env)
         return _thunk
@@ -28,7 +25,7 @@ def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu):
         policy_fn = LstmPolicy
     elif policy == 'lnlstm':
         policy_fn = LnLstmPolicy
-    learn(policy_fn, env, seed, total_timesteps=num_timesteps, lrschedule=lrschedule)
+    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule)
     env.close()
 
 def main():
@@ -38,10 +35,10 @@ def main():
     parser.add_argument('--seed', help='RNG seed', type=int, default=0)
     parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm'], default='cnn')
     parser.add_argument('--lrschedule', help='Learning rate schedule', choices=['constant', 'linear'], default='constant')
-    parser.add_argument('--million_frames', help='How many frames to train (/ 1e6). '
-        'This number gets divided by 4 due to frameskip', type=int, default=40)
+    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
     args = parser.parse_args()
-    train(args.env, num_frames=1e6 * args.million_frames, seed=args.seed,
+    logger.configure()
+    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed,
         policy=args.policy, lrschedule=args.lrschedule, num_cpu=16)
 
 if __name__ == '__main__':
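Taken together, the hunks above change the worker-env recipe for A2C: the frame-count CLI becomes a timestep budget, and env construction goes through the new make_atari helper. A minimal sketch of the new recipe, assuming this commit's atari_wrappers API (make_single_env is an illustrative name; the SubprocVecEnv fan-out and learn() call are unchanged from the diff):

    import os
    import logging
    import gym
    from baselines import bench, logger
    from baselines.common.atari_wrappers import make_atari, wrap_deepmind

    def make_single_env(env_id='BreakoutNoFrameskip-v4', rank=0, seed=0):
        env = make_atari(env_id)   # gym.make + NoopResetEnv + MaxAndSkipEnv
        env.seed(seed + rank)
        # Monitor now takes a path prefix and appends ".monitor.csv" itself
        env = bench.Monitor(env, logger.get_dir() and
                            os.path.join(logger.get_dir(), str(rank)))
        gym.logger.setLevel(logging.WARN)
        return wrap_deepmind(env)  # episodic life, fire reset, warp, reward clip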
@@ -110,7 +110,6 @@ def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps,
         ob_no = np.concatenate([path["observation"] for path in paths])
         action_na = np.concatenate([path["action"] for path in paths])
         oldac_dist = np.concatenate([path["action_dist"] for path in paths])
-        logp_n = np.concatenate([path["logp"] for path in paths])
         adv_n = np.concatenate(advs)
         standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)
 
@@ -126,7 +125,7 @@ def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps,
             U.eval(tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)))
         elif kl < desired_kl / 2:
             logger.log("kl too low")
             U.eval(tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)))
         else:
             logger.log("kl just right!")
 
@@ -138,3 +137,6 @@ def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps,
             callback()
         logger.dump_tabular()
         i += 1
+
+    coord.request_stop()
+    coord.join(enqueue_threads)
@@ -113,7 +113,6 @@ class Runner(object):
         nenv = env.num_envs
         self.batch_ob_shape = (nenv*nsteps, nh, nw, nc*nstack)
         self.obs = np.zeros((nenv, nh, nw, nc*nstack), dtype=np.uint8)
-        self.nc = nc
         obs = env.reset()
         self.update_obs(obs)
         self.gamma = gamma
@@ -122,8 +121,8 @@ class Runner(object):
         self.dones = [False for _ in range(nenv)]
 
     def update_obs(self, obs):
-        self.obs = np.roll(self.obs, shift=-self.nc, axis=3)
-        self.obs[:, :, :, -self.nc:] = obs
+        self.obs = np.roll(self.obs, shift=-1, axis=3)
+        self.obs[:, :, :, -1] = obs[:, :, :, 0]
 
     def run(self):
         mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
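With WarpFrame now always emitting single-channel frames, update_obs can hard-code a shift of one channel per step. A self-contained numpy illustration of the new behavior (shapes are the Atari defaults; illustrative only):

    import numpy as np

    nenv, nh, nw, nstack = 2, 84, 84, 4
    stack = np.zeros((nenv, nh, nw, nstack), dtype=np.uint8)

    def update_obs(stack, obs):
        # obs: (nenv, nh, nw, 1) uint8 frames, as produced by WarpFrame
        stack = np.roll(stack, shift=-1, axis=3)  # drop the oldest frame
        stack[:, :, :, -1] = obs[:, :, :, 0]      # write the newest frame
        return stack

    frame = np.random.randint(0, 256, (nenv, nh, nw, 1), dtype=np.uint8)
    stack = update_obs(stack, frame)
    assert (stack[:, :, :, -1] == frame[:, :, :, 0]).all()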
@@ -189,7 +188,8 @@ def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval
     runner = Runner(env, model, nsteps=nsteps, nstack=nstack, gamma=gamma)
     nbatch = nenvs*nsteps
     tstart = time.time()
-    enqueue_threads = model.q_runner.create_threads(model.sess, coord=tf.train.Coordinator(), start=True)
+    coord = tf.train.Coordinator()
+    enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True)
     for update in range(1, total_timesteps//nbatch+1):
         obs, states, rewards, masks, actions, values = runner.run()
         policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
@@ -211,5 +211,6 @@ def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval
             savepath = osp.join(logger.get_dir(), 'checkpoint%.5i'%update)
             print('Saving to', savepath)
             model.save(savepath)
-
+    coord.request_stop()
+    coord.join(enqueue_threads)
     env.close()
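Keeping a named handle on the coordinator is what lets the new request_stop()/join() calls shut the KFAC enqueue threads down cleanly once the loop exits, instead of leaking them. A standalone TF 1.x sketch of that lifecycle (a toy queue stands in for model.q_runner; not the baselines model itself):

    import tensorflow as tf  # TF 1.x API, as used by baselines at the time

    queue = tf.FIFOQueue(capacity=10, dtypes=[tf.float32])
    enqueue_op = queue.enqueue(tf.random_normal([]))
    qr = tf.train.QueueRunner(queue, [enqueue_op] * 2)

    with tf.Session() as sess:
        coord = tf.train.Coordinator()
        threads = qr.create_threads(sess, coord=coord, start=True)
        for _ in range(100):          # stand-in for the training loop
            sess.run(queue.dequeue())
        coord.request_stop()          # signal the enqueue threads to exit
        coord.join(threads)           # wait for them before the session closes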
@@ -1,9 +1,7 @@
 import numpy as np
 import tensorflow as tf
 from baselines.acktr.utils import conv, fc, dense, conv_to_fc, sample, kl_div
-from baselines.common.distributions import make_pdtype
 import baselines.common.tf_util as U
-import gym
 
 class CnnPolicy(object):
 
@@ -51,7 +49,6 @@ class GaussianMlpPolicy(object):
         oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac") # batch of actions previous actions
         oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim*2], name="oldac_dist") # batch of actions previous action distributions
         adv_n = tf.placeholder(tf.float32, shape=[None], name="adv") # advantage function estimate
-        oldlogprob_n = tf.placeholder(tf.float32, shape=[None], name='oldlogprob') # log probability of previous actions
         wd_dict = {}
         h1 = tf.nn.tanh(dense(ob_no, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
         h2 = tf.nn.tanh(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
@@ -5,24 +5,22 @@ from baselines.common import set_global_seeds
 from baselines import bench
 from baselines.acktr.acktr_disc import learn
 from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
-from baselines.common.atari_wrappers import wrap_deepmind
+from baselines.common.atari_wrappers import make_atari, wrap_deepmind
 from baselines.acktr.policies import CnnPolicy
 
-def train(env_id, num_frames, seed, num_cpu):
-    num_timesteps = int(num_frames / 4 * 1.1)
+def train(env_id, num_timesteps, seed, num_cpu):
    def make_env(rank):
        def _thunk():
-            env = gym.make(env_id)
+            env = make_atari(env_id)
            env.seed(seed + rank)
-            if logger.get_dir():
-                env = bench.Monitor(env, os.path.join(logger.get_dir(), "{}.monitor.json".format(rank)))
+            env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return _thunk
    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    policy_fn = CnnPolicy
-    learn(policy_fn, env, seed, total_timesteps=num_timesteps, nprocs=num_cpu)
+    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu)
    env.close()

def main():
@@ -30,10 +28,10 @@ def main():
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
     parser.add_argument('--seed', help='RNG seed', type=int, default=0)
-    parser.add_argument('--million_frames', help='How many frames to train (/ 1e6). '
-        'This number gets divided by 4 due to frameskip', type=int, default=40)
+    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
     args = parser.parse_args()
-    train(args.env, num_frames=1e6 * args.million_frames, seed=args.seed, num_cpu=32)
+    logger.configure()
+    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, num_cpu=32)
 
 
 if __name__ == '__main__':
@@ -13,13 +13,12 @@ from baselines.acktr.value_functions import NeuralNetValueFunction
 
 def train(env_id, num_timesteps, seed):
     env=gym.make(env_id)
-    if logger.get_dir():
-        env = bench.Monitor(env, os.path.join(logger.get_dir(), "monitor.json"))
+    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
     set_global_seeds(seed)
     env.seed(seed)
     gym.logger.setLevel(logging.WARN)
 
-    with tf.Session(config=tf.ConfigProto()) as session:
+    with tf.Session(config=tf.ConfigProto()):
         ob_dim = env.observation_space.shape[0]
         ac_dim = env.action_space.shape[0]
         with tf.variable_scope("vf"):
@@ -38,5 +37,7 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Run Mujoco benchmark.')
     parser.add_argument('--seed', help='RNG seed', type=int, default=0)
     parser.add_argument('--env', help='environment ID', type=str, default="Reacher-v1")
+    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
     args = parser.parse_args()
-    train(args.env, num_timesteps=1e6, seed=args.seed)
+    logger.configure()
+    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
@@ -13,7 +13,7 @@ class NeuralNetValueFunction(object):
         wd_dict = {}
         h1 = tf.nn.elu(dense(X, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
         h2 = tf.nn.elu(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
-        vpred_n = dense(h2, 1, "hfinal", weight_init=None, bias_init=0, weight_loss_dict=wd_dict)[:,0]
+        vpred_n = dense(h2, 1, "hfinal", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)[:,0]
         sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
         wd_loss = tf.get_collection("vf_losses", None)
         loss = U.mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
@@ -22,7 +22,7 @@ class NeuralNetValueFunction(object):
         optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9, \
                                     clip_kl=0.3, epsilon=0.1, stats_decay=0.95, \
                                     async=1, kfac_update=2, cold_iter=50, \
-                                    weight_decay_dict=wd_dict, max_grad_norm=1.0)
+                                    weight_decay_dict=wd_dict, max_grad_norm=None)
         vf_var_list = []
         for var in tf.trainable_variables():
             if "vf" in var.name:
@@ -1,3 +1,3 @@
 from baselines.bench.benchmarks import *
 from baselines.bench.monitor import *
-
+from baselines.bench.simple_bench import simple_bench
@@ -1,61 +1,71 @@
+import os.path as osp
 
 _atari7 = ['BeamRider', 'Breakout', 'Enduro', 'Pong', 'Qbert', 'Seaquest', 'SpaceInvaders']
 _atariexpl7 = ['Freeway', 'Gravitar', 'MontezumaRevenge', 'Pitfall', 'PrivateEye', 'Solaris', 'Venture']
 
 _BENCHMARKS = []
 
 
 def register_benchmark(benchmark):
     for b in _BENCHMARKS:
         if b['name'] == benchmark['name']:
-            raise ValueError('Benchmark with name %s already registered!'%b['name'])
+            raise ValueError('Benchmark with name %s already registered!' % b['name'])
     _BENCHMARKS.append(benchmark)
 
 
 def list_benchmarks():
     return [b['name'] for b in _BENCHMARKS]
 
 
 def get_benchmark(benchmark_name):
     for b in _BENCHMARKS:
         if b['name'] == benchmark_name:
             return b
     raise ValueError('%s not found! Known benchmarks: %s' % (benchmark_name, list_benchmarks()))
 
 
 def get_task(benchmark, env_id):
     """Get a task by env_id. Return None if the benchmark doesn't have the env"""
     return next(filter(lambda task: task['env_id'] == env_id, benchmark['tasks']), None)
 
 
 def find_task_for_env_id_in_any_benchmark(env_id):
     for bm in _BENCHMARKS:
         for task in bm["tasks"]:
-            if task["env_id"]==env_id:
+            if task["env_id"] == env_id:
                 return bm, task
     return None, None
 
 
 _ATARI_SUFFIX = 'NoFrameskip-v4'
 
 register_benchmark({
-    'name' : 'Atari200M',
-    'description' :'7 Atari games from Mnih et al. (2013), with pixel observations, 200M frames',
-    'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 2, 'num_timesteps' : int(200e6)} for _game in _atari7]
+    'name': 'Atari50M',
+    'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 50M timesteps',
+    'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(50e6)} for _game in _atari7]
 })
 
 register_benchmark({
-    'name' : 'Atari40M',
-    'description' :'7 Atari games from Mnih et al. (2013), with pixel observations, 40M frames',
-    'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 2, 'num_timesteps' : int(40e6)} for _game in _atari7]
+    'name': 'Atari10M',
+    'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 10M timesteps',
+    'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atari7]
 })
 
 register_benchmark({
-    'name' : 'Atari1Hr',
-    'description' :'7 Atari games from Mnih et al. (2013), with pixel observations, 1 hour of walltime',
-    'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 2, 'num_seconds' : 60*60} for _game in _atari7]
+    'name': 'Atari1Hr',
+    'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 1 hour of walltime',
+    'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_seconds': 60 * 60} for _game in _atari7]
 })
 
 register_benchmark({
-    'name' : 'AtariExploration40M',
-    'description' :'7 Atari games emphasizing exploration, with pixel observations, 40M frames',
-    'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 2, 'num_timesteps' : int(40e6)} for _game in _atariexpl7]
+    'name': 'AtariExploration10M',
+    'description': '7 Atari games emphasizing exploration, with pixel observations, 10M timesteps',
+    'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atariexpl7]
 })
 
 
 
 
 # MuJoCo
 
 _mujocosmall = [
@@ -63,78 +73,60 @@ _mujocosmall = [
     'HalfCheetah-v1', 'Hopper-v1', 'Walker2d-v1',
     'Reacher-v1', 'Swimmer-v1']
 register_benchmark({
-    'name' : 'Mujoco1M',
-    'description' : 'Some small 2D MuJoCo tasks, run for 1M timesteps',
-    'tasks' : [{'env_id' : _envid, 'trials' : 3, 'num_timesteps' : int(1e6)} for _envid in _mujocosmall]
+    'name': 'Mujoco1M',
+    'description': 'Some small 2D MuJoCo tasks, run for 1M timesteps',
+    'tasks': [{'env_id': _envid, 'trials': 3, 'num_timesteps': int(1e6)} for _envid in _mujocosmall]
 })
 register_benchmark({
-    'name' : 'MujocoWalkers',
-    'description' : 'MuJoCo forward walkers, run for 8M, humanoid 100M',
-    'tasks' : [
-        {'env_id' : "Hopper-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
-        {'env_id' : "Walker2d-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
-        {'env_id' : "Humanoid-v1", 'trials' : 4, 'num_timesteps' : 100*1000000 },
+    'name': 'MujocoWalkers',
+    'description': 'MuJoCo forward walkers, run for 8M, humanoid 100M',
+    'tasks': [
+        {'env_id': "Hopper-v1", 'trials': 4, 'num_timesteps': 8 * 1000000},
+        {'env_id': "Walker2d-v1", 'trials': 4, 'num_timesteps': 8 * 1000000},
+        {'env_id': "Humanoid-v1", 'trials': 4, 'num_timesteps': 100 * 1000000},
     ]
 })
-# To reproduce:
-# python3 baselines/baselines/ppo2/ppo2_run_benchmark.py gce MujocoWalkers myrun_ppo2_whiteobs1_cpu8
-# (observation input filters necessary)
 
 
 # Roboschool
 
 register_benchmark({
-    'name' : 'Roboschool8M',
-    'description' : 'Small 2D tasks, up to 30 minutes to complete on 8 cores',
-    'tasks' : [
-        {'env_id' : "RoboschoolReacher-v1", 'trials' : 4, 'num_timesteps' : 2*1000000 },
-        {'env_id' : "RoboschoolAnt-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
-        {'env_id' : "RoboschoolHalfCheetah-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
-        {'env_id' : "RoboschoolHopper-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
-        {'env_id' : "RoboschoolWalker2d-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
+    'name': 'Roboschool8M',
+    'description': 'Small 2D tasks, up to 30 minutes to complete on 8 cores',
+    'tasks': [
+        {'env_id': "RoboschoolReacher-v1", 'trials': 4, 'num_timesteps': 2 * 1000000},
+        {'env_id': "RoboschoolAnt-v1", 'trials': 4, 'num_timesteps': 8 * 1000000},
+        {'env_id': "RoboschoolHalfCheetah-v1", 'trials': 4, 'num_timesteps': 8 * 1000000},
+        {'env_id': "RoboschoolHopper-v1", 'trials': 4, 'num_timesteps': 8 * 1000000},
+        {'env_id': "RoboschoolWalker2d-v1", 'trials': 4, 'num_timesteps': 8 * 1000000},
     ]
 })
 register_benchmark({
-    'name' : 'RoboschoolHarder',
-    'description' : 'Test your might!!! Up to 12 hours on 32 cores',
-    'tasks' : [
-        {'env_id' : "RoboschoolHumanoid-v1", 'trials' : 4, 'num_timesteps' : 100*1000000 },
-        {'env_id' : "RoboschoolHumanoidFlagrun-v1", 'trials' : 4, 'num_timesteps' : 200*1000000 },
-        {'env_id' : "RoboschoolHumanoidFlagrunHarder-v1", 'trials' : 4, 'num_timesteps' : 400*1000000 },
+    'name': 'RoboschoolHarder',
+    'description': 'Test your might!!! Up to 12 hours on 32 cores',
+    'tasks': [
+        {'env_id': "RoboschoolHumanoid-v1", 'trials': 4, 'num_timesteps': 100 * 1000000},
+        {'env_id': "RoboschoolHumanoidFlagrun-v1", 'trials': 4, 'num_timesteps': 200 * 1000000},
+        {'env_id': "RoboschoolHumanoidFlagrunHarder-v1", 'trials': 4, 'num_timesteps': 400 * 1000000},
     ]
 })
-# To reproduce:
-# python3 baselines/baselines/ppo2/ppo2_run_benchmark.py gce Roboschool8M myrun_ppo2_cpu8
-# python3 baselines/baselines/ppo2/ppo2_run_benchmark.py gce RoboschoolHarder myrun_ppo2_cpu32_large_samples65536
-# (Large network, train on 65536 samples each iteration. Also, _large is really necessary only for Harder)
 
 
 # Other
 
-_atari50 = [ # actually 49
+_atari50 = [ # actually 47
     'Alien', 'Amidar', 'Assault', 'Asterix', 'Asteroids',
     'Atlantis', 'BankHeist', 'BattleZone', 'BeamRider', 'Bowling',
-    'Boxing', 'Breakout', 'Centipede', 'ChopperCommand', 'CrazyClimber',
+    'Breakout', 'Centipede', 'ChopperCommand', 'CrazyClimber',
     'DemonAttack', 'DoubleDunk', 'Enduro', 'FishingDerby', 'Freeway',
     'Frostbite', 'Gopher', 'Gravitar', 'IceHockey', 'Jamesbond',
     'Kangaroo', 'Krull', 'KungFuMaster', 'MontezumaRevenge', 'MsPacman',
     'NameThisGame', 'Pitfall', 'Pong', 'PrivateEye', 'Qbert',
-    'Riverraid', 'RoadRunner', 'Robotank', 'Seaquest', 'SpaceInvaders',
-    'StarGunner', 'Tennis', 'TimePilot', 'Tutankham', 'UpNDown',
-    'Venture', 'VideoPinball', 'WizardOfWor', 'Zaxxon',
+    'RoadRunner', 'Robotank', 'Seaquest', 'SpaceInvaders', 'StarGunner',
+    'Tennis', 'TimePilot', 'Tutankham', 'UpNDown', 'Venture',
+    'VideoPinball', 'WizardOfWor', 'Zaxxon',
 ]
 
 register_benchmark({
-    'name' : 'Atari50_40M',
-    'description' :'7 Atari games from Mnih et al. (2013), with pixel observations, 40M frames',
-    'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 3, 'num_timesteps' : int(40e6)} for _game in _atari50]
+    'name': 'Atari50_10M',
+    'description': '47 Atari games from Mnih et al. (2013), with pixel observations, 10M timesteps',
+    'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 3, 'num_timesteps': int(10e6)} for _game in _atari50]
 })
 
-def env_shortname(s):
-    "Make typical names above shorter, while keeping recognizable"
-    s = s.replace("NoFrameskip", "")
-    if s[:10]=="Roboschool": s = s[10:]
-    i = s.rfind("-v")
-    if i!=-1: s = s[:i]
-
-    return s.lower()
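The registry API itself is unchanged; only the names and budgets moved (200M to 50M, 40M to 10M, and "frames" to "timesteps"). Looking a benchmark up still works the same way, as in this short sketch:

    from baselines.bench import benchmarks

    bm = benchmarks.get_benchmark('Atari10M')            # was 'Atari40M'
    task = benchmarks.get_task(bm, 'BreakoutNoFrameskip-v4')
    print(task['trials'], task['num_timesteps'])         # -> 2 10000000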
@@ -2,20 +2,17 @@ __all__ = ['Monitor', 'get_monitor_files', 'load_results']
 
 import gym
 from gym.core import Wrapper
-from os import path
 import time
 from glob import glob
-try:
-    import ujson as json # Not necessary for monitor writing, but very useful for monitor loading
-except ImportError:
-    import json
+import csv
+import os.path as osp
+import json
 
 class Monitor(Wrapper):
-    EXT = "monitor.json"
+    EXT = "monitor.csv"
     f = None
 
-    def __init__(self, env, filename, allow_early_resets=False):
+    def __init__(self, env, filename, allow_early_resets=False, reset_keywords=()):
         Wrapper.__init__(self, env=env)
         self.tstart = time.time()
         if filename is None:
@@ -23,50 +20,38 @@ class Monitor(Wrapper):
             self.logger = None
         else:
             if not filename.endswith(Monitor.EXT):
-                filename = filename + "." + Monitor.EXT
+                if osp.isdir(filename):
+                    filename = osp.join(filename, Monitor.EXT)
+                else:
+                    filename = filename + "." + Monitor.EXT
             self.f = open(filename, "wt")
-            self.logger = JSONLogger(self.f)
-            self.logger.writekvs({"t_start": self.tstart, "gym_version": gym.__version__,
-                "env_id": env.spec.id if env.spec else 'Unknown'})
+            self.f.write('#%s\n'%json.dumps({"t_start": self.tstart, "gym_version": gym.__version__,
+                "env_id": env.spec.id if env.spec else 'Unknown'}))
+            self.logger = csv.DictWriter(self.f, fieldnames=('r', 'l', 't')+reset_keywords)
+            self.logger.writeheader()
+
+        self.reset_keywords = reset_keywords
         self.allow_early_resets = allow_early_resets
         self.rewards = None
         self.needs_reset = True
         self.episode_rewards = []
         self.episode_lengths = []
         self.total_steps = 0
-        self.current_metadata = {} # extra info that gets injected into each log entry
-        # Useful for metalearning where we're modifying the environment externally
-        # But want our logs to know about these modifications
+        self.current_reset_info = {} # extra info about the current episode, that was passed in during reset()
 
-    def __getstate__(self): # XXX
-        d = self.__dict__.copy()
-        if self.f:
-            del d['f'], d['logger']
-            d['_filename'] = self.f.name
-            d['_num_episodes'] = len(self.episode_rewards)
-        else:
-            d['_filename'] = None
-        return d
-
-    def __setstate__(self, d):
-        filename = d.pop('_filename')
-        self.__dict__ = d
-        if filename is not None:
-            nlines = d.pop('_num_episodes') + 1
-            self.f = open(filename, "r+t")
-            for _ in range(nlines):
-                self.f.readline()
-            self.f.truncate()
-            self.logger = JSONLogger(self.f)
-
-    def reset(self):
+    def _reset(self, **kwargs):
         if not self.allow_early_resets and not self.needs_reset:
             raise RuntimeError("Tried to reset an environment before done. If you want to allow early resets, wrap your env with Monitor(env, path, allow_early_resets=True)")
         self.rewards = []
         self.needs_reset = False
-        return self.env.reset()
+        for k in self.reset_keywords:
+            v = kwargs.get(k)
+            if v is None:
+                raise ValueError('Expected you to pass kwarg %s into reset'%k)
+            self.current_reset_info[k] = v
+        return self.env.reset(**kwargs)
 
-    def step(self, action):
+    def _step(self, action):
         if self.needs_reset:
             raise RuntimeError("Tried to step environment that needs reset")
         ob, rew, done, info = self.env.step(action)
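With these changes a monitor file becomes a single '#'-prefixed JSON header line followed by plain CSV, one row per episode, and each reset keyword adds one more column. An illustrative file (all values made up):

    #{"t_start": 1499900000.0, "gym_version": "0.9.2", "env_id": "BreakoutNoFrameskip-v4"}
    r,l,t
    1.0,210,5.113456
    4.0,332,12.920034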
@@ -75,10 +60,11 @@ class Monitor(Wrapper):
             self.needs_reset = True
             eprew = sum(self.rewards)
             eplen = len(self.rewards)
-            epinfo = {"r": eprew, "l": eplen, "t": round(time.time() - self.tstart, 6)}
-            epinfo.update(self.current_metadata)
+            epinfo = {"r": round(eprew, 6), "l": eplen, "t": round(time.time() - self.tstart, 6)}
+            epinfo.update(self.current_reset_info)
             if self.logger:
-                self.logger.writekvs(epinfo)
+                self.logger.writerow(epinfo)
+                self.f.flush()
             self.episode_rewards.append(eprew)
             self.episode_lengths.append(eplen)
             info['episode'] = epinfo
@@ -98,52 +84,40 @@ class Monitor(Wrapper):
     def get_episode_lengths(self):
         return self.episode_lengths
 
-class JSONLogger(object):
-    def __init__(self, file):
-        self.file = file
-
-    def writekvs(self, kvs):
-        for k,v in kvs.items():
-            if hasattr(v, 'dtype'):
-                v = v.tolist()
-                kvs[k] = float(v)
-        self.file.write(json.dumps(kvs) + '\n')
-        self.file.flush()
-
 class LoadMonitorResultsError(Exception):
     pass
 
 def get_monitor_files(dir):
-    return glob(path.join(dir, "*" + Monitor.EXT))
+    return glob(osp.join(dir, "*" + Monitor.EXT))
 
-def load_results(dir, raw_episodes=False):
-    fnames = get_monitor_files(dir)
-    if not fnames:
+def load_results(dir):
+    import pandas
+    monitor_files = glob(osp.join(dir, "*monitor.*")) # get both csv and (old) json files
+    if not monitor_files:
         raise LoadMonitorResultsError("no monitor files of the form *%s found in %s" % (Monitor.EXT, dir))
-    episodes = []
+    dfs = []
     headers = []
-    for fname in fnames:
+    for fname in monitor_files:
         with open(fname, 'rt') as fh:
-            lines = fh.readlines()
-            header = json.loads(lines[0])
-            headers.append(header)
-            for line in lines[1:]:
-                episode = json.loads(line)
-                episode['abstime'] = header['t_start'] + episode['t']
-                del episode['t']
-                episodes.append(episode)
-    header0 = headers[0]
-    for header in headers[1:]:
-        assert header['env_id'] == header0['env_id'], "mixing data from two envs"
-    episodes = sorted(episodes, key=lambda e: e['abstime'])
-    if raw_episodes:
-        return episodes
-    else:
-        return {
-            'env_info': {'env_id': header0['env_id'], 'gym_version': header0['gym_version']},
-            'episode_end_times': [e['abstime'] for e in episodes],
-            'episode_lengths': [e['l'] for e in episodes],
-            'episode_rewards': [e['r'] for e in episodes],
-            'initial_reset_time': min([min(header['t_start'] for header in headers)])
-        }
+            if fname.endswith('csv'):
+                firstline = fh.readline()
+                assert firstline[0] == '#'
+                header = json.loads(firstline[1:])
+                df = pandas.read_csv(fh, index_col=None)
+                headers.append(header)
+            elif fname.endswith('json'): # Deprecated json format
+                episodes = []
+                lines = fh.readlines()
+                header = json.loads(lines[0])
+                headers.append(header)
+                for line in lines[1:]:
+                    episode = json.loads(line)
+                    episodes.append(episode)
+                df = pandas.DataFrame(episodes)
+            df['t'] += header['t_start']
+        dfs.append(df)
+    df = pandas.concat(dfs)
+    df.sort_values('t', inplace=True)
+    df['t'] -= min(header['t_start'] for header in headers)
+    df.headers = headers # HACK to preserve backwards compatibility
+    return df
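load_results now hands back a pandas DataFrame instead of the old dict of lists, with 't' rebased to the earliest t_start across the monitor files in the directory. A usage sketch (the directory path is hypothetical):

    from baselines.bench.monitor import load_results

    df = load_results('/tmp/breakout_run')  # reads *monitor.csv plus legacy *monitor.json
    print(df[['r', 'l', 't']].head())       # per-episode reward, length, end time
    print(df.headers[0]['env_id'])          # header metadata kept on the frame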
@@ -1,9 +1,8 @@
 import numpy as np
 from collections import deque
-from PIL import Image
 import gym
 from gym import spaces
+import cv2
 
 class NoopResetEnv(gym.Wrapper):
     def __init__(self, env, noop_max=30):
@@ -13,11 +12,16 @@ class NoopResetEnv(gym.Wrapper):
         gym.Wrapper.__init__(self, env)
         self.noop_max = noop_max
         self.override_num_noops = None
-        assert env.unwrapped.get_action_meanings()[0] == 'NOOP'
+        if isinstance(env.action_space, gym.spaces.MultiBinary):
+            self.noop_action = np.zeros(self.env.action_space.n, dtype=np.int64)
+        else:
+            # used for atari environments
+            self.noop_action = 0
+            assert env.unwrapped.get_action_meanings()[0] == 'NOOP'
 
-    def _reset(self):
+    def _reset(self, **kwargs):
         """ Do no-op action for a number of steps in [1, noop_max]."""
-        self.env.reset()
+        self.env.reset(**kwargs)
         if self.override_num_noops is not None:
             noops = self.override_num_noops
         else:
@@ -25,9 +29,9 @@ class NoopResetEnv(gym.Wrapper):
         assert noops > 0
         obs = None
         for _ in range(noops):
-            obs, _, done, _ = self.env.step(0)
+            obs, _, done, _ = self.env.step(self.noop_action)
             if done:
-                obs = self.env.reset()
+                obs = self.env.reset(**kwargs)
         return obs
 
 class FireResetEnv(gym.Wrapper):
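The effect of the new branch: Discrete Atari action spaces keep the scalar NOOP action 0, while a MultiBinary space needs an all-zeros vector of matching length. A standalone sketch of that selection logic:

    import gym
    import numpy as np

    def pick_noop_action(action_space):
        if isinstance(action_space, gym.spaces.MultiBinary):
            return np.zeros(action_space.n, dtype=np.int64)
        return 0  # Atari convention: action index 0 is 'NOOP'

    print(pick_noop_action(gym.spaces.Discrete(6)))     # -> 0
    print(pick_noop_action(gym.spaces.MultiBinary(8)))  # -> [0 0 0 0 0 0 0 0]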
@@ -37,14 +41,14 @@ class FireResetEnv(gym.Wrapper):
         assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
         assert len(env.unwrapped.get_action_meanings()) >= 3
 
-    def _reset(self):
-        self.env.reset()
+    def _reset(self, **kwargs):
+        self.env.reset(**kwargs)
         obs, _, done, _ = self.env.step(1)
         if done:
-            self.env.reset()
+            self.env.reset(**kwargs)
         obs, _, done, _ = self.env.step(2)
         if done:
-            self.env.reset()
+            self.env.reset(**kwargs)
         return obs
 
 class EpisodicLifeEnv(gym.Wrapper):
@@ -70,13 +74,13 @@ class EpisodicLifeEnv(gym.Wrapper):
         self.lives = lives
         return obs, reward, done, info
 
-    def _reset(self):
+    def _reset(self, **kwargs):
         """Reset only when lives are exhausted.
         This way all states are still reachable even though lives are episodic,
         and the learner need not know about any of this behind-the-scenes.
         """
         if self.was_real_done:
-            obs = self.env.reset()
+            obs = self.env.reset(**kwargs)
         else:
             # no-op step to advance from terminal/lost life state
             obs, _, _, _ = self.env.step(0)
@@ -88,30 +92,26 @@ class MaxAndSkipEnv(gym.Wrapper):
         """Return only every `skip`-th frame"""
         gym.Wrapper.__init__(self, env)
         # most recent raw observations (for max pooling across time steps)
-        self._obs_buffer = deque(maxlen=2)
+        self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype='uint8')
         self._skip = skip
 
     def _step(self, action):
         """Repeat action, sum reward, and max over last observations."""
         total_reward = 0.0
         done = None
-        for _ in range(self._skip):
+        for i in range(self._skip):
             obs, reward, done, info = self.env.step(action)
-            self._obs_buffer.append(obs)
+            if i == self._skip - 2: self._obs_buffer[0] = obs
+            if i == self._skip - 1: self._obs_buffer[1] = obs
             total_reward += reward
             if done:
                 break
-        max_frame = np.max(np.stack(self._obs_buffer), axis=0)
+        # Note that the observation on the done=True frame
+        # doesn't matter
+        max_frame = self._obs_buffer.max(axis=0)
 
         return max_frame, total_reward, done, info
 
-    def _reset(self):
-        """Clear past frame buffer and init. to first obs. from inner env."""
-        self._obs_buffer.clear()
-        obs = self.env.reset()
-        self._obs_buffer.append(obs)
-        return obs
-
 class ClipRewardEnv(gym.RewardWrapper):
     def _reward(self, reward):
         """Bin reward to {+1, 0, -1} by its sign."""
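The max over the last two raw frames compensates for Atari sprites that flicker on alternate frames; the preallocated uint8 buffer simply avoids re-stacking a deque every step. A quick numpy check that the old and new forms agree:

    import numpy as np

    obs_buffer = np.zeros((2, 210, 160, 3), dtype='uint8')
    frame_a = np.random.randint(0, 256, (210, 160, 3), dtype=np.uint8)
    frame_b = np.random.randint(0, 256, (210, 160, 3), dtype=np.uint8)
    obs_buffer[0] = frame_a                             # step skip-2
    obs_buffer[1] = frame_b                             # step skip-1
    old_style = np.max(np.stack([frame_a, frame_b]), axis=0)
    new_style = obs_buffer.max(axis=0)
    assert (old_style == new_style).all()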
@@ -121,52 +121,89 @@ class WarpFrame(gym.ObservationWrapper):
     def __init__(self, env):
         """Warp frames to 84x84 as done in the Nature paper and later work."""
         gym.ObservationWrapper.__init__(self, env)
-        self.res = 84
-        self.observation_space = spaces.Box(low=0, high=255, shape=(self.res, self.res, 1))
+        self.width = 84
+        self.height = 84
+        self.observation_space = spaces.Box(low=0, high=255, shape=(self.height, self.width, 1))
 
-    def _observation(self, obs):
-        frame = np.dot(obs.astype('float32'), np.array([0.299, 0.587, 0.114], 'float32'))
-        frame = np.array(Image.fromarray(frame).resize((self.res, self.res),
-            resample=Image.BILINEAR), dtype=np.uint8)
-        return frame.reshape((self.res, self.res, 1))
+    def _observation(self, frame):
+        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
+        frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA)
+        return frame[:, :, None]
 
 class FrameStack(gym.Wrapper):
     def __init__(self, env, k):
-        """Buffer observations and stack across channels (last axis)."""
+        """Stack k last frames.
+
+        Returns lazy array, which is much more memory efficient.
+
+        See Also
+        --------
+        baselines.common.atari_wrappers.LazyFrames
+        """
         gym.Wrapper.__init__(self, env)
         self.k = k
         self.frames = deque([], maxlen=k)
         shp = env.observation_space.shape
-        assert shp[2] == 1 # can only stack 1-channel frames
-        self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], k))
+        self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k))
 
     def _reset(self):
-        """Clear buffer and re-fill by duplicating the first observation."""
         ob = self.env.reset()
-        for _ in range(self.k): self.frames.append(ob)
-        return self._observation()
+        for _ in range(self.k):
+            self.frames.append(ob)
+        return self._get_ob()
 
     def _step(self, action):
         ob, reward, done, info = self.env.step(action)
         self.frames.append(ob)
-        return self._observation(), reward, done, info
+        return self._get_ob(), reward, done, info
 
-    def _observation(self):
+    def _get_ob(self):
         assert len(self.frames) == self.k
-        return np.concatenate(self.frames, axis=2)
+        return LazyFrames(list(self.frames))
 
-def wrap_deepmind(env, episode_life=True, clip_rewards=True):
-    """Configure environment for DeepMind-style Atari.
-
-    Note: this does not include frame stacking!"""
-    assert 'NoFrameskip' in env.spec.id # required for DeepMind-style skip
-    if episode_life:
-        env = EpisodicLifeEnv(env)
+class ScaledFloatFrame(gym.ObservationWrapper):
+    def _observation(self, observation):
+        # careful! This undoes the memory optimization, use
+        # with smaller replay buffers only.
+        return np.array(observation).astype(np.float32) / 255.0
+
+class LazyFrames(object):
+    def __init__(self, frames):
+        """This object ensures that common frames between the observations are only stored once.
+        It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay
+        buffers.
+
+        This object should only be converted to numpy array before being passed to the model.
+
+        You'd not belive how complex the previous solution was."""
+        self._frames = frames
+
+    def __array__(self, dtype=None):
+        out = np.concatenate(self._frames, axis=2)
+        if dtype is not None:
+            out = out.astype(dtype)
+        return out
+
+def make_atari(env_id):
+    env = gym.make(env_id)
+    assert 'NoFrameskip' in env.spec.id
     env = NoopResetEnv(env, noop_max=30)
     env = MaxAndSkipEnv(env, skip=4)
+    return env
+
+def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False):
+    """Configure environment for DeepMind-style Atari.
+    """
+    if episode_life:
+        env = EpisodicLifeEnv(env)
     if 'FIRE' in env.unwrapped.get_action_meanings():
         env = FireResetEnv(env)
     env = WarpFrame(env)
+    if scale:
+        env = ScaledFloatFrame(env)
     if clip_rewards:
         env = ClipRewardEnv(env)
+    if frame_stack:
+        env = FrameStack(env, 4)
     return env
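The headline change in one place: grayscale-plus-resize now runs through OpenCV and stays in uint8 end to end (the old path went through float32 and a PIL bilinear resize), and FrameStack hands the model a LazyFrames view that shares the k frames until an array is actually needed. A self-contained sketch on a synthetic frame:

    import cv2
    import numpy as np
    from baselines.common.atari_wrappers import LazyFrames

    rgb = np.random.randint(0, 256, (210, 160, 3), dtype=np.uint8)
    gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)                      # (210, 160) uint8
    small = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)  # (84, 84) uint8
    obs = small[:, :, None]                                           # (84, 84, 1)

    stacked = LazyFrames([obs] * 4)   # stores 4 references, not 4 copies
    batch = np.array(stacked)         # __array__ concatenates on demand
    assert batch.shape == (84, 84, 4)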
@@ -1,239 +0,0 @@
|
|||||||
import cv2
|
|
||||||
import gym
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
from collections import deque
|
|
||||||
from gym import spaces
|
|
||||||
|
|
||||||
|
|
||||||
class NoopResetEnv(gym.Wrapper):
|
|
||||||
def __init__(self, env=None, noop_max=30):
|
|
||||||
"""Sample initial states by taking random number of no-ops on reset.
|
|
||||||
No-op is assumed to be action 0.
|
|
||||||
"""
|
|
||||||
super(NoopResetEnv, self).__init__(env)
|
|
||||||
self.noop_max = noop_max
|
|
||||||
self.override_num_noops = None
|
|
||||||
assert env.unwrapped.get_action_meanings()[0] == 'NOOP'
|
|
||||||
|
|
||||||
def _reset(self):
|
|
||||||
""" Do no-op action for a number of steps in [1, noop_max]."""
|
|
||||||
self.env.reset()
|
|
||||||
if self.override_num_noops is not None:
|
|
||||||
noops = self.override_num_noops
|
|
||||||
else:
|
|
||||||
noops = np.random.randint(1, self.noop_max + 1)
|
|
||||||
assert noops > 0
|
|
||||||
obs = None
|
|
||||||
for _ in range(noops):
|
|
||||||
obs, _, done, _ = self.env.step(0)
|
|
||||||
if done:
|
|
||||||
obs = self.env.reset()
|
|
||||||
return obs
|
|
||||||
|
|
||||||
|
|
||||||
class FireResetEnv(gym.Wrapper):
|
|
||||||
def __init__(self, env=None):
|
|
||||||
"""For environments where the user need to press FIRE for the game to start."""
|
|
||||||
super(FireResetEnv, self).__init__(env)
|
|
||||||
assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
|
|
||||||
assert len(env.unwrapped.get_action_meanings()) >= 3
|
|
||||||
|
|
||||||
def _reset(self):
|
|
||||||
self.env.reset()
|
|
||||||
obs, _, done, _ = self.env.step(1)
|
|
||||||
if done:
|
|
||||||
self.env.reset()
|
|
||||||
obs, _, done, _ = self.env.step(2)
|
|
||||||
if done:
|
|
||||||
self.env.reset()
|
|
||||||
return obs
|
|
||||||
|
|
||||||
|
|
||||||
class EpisodicLifeEnv(gym.Wrapper):
|
|
||||||
def __init__(self, env=None):
|
|
||||||
"""Make end-of-life == end-of-episode, but only reset on true game over.
|
|
||||||
Done by DeepMind for the DQN and co. since it helps value estimation.
|
|
||||||
"""
|
|
||||||
super(EpisodicLifeEnv, self).__init__(env)
|
|
||||||
self.lives = 0
|
|
||||||
self.was_real_done = True
|
|
||||||
self.was_real_reset = False
|
|
||||||
|
|
||||||
def _step(self, action):
|
|
||||||
obs, reward, done, info = self.env.step(action)
|
|
||||||
self.was_real_done = done
|
|
||||||
# check current lives, make loss of life terminal,
|
|
||||||
# then update lives to handle bonus lives
|
|
||||||
lives = self.env.unwrapped.ale.lives()
|
|
||||||
if lives < self.lives and lives > 0:
|
|
||||||
# for Qbert somtimes we stay in lives == 0 condtion for a few frames
|
|
||||||
# so its important to keep lives > 0, so that we only reset once
|
|
||||||
# the environment advertises done.
|
|
||||||
done = True
|
|
||||||
self.lives = lives
|
|
||||||
return obs, reward, done, info
|
|
||||||
|
|
||||||
def _reset(self):
|
|
||||||
"""Reset only when lives are exhausted.
|
|
||||||
This way all states are still reachable even though lives are episodic,
|
|
||||||
and the learner need not know about any of this behind-the-scenes.
|
|
||||||
"""
|
|
||||||
if self.was_real_done:
|
|
||||||
obs = self.env.reset()
|
|
||||||
self.was_real_reset = True
|
|
||||||
else:
|
|
||||||
# no-op step to advance from terminal/lost life state
|
|
||||||
obs, _, _, _ = self.env.step(0)
|
|
||||||
self.was_real_reset = False
|
|
||||||
self.lives = self.env.unwrapped.ale.lives()
|
|
||||||
return obs
|
|
||||||
|
|
||||||
|
|
||||||
class MaxAndSkipEnv(gym.Wrapper):
|
|
||||||
def __init__(self, env=None, skip=4):
|
|
||||||
"""Return only every `skip`-th frame"""
|
|
||||||
super(MaxAndSkipEnv, self).__init__(env)
|
|
||||||
# most recent raw observations (for max pooling across time steps)
|
|
||||||
self._obs_buffer = deque(maxlen=2)
|
|
||||||
self._skip = skip
|
|
||||||
|
|
||||||
def _step(self, action):
|
|
||||||
total_reward = 0.0
|
|
||||||
done = None
|
|
||||||
for _ in range(self._skip):
|
|
||||||
obs, reward, done, info = self.env.step(action)
|
|
||||||
self._obs_buffer.append(obs)
|
|
||||||
total_reward += reward
|
|
||||||
if done:
|
|
||||||
break
|
|
||||||
|
|
||||||
max_frame = np.max(np.stack(self._obs_buffer), axis=0)
|
|
||||||
|
|
||||||
return max_frame, total_reward, done, info
|
|
||||||
|
|
||||||
def _reset(self):
|
|
||||||
"""Clear past frame buffer and init. to first obs. from inner env."""
|
|
||||||
self._obs_buffer.clear()
|
|
||||||
obs = self.env.reset()
|
|
||||||
self._obs_buffer.append(obs)
|
|
||||||
return obs
|
|
||||||
|
|
||||||
|
|
||||||
class ProcessFrame84(gym.ObservationWrapper):
|
|
||||||
def __init__(self, env=None):
|
|
||||||
super(ProcessFrame84, self).__init__(env)
|
|
||||||
self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1))
|
|
||||||
|
|
||||||
def _observation(self, obs):
|
|
||||||
return ProcessFrame84.process(obs)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def process(frame):
|
|
||||||
if frame.size == 210 * 160 * 3:
|
|
||||||
img = np.reshape(frame, [210, 160, 3]).astype(np.float32)
|
|
||||||
elif frame.size == 250 * 160 * 3:
|
|
||||||
img = np.reshape(frame, [250, 160, 3]).astype(np.float32)
|
|
||||||
else:
|
|
||||||
assert False, "Unknown resolution."
|
|
||||||
img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114
|
|
||||||
resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_AREA)
|
|
||||||
x_t = resized_screen[18:102, :]
|
|
||||||
x_t = np.reshape(x_t, [84, 84, 1])
|
|
||||||
return x_t.astype(np.uint8)
|
|
||||||
|
|
||||||
|
|
||||||
class ClippedRewardsWrapper(gym.RewardWrapper):
|
|
||||||
def _reward(self, reward):
|
|
||||||
"""Change all the positive rewards to 1, negative to -1 and keep zero."""
|
|
||||||
return np.sign(reward)
|
|
||||||
|
|
||||||
|
|
||||||
class LazyFrames(object):
    def __init__(self, frames):
        """This object ensures that common frames between the observations are only stored once.
        It exists purely to optimize memory usage, which can be huge for DQN's 1M-frame replay
        buffers.

        This object should only be converted to a numpy array before being passed to the model.

        You'd not believe how complex the previous solution was."""
        self._frames = frames

    def __array__(self, dtype=None):
        out = np.concatenate(self._frames, axis=2)
        if dtype is not None:
            out = out.astype(dtype)
        return out
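
To see the saving: with k = 4 stacked frames and a 1M-transition buffer, a dense copy would store every frame four times; LazyFrames stores references, so consecutive observations share their overlapping frames. A small sketch using the class above:

import numpy as np

frames = [np.zeros((84, 84, 1), dtype=np.uint8) for _ in range(5)]
obs_t = LazyFrames(frames[0:4])    # stacks frames 0..3
obs_t1 = LazyFrames(frames[1:5])   # stacks frames 1..4
assert np.asarray(obs_t).shape == (84, 84, 4)
assert obs_t._frames[1] is obs_t1._frames[0]  # one array object, stored once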
class FrameStack(gym.Wrapper):
    def __init__(self, env, k):
        """Stack the k last frames.

        Returns a lazy array, which is much more memory efficient.

        See Also
        --------
        baselines.common.atari_wrappers.LazyFrames
        """
        gym.Wrapper.__init__(self, env)
        self.k = k
        self.frames = deque([], maxlen=k)
        shp = env.observation_space.shape
        self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k))

    def _reset(self):
        ob = self.env.reset()
        for _ in range(self.k):
            self.frames.append(ob)
        return self._get_ob()

    def _step(self, action):
        ob, reward, done, info = self.env.step(action)
        self.frames.append(ob)
        return self._get_ob(), reward, done, info

    def _get_ob(self):
        assert len(self.frames) == self.k
        return LazyFrames(list(self.frames))
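
A usage sketch, assuming the wrappers above and a local Atari install: stacking turns each (84, 84, 1) frame into an (84, 84, 4) observation, and the result stays lazy until converted.

import gym
import numpy as np

env = FrameStack(ProcessFrame84(gym.make("PongNoFrameskip-v4")), 4)
ob = env.reset()                          # a LazyFrames, not a dense array
assert np.array(ob).shape == (84, 84, 4)  # materialized only on conversion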
class ScaledFloatFrame(gym.ObservationWrapper):
    def _observation(self, obs):
        # careful! This undoes the memory optimization; use
        # with smaller replay buffers only.
        return np.array(obs).astype(np.float32) / 255.0
def wrap_dqn(env):
    """Apply a common set of wrappers for Atari games."""
    assert 'NoFrameskip' in env.spec.id
    env = EpisodicLifeEnv(env)
    env = NoopResetEnv(env, noop_max=30)
    env = MaxAndSkipEnv(env, skip=4)
    if 'FIRE' in env.unwrapped.get_action_meanings():
        env = FireResetEnv(env)
    env = ProcessFrame84(env)
    env = FrameStack(env, 4)
    env = ClippedRewardsWrapper(env)
    return env
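
Ordering matters here: EpisodicLifeEnv and NoopResetEnv need to see raw emulator steps, so they sit beneath the frame-skipping, resizing, and stacking layers. A hedged usage sketch (any NoFrameskip id satisfies the assert):

import gym
import numpy as np

env = wrap_dqn(gym.make("BreakoutNoFrameskip-v4"))
ob = env.reset()
assert np.array(ob).shape == (84, 84, 4)  # grayscale, 84x84, 4-frame stack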
class A2cProcessFrame(gym.Wrapper):
    def __init__(self, env):
        gym.Wrapper.__init__(self, env)
        self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1))

    def _step(self, action):
        ob, reward, done, info = self.env.step(action)
        return A2cProcessFrame.process(ob), reward, done, info

    def _reset(self):
        return A2cProcessFrame.process(self.env.reset())

    @staticmethod
    def process(frame):
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        frame = cv2.resize(frame, (84, 84), interpolation=cv2.INTER_AREA)
        return frame.reshape(84, 84, 1)
@@ -10,10 +10,7 @@ except ImportError:
 from shutil import unpack_archive
 from threading import Event

-"""TODOS:
-   - use Azure snapshots instead of hacky backups
-"""
+# TODOS: use Azure snapshots instead of hacky backups


 def fixed_list_blobs(service, *args, **kwargs):
     """By default, list_containers only returns a subset of results.
@@ -37,7 +34,7 @@ def make_archive(source_path, dest_path):
     prefix_path = os.path.dirname(source_path)
     with zipfile.ZipFile(dest_path, "w", compression=zipfile.ZIP_STORED) as zf:
         if os.path.isdir(source_path):
-            for dirname, subdirs, files in os.walk(source_path):
+            for dirname, _subdirs, files in os.walk(source_path):
                 zf.write(dirname, os.path.relpath(dirname, prefix_path))
                 for filename in files:
                     filepath = os.path.join(dirname, filename)
@@ -2,7 +2,6 @@ import tensorflow as tf
 import numpy as np
 import baselines.common.tf_util as U
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn

 class Pd(object):
     """
@@ -4,7 +4,6 @@ import os
 import pickle
 import random
 import tempfile
-import time
 import zipfile

@@ -153,76 +152,6 @@ class RunningAvg(object):
         """Get the current estimate"""
         return self._value

-
-class SimpleMonitor(gym.Wrapper):
-    def __init__(self, env):
-        """Adds two quantities to info returned by every step:
-
-        num_steps: int
-            Number of steps taken so far
-        rewards: [float]
-            All the cumulative rewards for the episodes completed so far.
-        """
-        super().__init__(env)
-        # current episode state
-        self._current_reward = None
-        self._num_steps = None
-        # temporary monitor state that we do not save
-        self._time_offset = None
-        self._total_steps = None
-        # monitor state
-        self._episode_rewards = []
-        self._episode_lengths = []
-        self._episode_end_times = []
-
-    def _reset(self):
-        obs = self.env.reset()
-        # recompute temporary state if needed
-        if self._time_offset is None:
-            self._time_offset = time.time()
-            if len(self._episode_end_times) > 0:
-                self._time_offset -= self._episode_end_times[-1]
-        if self._total_steps is None:
-            self._total_steps = sum(self._episode_lengths)
-        # update monitor state
-        if self._current_reward is not None:
-            self._episode_rewards.append(self._current_reward)
-            self._episode_lengths.append(self._num_steps)
-            self._episode_end_times.append(time.time() - self._time_offset)
-        # reset episode state
-        self._current_reward = 0
-        self._num_steps = 0
-
-        return obs
-
-    def _step(self, action):
-        obs, rew, done, info = self.env.step(action)
-        self._current_reward += rew
-        self._num_steps += 1
-        self._total_steps += 1
-        info['steps'] = self._total_steps
-        info['rewards'] = self._episode_rewards
-        return (obs, rew, done, info)
-
-    def get_state(self):
-        return {
-            'env_id': self.env.unwrapped.spec.id,
-            'episode_data': {
-                'episode_rewards': self._episode_rewards,
-                'episode_lengths': self._episode_lengths,
-                'episode_end_times': self._episode_end_times,
-                'initial_reset_time': 0,
-            }
-        }
-
-    def set_state(self, state):
-        assert state['env_id'] == self.env.unwrapped.spec.id
-        ed = state['episode_data']
-        self._episode_rewards = ed['episode_rewards']
-        self._episode_lengths = ed['episode_lengths']
-        self._episode_end_times = ed['episode_end_times']
-
-
 def boolean_flag(parser, name, default=False, help=None):
     """Add a boolean flag to argparse parser.
@@ -6,51 +6,41 @@ import copy
 import os
 import collections

 # ================================================================
 # Make consistent with numpy
 # ================================================================

 clip = tf.clip_by_value

 def sum(x, axis=None, keepdims=False):
     axis = None if axis is None else [axis]
     return tf.reduce_sum(x, axis=axis, keep_dims=keepdims)

 def mean(x, axis=None, keepdims=False):
     axis = None if axis is None else [axis]
     return tf.reduce_mean(x, axis=axis, keep_dims=keepdims)

 def var(x, axis=None, keepdims=False):
     meanx = mean(x, axis=axis, keepdims=keepdims)
     return mean(tf.square(x - meanx), axis=axis, keepdims=keepdims)

 def std(x, axis=None, keepdims=False):
     return tf.sqrt(var(x, axis=axis, keepdims=keepdims))

 def max(x, axis=None, keepdims=False):
     axis = None if axis is None else [axis]
     return tf.reduce_max(x, axis=axis, keep_dims=keepdims)

 def min(x, axis=None, keepdims=False):
     axis = None if axis is None else [axis]
     return tf.reduce_min(x, axis=axis, keep_dims=keepdims)

 def concatenate(arrs, axis=0):
     return tf.concat(axis=axis, values=arrs)

 def argmax(x, axis=None):
     return tf.argmax(x, axis=axis)

 def switch(condition, then_expression, else_expression):
     """Switches between two operations depending on a scalar value (int or bool).
     Note that both `then_expression` and `else_expression`
@@ -72,35 +62,29 @@ def switch(condition, then_expression, else_expression):
 # Extras
 # ================================================================

 def l2loss(params):
     if len(params) == 0:
         return tf.constant(0.0)
     else:
         return tf.add_n([sum(tf.square(p)) for p in params])

 def lrelu(x, leak=0.2):
     f1 = 0.5 * (1 + leak)
     f2 = 0.5 * (1 - leak)
     return f1 * x + f2 * abs(x)

 def categorical_sample_logits(X):
     # https://github.com/tensorflow/tensorflow/issues/456
     U = tf.random_uniform(tf.shape(X))
     return argmax(X - tf.log(-tf.log(U)), axis=1)
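
categorical_sample_logits relies on the Gumbel-max trick: perturbing logits with -log(-log(U)) noise, U uniform on (0, 1), and taking the argmax draws exact samples from softmax(logits). A numpy check of that claim, independent of TensorFlow:

import numpy as np

rng = np.random.RandomState(0)
logits = np.array([1.0, 2.0, 3.0])
u = rng.uniform(size=(100000, 3))
samples = np.argmax(logits - np.log(-np.log(u)), axis=1)
empirical = np.bincount(samples) / len(samples)
softmax = np.exp(logits) / np.exp(logits).sum()
assert np.allclose(empirical, softmax, atol=0.01)  # matches the softmax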
 # ================================================================
 # Inputs
 # ================================================================

 def is_placeholder(x):
     return type(x) is tf.Tensor and len(x.op.inputs) == 0

 class TfInput(object):
     def __init__(self, name="(unnamed)"):
         """Generalized Tensorflow placeholder. The main differences are:
@@ -119,7 +103,6 @@ class TfInput(object):
         """Given data input it to the placeholder(s)."""
         raise NotImplemented()

 class PlacholderTfInput(TfInput):
     def __init__(self, placeholder):
         """Wrapper for regular tensorflow placeholder."""
@@ -132,7 +115,6 @@ class PlacholderTfInput(TfInput):
     def make_feed_dict(self, data):
         return {self._placeholder: data}

 class BatchInput(PlacholderTfInput):
     def __init__(self, shape, dtype=tf.float32, name=None):
         """Creates a placeholder for a batch of tensors of a given shape and dtype
@@ -148,7 +130,6 @@ class BatchInput(PlacholderTfInput):
         """
         super().__init__(tf.placeholder(dtype, [None] + list(shape), name=name))

 class Uint8Input(PlacholderTfInput):
     def __init__(self, shape, name=None):
         """Takes input in uint8 format which is cast to float32 and divided by 255
@@ -171,7 +152,6 @@ class Uint8Input(PlacholderTfInput):
     def get(self):
         return self._output

 def ensure_tf_input(thing):
     """Takes either a tf.placeholder or a TfInput and outputs an equivalent TfInput"""
     if isinstance(thing, TfInput):
@@ -185,7 +165,6 @@ def ensure_tf_input(thing):
 # Mathematical utils
 # ================================================================

 def huber_loss(x, delta=1.0):
     """Reference: https://en.wikipedia.org/wiki/Huber_loss"""
     return tf.where(
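
For reference, the Huber loss is quadratic within delta of zero and linear beyond it, which keeps gradients bounded for large TD errors. The same piecewise rule in plain numpy:

import numpy as np

def huber(x, delta=1.0):
    return np.where(np.abs(x) <= delta,
                    0.5 * np.square(x),
                    delta * (np.abs(x) - 0.5 * delta))

assert huber(np.array(0.5)) == 0.125  # quadratic branch: 0.5 * 0.5**2
assert huber(np.array(3.0)) == 2.5    # linear branch: 1.0 * (3.0 - 0.5)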
@@ -198,7 +177,6 @@ def huber_loss(x, delta=1.0):
 # Optimizer utils
 # ================================================================

 def minimize_and_clip(optimizer, objective, var_list, clip_val=10):
     """Minimize `objective` using `optimizer` w.r.t. variables in
     `var_list`, while ensuring the norm of the gradients for each
@@ -210,7 +188,6 @@ def minimize_and_clip(optimizer, objective, var_list, clip_val=10):
         gradients[i] = (tf.clip_by_norm(grad, clip_val), var)
     return optimizer.apply_gradients(gradients)

 # ================================================================
 # Global session
 # ================================================================
@@ -219,7 +196,6 @@ def get_session():
     """Returns the most recently made Tensorflow session"""
     return tf.get_default_session()

 def make_session(num_cpu):
     """Returns a session that will use <num_cpu> CPUs only"""
     tf_config = tf.ConfigProto(
@@ -227,31 +203,25 @@ def make_session(num_cpu):
         intra_op_parallelism_threads=num_cpu)
     return tf.Session(config=tf_config)

 def single_threaded_session():
     """Returns a session which will only use a single CPU"""
     return make_session(1)

 ALREADY_INITIALIZED = set()

 def initialize():
     """Initialize all the uninitialized variables in the global scope."""
     new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED
     get_session().run(tf.variables_initializer(new_variables))
     ALREADY_INITIALIZED.update(new_variables)

 def eval(expr, feed_dict=None):
     if feed_dict is None:
         feed_dict = {}
     return get_session().run(expr, feed_dict=feed_dict)

 VALUE_SETTERS = collections.OrderedDict()

 def set_value(v, val):
     global VALUE_SETTERS
     if v in VALUE_SETTERS:
@@ -262,17 +232,14 @@ def set_value(v, val):
         VALUE_SETTERS[v] = (set_op, set_endpoint)
     get_session().run(set_op, feed_dict={set_endpoint: val})

 # ================================================================
 # Saving variables
 # ================================================================

 def load_state(fname):
     saver = tf.train.Saver()
     saver.restore(get_session(), fname)

 def save_state(fname):
     os.makedirs(os.path.dirname(fname), exist_ok=True)
     saver = tf.train.Saver()
@@ -282,7 +249,6 @@ def save_state(fname):
 # Model components
 # ================================================================

 def normc_initializer(std=1.0):
     def _initializer(shape, dtype=None, partition_info=None):  # pylint: disable=W0613
         out = np.random.randn(*shape).astype(np.float32)
@@ -290,7 +256,6 @@ def normc_initializer(std=1.0):
         return tf.constant(out)
     return _initializer

 def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME", dtype=tf.float32, collections=None,
            summary_tag=None):
     with tf.variable_scope(name):
@@ -320,7 +285,6 @@ def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME",

         return tf.nn.conv2d(x, w, stride_shape, pad) + b

 def dense(x, size, name, weight_init=None, bias=True):
     w = tf.get_variable(name + "/w", [x.get_shape()[1], size], initializer=weight_init)
     ret = tf.matmul(x, w)
@@ -330,7 +294,6 @@ def dense(x, size, name, weight_init=None, bias=True):
     else:
         return ret

 def wndense(x, size, name, init_scale=1.0):
     v = tf.get_variable(name + "/V", [int(x.get_shape()[1]), size],
                         initializer=tf.random_normal_initializer(0, 0.05))
@@ -342,11 +305,9 @@ def wndense(x, size, name, init_scale=1.0):
     scaler = g / tf.sqrt(sum(tf.square(v), axis=0, keepdims=True))
     return tf.reshape(scaler, [1, size]) * x + tf.reshape(b, [1, size])

 def densenobias(x, size, name, weight_init=None):
     return dense(x, size, name, weight_init=weight_init, bias=False)

 def dropout(x, pkeep, phase=None, mask=None):
     mask = tf.floor(pkeep + tf.random_uniform(tf.shape(x))) if mask is None else mask
     if phase is None:
@@ -354,13 +315,10 @@ def dropout(x, pkeep, phase=None, mask=None):
     else:
         return switch(phase, mask * x, pkeep * x)

 # ================================================================
 # Theano-like Function
 # ================================================================

 def function(inputs, outputs, updates=None, givens=None):
     """Just like Theano function. Take a bunch of tensorflow placeholders and expressions
     computed based on those placeholders and produces f(inputs) -> outputs. Function f takes
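
A hedged usage sketch of the helper, assuming the definitions above: placeholders become positional arguments and the output expressions are evaluated in the current default session.

with single_threaded_session():
    x = tf.placeholder(tf.float32, [None])
    y = tf.placeholder(tf.float32, [None])
    add = function([x, y], x + y)
    initialize()
    print(add(np.array([1.0, 2.0]), np.array([3.0, 4.0])))  # [4. 6.]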
@@ -401,7 +359,6 @@ def function(inputs, outputs, updates=None, givens=None):
         f = _Function(inputs, [outputs], updates, givens=givens)
         return lambda *args, **kwargs: f(*args, **kwargs)[0]

 class _Function(object):
     def __init__(self, inputs, outputs, updates, givens, check_nan=False):
         for inpt in inputs:
@@ -448,7 +405,6 @@ class _Function(object):
                 raise RuntimeError("Nan detected")
         return results

 def mem_friendly_function(nondata_inputs, data_inputs, outputs, batch_size):
     if isinstance(outputs, list):
         return _MemFriendlyFunction(nondata_inputs, data_inputs, outputs, batch_size)
@@ -456,7 +412,6 @@ def mem_friendly_function(nondata_inputs, data_inputs, outputs, batch_size):
         f = _MemFriendlyFunction(nondata_inputs, data_inputs, [outputs], batch_size)
         return lambda *inputs: f(*inputs)[0]

 class _MemFriendlyFunction(object):
     def __init__(self, nondata_inputs, data_inputs, outputs, batch_size):
         self.nondata_inputs = nondata_inputs
@@ -490,7 +445,6 @@ class _MemFriendlyFunction(object):
 # Modules
 # ================================================================

 class Module(object):
     def __init__(self, name):
         self.name = name
@@ -528,7 +482,6 @@ class Module(object):
         assert self.scope is not None, "need to call module once before getting variables"
         return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

 def module(name):
     @functools.wraps
     def wrapper(f):
@@ -542,14 +495,11 @@ def module(name):
 # Graph traversal
 # ================================================================

 VARIABLES = {}

 def get_parents(node):
     return node.op.inputs

 def topsorted(outputs):
     """
     Topological sort via non-recursive depth-first search
@@ -586,7 +536,6 @@ def topsorted(outputs):
             stack.append((j, 0))
     return out

 # ================================================================
 # Flat vectors
 # ================================================================
@@ -597,15 +546,12 @@ def var_shape(x):
         "shape function assumes that shape is fully known"
     return out

 def numel(x):
     return intprod(var_shape(x))

 def intprod(x):
     return int(np.prod(x))

 def flatgrad(loss, var_list, clip_norm=None):
     grads = tf.gradients(loss, var_list)
     if clip_norm is not None:
@@ -615,7 +561,6 @@ def flatgrad(loss, var_list, clip_norm=None):
         for (v, grad) in zip(var_list, grads)
     ])

 class SetFromFlat(object):
     def __init__(self, var_list, dtype=tf.float32):
         assigns = []
@@ -634,7 +579,6 @@ class SetFromFlat(object):
     def __call__(self, theta):
         get_session().run(self.op, feed_dict={self.theta: theta})

 class GetFlat(object):
     def __init__(self, var_list):
         self.op = tf.concat(axis=0, values=[tf.reshape(v, [numel(v)]) for v in var_list])
@@ -646,7 +590,6 @@ class GetFlat(object):
 # Misc
 # ================================================================

 def fancy_slice_2d(X, inds0, inds1):
     """
     like numpy X[inds0, inds1]
@@ -659,12 +602,10 @@ def fancy_slice_2d(X, inds0, inds1):
     Xflat = tf.reshape(X, [-1])
     return tf.gather(Xflat, inds0 * ncols + inds1)

 # ================================================================
 # Scopes
 # ================================================================

 def scope_vars(scope, trainable_only=False):
     """
     Get variables inside a scope
@@ -687,17 +628,14 @@ def scope_vars(scope, trainable_only=False):
         scope=scope if isinstance(scope, str) else scope.name
     )

 def scope_name():
     """Returns the name of the current scope as a string, e.g. deepq/q_func"""
     return tf.get_variable_scope().name

 def absolute_scope_name(relative_scope_name):
     """Appends parent scope name to `relative_scope_name`"""
     return scope_name() + "/" + relative_scope_name

 def lengths_to_mask(lengths_b, max_length):
     """
     Turns a vector of lengths into a boolean mask
@@ -715,7 +653,6 @@ def lengths_to_mask(lengths_b, max_length):
     mask_bt = tf.expand_dims(tf.range(max_length), 0) < tf.expand_dims(lengths_b, 1)
     return mask_bt

 def in_session(f):
     @functools.wraps(f)
     def newfunc(*args, **kwargs):
@@ -723,10 +660,8 @@ def in_session(f):
             f(*args, **kwargs)
     return newfunc

 _PLACEHOLDER_CACHE = {}  # name -> (placeholder, dtype, shape)

 def get_placeholder(name, dtype, shape):
     if name in _PLACEHOLDER_CACHE:
         out, dtype1, shape1 = _PLACEHOLDER_CACHE[name]
@@ -737,15 +672,12 @@ def get_placeholder(name, dtype, shape):
         _PLACEHOLDER_CACHE[name] = (out, dtype, shape)
         return out

 def get_placeholder_cached(name):
     return _PLACEHOLDER_CACHE[name][0]

 def flattenallbut0(x):
     return tf.reshape(x, [-1, intprod(x.get_shape().as_list()[1:])])

 def reset():
     global _PLACEHOLDER_CACHE
     global VARIABLES
@@ -2,7 +2,9 @@ import numpy as np
 from multiprocessing import Process, Pipe
 from baselines.common.vec_env import VecEnv

-def worker(remote, env_fn_wrapper):
+
+def worker(remote, parent_remote, env_fn_wrapper):
+    parent_remote.close()
     env = env_fn_wrapper.x()
     while True:
         cmd, data = remote.recv()
@@ -14,6 +16,9 @@ def worker(remote, env_fn_wrapper):
         elif cmd == 'reset':
             ob = env.reset()
             remote.send(ob)
+        elif cmd == 'reset_task':
+            ob = env.reset_task()
+            remote.send(ob)
         elif cmd == 'close':
             remote.close()
             break
@@ -22,6 +27,7 @@ def worker(remote, env_fn_wrapper):
         else:
             raise NotImplementedError

 class CloudpickleWrapper(object):
     """
     Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle)
@@ -35,17 +41,22 @@ class CloudpickleWrapper(object):
         import pickle
         self.x = pickle.loads(ob)

 class SubprocVecEnv(VecEnv):
     def __init__(self, env_fns):
         """
         envs: list of gym environments to run in subprocesses
         """
+        self.closed = False
         nenvs = len(env_fns)
         self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)])
-        self.ps = [Process(target=worker, args=(work_remote, CloudpickleWrapper(env_fn)))
-                   for (work_remote, env_fn) in zip(self.work_remotes, env_fns)]
+        self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn)))
+                   for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)]
         for p in self.ps:
+            p.daemon = True  # if the main process crashes, we should not cause things to hang
             p.start()
+        for remote in self.work_remotes:
+            remote.close()

         self.remotes[0].send(('get_spaces', None))
         self.action_space, self.observation_space = self.remotes[0].recv()
@@ -63,11 +74,20 @@ class SubprocVecEnv(VecEnv):
             remote.send(('reset', None))
         return np.stack([remote.recv() for remote in self.remotes])

+    def reset_task(self):
+        for remote in self.remotes:
+            remote.send(('reset_task', None))
+        return np.stack([remote.recv() for remote in self.remotes])
+
     def close(self):
+        if self.closed:
+            return
         for remote in self.remotes:
             remote.send(('close', None))
         for p in self.ps:
             p.join()
+        self.closed = True

     @property
     def num_envs(self):
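
The worker protocol is a plain command loop over pipes ('step', 'reset', 'reset_task', 'get_spaces', 'close'), so the parent can drive n emulators in lockstep. A hedged usage sketch with a hypothetical env id:

import gym
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

def make_env(seed):
    def _thunk():
        env = gym.make('CartPole-v0')
        env.seed(seed)
        return env
    return _thunk

venv = SubprocVecEnv([make_env(i) for i in range(4)])
obs = venv.reset()  # shape: (4,) + observation_space.shape
venv.close()
venv.close()        # safe: the new `closed` flag makes close() idempotent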
@@ -19,11 +19,12 @@ from mpi4py import MPI
 def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
     # Configure things.
     rank = MPI.COMM_WORLD.Get_rank()
-    if rank != 0: logger.set_level(logger.DISABLED)
+    if rank != 0:
+        logger.set_level(logger.DISABLED)

     # Create envs.
     env = gym.make(env_id)
-    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), "%i.monitor.json" % rank))
+    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
     gym.logger.setLevel(logging.WARN)

     if evaluation and rank == 0:
@@ -81,7 +82,7 @@ def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):

 def parse_args():
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

     parser.add_argument('--env-id', type=str, default='HalfCheetah-v1')
     boolean_flag(parser, 'render-eval', default=False)
     boolean_flag(parser, 'layer-norm', default=True)
@@ -103,11 +104,21 @@ def parse_args():
     parser.add_argument('--nb-eval-steps', type=int, default=100)  # per epoch cycle and MPI worker
     parser.add_argument('--nb-rollout-steps', type=int, default=100)  # per epoch cycle and MPI worker
     parser.add_argument('--noise-type', type=str, default='adaptive-param_0.2')  # choices are adaptive-param_xx, ou_xx, normal_xx, none
+    parser.add_argument('--num-timesteps', type=int, default=None)
     boolean_flag(parser, 'evaluation', default=False)
-    return vars(parser.parse_args())
+    args = parser.parse_args()
+    # we don't directly specify timesteps for this script, so make sure that if we do specify them
+    # they agree with the other parameters
+    if args.num_timesteps is not None:
+        assert(args.num_timesteps == args.nb_epochs * args.nb_epoch_cycles * args.nb_rollout_steps)
+    dict_args = vars(args)
+    del dict_args['num_timesteps']
+    return dict_args


 if __name__ == '__main__':
     args = parse_args()
+    if MPI.COMM_WORLD.Get_rank() == 0:
+        logger.configure()
     # Run actual script.
     run(**args)
@@ -1,6 +1,3 @@
-import time
-
-import gym
 import numpy as np
 import tensorflow as tf
 from mpi4py import MPI
@@ -1,5 +1,8 @@
 from baselines.deepq import models  # noqa
 from baselines.deepq.build_graph import build_act, build_train  # noqa
 from baselines.deepq.simple import learn, load  # noqa
 from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer  # noqa
+
+def wrap_atari_dqn(env):
+    from baselines.common.atari_wrappers import wrap_deepmind
+    return wrap_deepmind(env, frame_stack=True, scale=True)
@@ -10,8 +10,8 @@ import baselines.common.tf_util as U
 from baselines import deepq
 from baselines.common.misc_util import (
     boolean_flag,
-    SimpleMonitor,
 )
+from baselines import bench
 from baselines.common.atari_wrappers_deprecated import wrap_dqn
 from baselines.deepq.experiments.atari.model import model, dueling_model

@@ -30,7 +30,7 @@ def parse_args():

 def make_env(game_name):
     env = gym.make(game_name + "NoFrameskip-v4")
-    env = SimpleMonitor(env)
+    env = bench.Monitor(env, None)
     env = wrap_dqn(env)
     return env
@@ -19,11 +19,9 @@ from baselines.common.misc_util import (
     relatively_safe_pickle_dump,
     set_global_seeds,
     RunningAvg,
-    SimpleMonitor
 )
 from baselines.common.schedules import LinearSchedule, PiecewiseSchedule
-# when updating this to non-deprecated ones, it is important to
-# copy over LazyFrames
+from baselines import bench
 from baselines.common.atari_wrappers_deprecated import wrap_dqn
 from baselines.common.azure_utils import Container
 from .model import model, dueling_model
@@ -64,7 +62,7 @@ def parse_args():

 def make_env(game_name):
     env = gym.make(game_name + "NoFrameskip-v4")
-    monitored_env = SimpleMonitor(env)  # puts rewards and number of steps in info, before environment is wrapped
+    monitored_env = bench.Monitor(env, logger.get_dir())  # puts rewards and number of steps in info, before environment is wrapped
     env = wrap_dqn(monitored_env)  # applies a bunch of modifications to simplify the observation space (downsample, make b/w)
     return env, monitored_env
@@ -5,15 +5,15 @@ import os

 import baselines.common.tf_util as U

-from baselines import deepq
-from baselines.common.misc_util import get_wrapper_by_name, SimpleMonitor, boolean_flag, set_global_seeds
+from baselines import deepq, bench
+from baselines.common.misc_util import get_wrapper_by_name, boolean_flag, set_global_seeds
 from baselines.common.atari_wrappers_deprecated import wrap_dqn
 from baselines.deepq.experiments.atari.model import model, dueling_model


 def make_env(game_name):
     env = gym.make(game_name + "NoFrameskip-v4")
-    env_monitored = SimpleMonitor(env)
+    env_monitored = bench.Monitor(env, None)
     env = wrap_dqn(env_monitored)
     return env_monitored, env

@@ -47,14 +47,14 @@ def wang2015_eval(game_name, act, stochastic):
             eval_episode_steps += 1
             action = act(np.array(obs)[None], stochastic=stochastic)[0]

-            obs, reward, done, info = eval_env.step(action)
+            obs, _reward, done, info = eval_env.step(action)
             if done:
                 obs = eval_env.reset()
             if len(info["rewards"]) > 0:
                 episode_rewards.append(info["rewards"][0])
                 break
             if info["steps"] > 108000:  # 5 minutes of gameplay
-                episode_rewards.append(env_monitored._current_reward)
+                episode_rewards.append(sum(env_monitored.rewards))
                 break
         print("Num steps in episode {} was {} yielding {} reward".format(
             num_noops, eval_episode_steps, episode_rewards[-1]), flush=True)
@@ -66,7 +66,7 @@ def wang2015_eval(game_name, act, stochastic):
 def main():
     set_global_seeds(1)
     args = parse_args()
-    with U.make_session(4) as sess:  # noqa
+    with U.make_session(4):  # noqa
         _, env = make_env(args.env)
         act = deepq.build_act(
             make_obs_ph=lambda name: U.Uint8Input(env.observation_space.shape, name=name),
@@ -1,12 +1,10 @@
 import gym

 from baselines import deepq
-from baselines.common.atari_wrappers_deprecated import wrap_dqn, ScaledFloatFrame


 def main():
     env = gym.make("PongNoFrameskip-v4")
-    env = ScaledFloatFrame(wrap_dqn(env))
+    env = deepq.wrap_atari_dqn(env)
     act = deepq.load("pong_model.pkl")

     while True:

47 baselines/deepq/experiments/run_atari.py (new file)
@@ -0,0 +1,47 @@
+import gym
+
+from baselines import deepq
+from baselines.common import set_global_seeds
+from baselines import bench
+import argparse
+from baselines import logger
+from baselines.common.atari_wrappers import make_atari
+
+
+def main():
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
+    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
+    parser.add_argument('--prioritized', type=int, default=1)
+    parser.add_argument('--dueling', type=int, default=1)
+    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
+    args = parser.parse_args()
+    logger.configure()
+    set_global_seeds(args.seed)
+    env = make_atari(args.env)
+    env = bench.Monitor(env, logger.get_dir())
+    env = deepq.wrap_atari_dqn(env)
+    model = deepq.models.cnn_to_mlp(
+        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
+        hiddens=[256],
+        dueling=bool(args.dueling),
+    )
+    act = deepq.learn(
+        env,
+        q_func=model,
+        lr=1e-4,
+        max_timesteps=args.num_timesteps,
+        buffer_size=10000,
+        exploration_fraction=0.1,
+        exploration_final_eps=0.01,
+        train_freq=4,
+        learning_starts=10000,
+        target_network_update_freq=1000,
+        gamma=0.99,
+        prioritized_replay=bool(args.prioritized)
+    )
+    # act.save("pong_model.pkl") XXX
+    env.close()
+
+
+if __name__ == '__main__':
+    main()
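
The new script replaces the hard-coded train_pong.py removed below; hyperparameters come from flags instead of source edits. Invoking it with the defaults defined above:

python -m baselines.deepq.experiments.run_atari --env BreakoutNoFrameskip-v4 --prioritized 1 --dueling 1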
@@ -1,34 +0,0 @@
-import gym
-
-from baselines import deepq
-from baselines.common.atari_wrappers_deprecated import wrap_dqn, ScaledFloatFrame
-
-
-def main():
-    env = gym.make("PongNoFrameskip-v4")
-    env = ScaledFloatFrame(wrap_dqn(env))
-    model = deepq.models.cnn_to_mlp(
-        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
-        hiddens=[256],
-        dueling=True
-    )
-    act = deepq.learn(
-        env,
-        q_func=model,
-        lr=1e-4,
-        max_timesteps=2000000,
-        buffer_size=10000,
-        exploration_fraction=0.1,
-        exploration_final_eps=0.01,
-        train_freq=4,
-        learning_starts=10000,
-        target_network_update_freq=1000,
-        gamma=0.99,
-        prioritized_replay=True
-    )
-    act.save("pong_model.pkl")
-    env.close()
-
-
-if __name__ == '__main__':
-    main()
@@ -1,12 +1,13 @@
-import numpy as np
 import os
-import dill
 import tempfile

 import tensorflow as tf
 import zipfile
+import cloudpickle
+import numpy as np
+
+import gym
 import baselines.common.tf_util as U

 from baselines import logger
 from baselines.common.schedules import LinearSchedule
 from baselines import deepq
@@ -19,11 +20,11 @@ class ActWrapper(object):
         self._act_params = act_params

     @staticmethod
-    def load(path, num_cpu=16):
+    def load(path):
         with open(path, "rb") as f:
-            model_data, act_params = dill.load(f)
+            model_data, act_params = cloudpickle.load(f)
         act = deepq.build_act(**act_params)
-        sess = U.make_session(num_cpu=num_cpu)
+        sess = tf.Session()
         sess.__enter__()
         with tempfile.TemporaryDirectory() as td:
             arc_path = os.path.join(td, "packed.zip")
@@ -38,8 +39,11 @@ class ActWrapper(object):
     def __call__(self, *args, **kwargs):
         return self._act(*args, **kwargs)

-    def save(self, path):
+    def save(self, path=None):
         """Save model to a pickle located at `path`"""
+        if path is None:
+            path = os.path.join(logger.get_dir(), "model.pkl")
+
         with tempfile.TemporaryDirectory() as td:
             U.save_state(os.path.join(td, "model"))
             arc_name = os.path.join(td, "packed.zip")
@@ -52,18 +56,16 @@ class ActWrapper(object):
             with open(arc_name, "rb") as f:
                 model_data = f.read()
         with open(path, "wb") as f:
-            dill.dump((model_data, self._act_params), f)
+            cloudpickle.dump((model_data, self._act_params), f)


-def load(path, num_cpu=16):
+def load(path):
     """Load act function that was returned by learn function.

     Parameters
     ----------
     path: str
         path to the act function pickle
-    num_cpu: int
-        number of cpus to use for executing the policy

     Returns
     -------
@@ -71,7 +73,7 @@ def load(path, num_cpu=16):
         function that takes a batch of observations
         and returns actions.
     """
-    return ActWrapper.load(path, num_cpu=num_cpu)
+    return ActWrapper.load(path)


 def learn(env,
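
With dill swapped for cloudpickle and sessions created directly, a save/load round trip looks roughly like this (a hypothetical sketch; env, model, and the path are placeholders):

act = deepq.learn(env, q_func=model, max_timesteps=100000)
act.save()                           # now defaults to <logger dir>/model.pkl
act.save("/tmp/model.pkl")           # or an explicit path
act2 = deepq.load("/tmp/model.pkl")  # note: no num_cpu argument anymore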
@@ -83,7 +85,7 @@ def learn(env,
           exploration_final_eps=0.02,
           train_freq=1,
           batch_size=32,
-          print_freq=1,
+          print_freq=100,
           checkpoint_freq=10000,
           learning_starts=1000,
           gamma=1.0,
@@ -93,7 +95,6 @@ def learn(env,
           prioritized_replay_beta0=0.4,
           prioritized_replay_beta_iters=None,
           prioritized_replay_eps=1e-6,
-          num_cpu=16,
           param_noise=False,
           callback=None):
     """Train a deepq model.
@@ -151,8 +152,6 @@ def learn(env,
         to 1.0. If set to None equals to max_timesteps.
     prioritized_replay_eps: float
         epsilon to add to the TD errors when updating priorities.
-    num_cpu: int
-        number of cpus to use for training
     callback: (locals, globals) -> None
         function called at every step with the state of the algorithm.
         If callback returns true training stops.
@@ -165,11 +164,14 @@ def learn(env,
     """
     # Create all the functions necessary to train the model

-    sess = U.make_session(num_cpu=num_cpu)
+    sess = tf.Session()
     sess.__enter__()

+    # capture the shape outside the closure so that the env object is not serialized
+    # by cloudpickle when serializing make_obs_ph
+    observation_space_shape = env.observation_space.shape
     def make_obs_ph(name):
-        return U.BatchInput(env.observation_space.shape, name=name)
+        return U.BatchInput(observation_space_shape, name=name)

     act, train, update_target, debug = deepq.build_train(
         make_obs_ph=make_obs_ph,
@@ -180,12 +182,15 @@ def learn(env,
|
|||||||
grad_norm_clipping=10,
|
grad_norm_clipping=10,
|
||||||
param_noise=param_noise
|
param_noise=param_noise
|
||||||
)
|
)
|
||||||
|
|
||||||
act_params = {
|
act_params = {
|
||||||
'make_obs_ph': make_obs_ph,
|
'make_obs_ph': make_obs_ph,
|
||||||
'q_func': q_func,
|
'q_func': q_func,
|
||||||
'num_actions': env.action_space.n,
|
'num_actions': env.action_space.n,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
act = ActWrapper(act, act_params)
|
||||||
|
|
||||||
# Create the replay buffer
|
# Create the replay buffer
|
||||||
if prioritized_replay:
|
if prioritized_replay:
|
||||||
replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
|
replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
|
||||||
@@ -233,8 +238,13 @@ def learn(env,
|
|||||||
kwargs['update_param_noise_threshold'] = update_param_noise_threshold
|
kwargs['update_param_noise_threshold'] = update_param_noise_threshold
|
||||||
kwargs['update_param_noise_scale'] = True
|
kwargs['update_param_noise_scale'] = True
|
||||||
action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
|
action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
|
||||||
|
if isinstance(env.action_space, gym.spaces.MultiBinary):
|
||||||
|
env_action = np.zeros(env.action_space.n)
|
||||||
|
env_action[action] = 1
|
||||||
|
else:
|
||||||
|
env_action = action
|
||||||
reset = False
|
reset = False
|
||||||
new_obs, rew, done, _ = env.step(action)
|
new_obs, rew, done, _ = env.step(env_action)
|
||||||
# Store transition in the replay buffer.
|
# Store transition in the replay buffer.
|
||||||
replay_buffer.add(obs, action, rew, new_obs, float(done))
|
replay_buffer.add(obs, action, rew, new_obs, float(done))
|
||||||
obs = new_obs
|
obs = new_obs
|
||||||
@@ -285,4 +295,4 @@ def learn(env,
|
|||||||
logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
|
logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
|
||||||
U.load_state(model_file)
|
U.load_state(model_file)
|
||||||
|
|
||||||
return ActWrapper(act, act_params)
|
return act
|
||||||
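Taken together, these changes simplify checkpointing: save() defaults to model.pkl under the logger directory, load() no longer needs num_cpu, and learn() returns the ActWrapper directly. A minimal round trip after this commit might look like the sketch below (the CartPole environment, network size, and paths are illustrative, not from the commit):

    import gym
    from baselines import deepq

    env = gym.make("CartPole-v0")
    act = deepq.learn(env, q_func=deepq.models.mlp([64]), max_timesteps=10000)
    act.save("/tmp/cartpole_model.pkl")          # or act.save() to write model.pkl under logger.get_dir()
    act = deepq.load("/tmp/cartpole_model.pkl")  # rebuilds the act function in its own tf.Session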
baselines/logger.py

@@ -6,8 +6,10 @@ import json
 import time
 import datetime
 import tempfile
+from mpi4py import MPI

-LOG_OUTPUT_FORMATS = ['stdout', 'log', 'json']
+LOG_OUTPUT_FORMATS = ['stdout', 'log', 'csv']
+# Also valid: json, tensorboard

 DEBUG = 10
 INFO = 20
@@ -16,26 +18,23 @@ ERROR = 40

 DISABLED = 50

-class OutputFormat(object):
+class KVWriter(object):
     def writekvs(self, kvs):
-        """
-        Write key-value pairs
-        """
         raise NotImplementedError

-    def writeseq(self, args):
-        """
-        Write a sequence of other data (e.g. a logging message)
-        """
-        pass
+class SeqWriter(object):
+    def writeseq(self, seq):
+        raise NotImplementedError

-    def close(self):
-        return
-class HumanOutputFormat(OutputFormat):
-    def __init__(self, file):
-        self.file = file
+class HumanOutputFormat(KVWriter, SeqWriter):
+    def __init__(self, filename_or_file):
+        if isinstance(filename_or_file, str):
+            self.file = open(filename_or_file, 'at')
+            self.own_file = True
+        else:
+            assert hasattr(filename_or_file, 'read'), 'expected file or str, got %s'%filename_or_file
+            self.file = filename_or_file
+            self.own_file = False

     def writekvs(self, kvs):
         # Create strings for printing
@@ -48,8 +47,12 @@ class HumanOutputFormat(OutputFormat):
             key2str[self._truncate(key)] = self._truncate(valstr)

         # Find max widths
-        keywidth = max(map(len, key2str.keys()))
-        valwidth = max(map(len, key2str.values()))
+        if len(key2str) == 0:
+            print('WARNING: tried to write empty key-value dict')
+            return
+        else:
+            keywidth = max(map(len, key2str.keys()))
+            valwidth = max(map(len, key2str.values()))

         # Write out the data
         dashes = '-' * (keywidth + valwidth + 7)
@@ -70,15 +73,19 @@ class HumanOutputFormat(OutputFormat):
     def _truncate(self, s):
         return s[:20] + '...' if len(s) > 23 else s

-    def writeseq(self, args):
-        for arg in args:
+    def writeseq(self, seq):
+        for arg in seq:
             self.file.write(arg)
             self.file.write('\n')
         self.file.flush()

-class JSONOutputFormat(OutputFormat):
-    def __init__(self, file):
-        self.file = file
+    def close(self):
+        if self.own_file:
+            self.file.close()
+
+class JSONOutputFormat(KVWriter):
+    def __init__(self, filename):
+        self.file = open(filename, 'at')

     def writekvs(self, kvs):
         for k, v in sorted(kvs.items()):
@@ -88,7 +95,46 @@ class JSONOutputFormat(OutputFormat):
         self.file.write(json.dumps(kvs) + '\n')
         self.file.flush()

-class TensorBoardOutputFormat(OutputFormat):
+    def close(self):
+        self.file.close()
+
+class CSVOutputFormat(KVWriter):
+    def __init__(self, filename):
+        self.file = open(filename, 'a+t')
+        self.keys = []
+        self.sep = ','
+
+    def writekvs(self, kvs):
+        # Add our current row to the history
+        extra_keys = kvs.keys() - self.keys
+        if extra_keys:
+            self.keys.extend(extra_keys)
+            self.file.seek(0)
+            lines = self.file.readlines()
+            self.file.seek(0)
+            for (i, k) in enumerate(self.keys):
+                if i > 0:
+                    self.file.write(',')
+                self.file.write(k)
+            self.file.write('\n')
+            for line in lines[1:]:
+                self.file.write(line[:-1])
+                self.file.write(self.sep * len(extra_keys))
+                self.file.write('\n')
+        for (i, k) in enumerate(self.keys):
+            if i > 0:
+                self.file.write(',')
+            v = kvs.get(k)
+            if v:
+                self.file.write(str(v))
+        self.file.write('\n')
+        self.file.flush()
+
+    def close(self):
+        self.file.close()
+
+
+class TensorBoardOutputFormat(KVWriter):
     """
     Dumps key/value pairs into TensorBoard's numeric format.
     """
@@ -99,7 +145,7 @@ class TensorBoardOutputFormat(OutputFormat):
         prefix = 'events'
         path = osp.join(osp.abspath(dir), prefix)
         import tensorflow as tf
         from tensorflow.python import pywrap_tensorflow
         from tensorflow.core.util import event_pb2
         from tensorflow.python.util import compat
         self.tf = tf
@@ -123,18 +169,22 @@ class TensorBoardOutputFormat(OutputFormat):
             self.writer.Close()
             self.writer = None


 def make_output_format(format, ev_dir):
     os.makedirs(ev_dir, exist_ok=True)
+    rank = MPI.COMM_WORLD.Get_rank()
     if format == 'stdout':
         return HumanOutputFormat(sys.stdout)
     elif format == 'log':
-        log_file = open(osp.join(ev_dir, 'log.txt'), 'wt')
-        return HumanOutputFormat(log_file)
+        suffix = "" if rank==0 else ("-mpi%03i"%rank)
+        return HumanOutputFormat(osp.join(ev_dir, 'log%s.txt' % suffix))
     elif format == 'json':
-        json_file = open(osp.join(ev_dir, 'progress.json'), 'wt')
-        return JSONOutputFormat(json_file)
+        assert rank==0
+        return JSONOutputFormat(osp.join(ev_dir, 'progress.json'))
+    elif format == 'csv':
+        assert rank==0
+        return CSVOutputFormat(osp.join(ev_dir, 'progress.csv'))
     elif format == 'tensorboard':
+        assert rank==0
         return TensorBoardOutputFormat(osp.join(ev_dir, 'tb'))
     else:
         raise ValueError('Unknown format specified: %s' % (format,))
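The CSVOutputFormat above deals with keys that first appear mid-run by rewriting the file: it re-reads all rows, emits a new header, and pads old rows with empty columns. A hedged sketch of that behavior, driving the writer directly rather than through a Logger (the path is a placeholder):

    from baselines import logger

    w = logger.CSVOutputFormat("/tmp/progress.csv")
    w.writekvs({"reward": 1.0})               # file: "reward" header plus one row
    w.writekvs({"reward": 2.0, "loss": 0.5})  # header becomes "reward,loss"; the old row gains an empty cell
    w.close()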
@@ -167,7 +217,7 @@ def dumpkvs():
     Logger.CURRENT.dumpkvs()

 def getkvs():
     return Logger.CURRENT.name2val


 def log(*args, level=INFO):
@@ -176,19 +226,15 @@ def log(*args, level=INFO):
     """
     Logger.CURRENT.log(*args, level=level)

-
 def debug(*args):
     log(*args, level=DEBUG)

-
 def info(*args):
     log(*args, level=INFO)

-
 def warn(*args):
     log(*args, level=WARN)

-
 def error(*args):
     log(*args, level=ERROR)

@@ -232,7 +278,8 @@ class Logger(object):
     def dumpkvs(self):
         if self.level == DISABLED: return
         for fmt in self.output_formats:
-            fmt.writekvs(self.name2val)
+            if isinstance(fmt, KVWriter):
+                fmt.writekvs(self.name2val)
         self.name2val.clear()

     def log(self, *args, level=INFO):
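Because dumpkvs() now dispatches only to outputs that are KVWriter instances (and, below, _do_log() only to SeqWriter instances), a custom sink just subclasses the right base class. A sketch under that assumption; PrintWriter is a hypothetical name:

    from baselines import logger

    class PrintWriter(logger.KVWriter):
        def writekvs(self, kvs):
            # receives the accumulated key/value dict on every dumpkvs()
            print(sorted(kvs.items()))

    logger.Logger.CURRENT = logger.Logger(dir=None, output_formats=[PrintWriter()])
    logger.logkv("a", 1)
    logger.dumpkvs()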
@@ -255,34 +302,45 @@ class Logger(object):
     # ----------------------------------------
     def _do_log(self, args):
         for fmt in self.output_formats:
-            fmt.writeseq(args)
+            if isinstance(fmt, SeqWriter):
+                fmt.writeseq(map(str, args))

 Logger.DEFAULT = Logger.CURRENT = Logger(dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

 def configure(dir=None, format_strs=None):
-    assert Logger.CURRENT is Logger.DEFAULT,\
-        "Only call logger.configure() when it's in the default state. Try calling logger.reset() first."
-    prevlogger = Logger.CURRENT
     if dir is None:
         dir = os.getenv('OPENAI_LOGDIR')
     if dir is None:
         dir = osp.join(tempfile.gettempdir(),
             datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"))
+    assert isinstance(dir, str)
+    os.makedirs(dir, exist_ok=True)

     if format_strs is None:
-        format_strs = LOG_OUTPUT_FORMATS
+        strs = os.getenv('OPENAI_LOG_FORMAT')
+        format_strs = strs.split(',') if strs else LOG_OUTPUT_FORMATS
     output_formats = [make_output_format(f, dir) for f in format_strs]

     Logger.CURRENT = Logger(dir=dir, output_formats=output_formats)
     log('Logging to %s'%dir)

-if os.getenv('OPENAI_LOGDIR'):
-    # if OPENAI_LOGDIR is set, configure the logger on import
-    # this kind of nasty (unexpected to user), but I don't know how else to inject the logger
-    # to a script that's getting run in a subprocess
-    configure(dir=os.getenv('OPENAI_LOGDIR'))

 def reset():
-    Logger.CURRENT = Logger.DEFAULT
-    log('Reset logger')
+    if Logger.CURRENT is not Logger.DEFAULT:
+        Logger.CURRENT.close()
+        Logger.CURRENT = Logger.DEFAULT
+        log('Reset logger')

+class scoped_configure(object):
+    def __init__(self, dir=None, format_strs=None):
+        self.dir = dir
+        self.format_strs = format_strs
+        self.prevlogger = None
+    def __enter__(self):
+        self.prevlogger = Logger.CURRENT
+        configure(dir=self.dir, format_strs=self.format_strs)
+    def __exit__(self, *args):
+        Logger.CURRENT.close()
+        Logger.CURRENT = self.prevlogger

 # ================================================================

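Two usability changes land here: configure() now consults the OPENAI_LOG_FORMAT environment variable when no format_strs are passed, and scoped_configure() swaps a logger in and restores the previous one on exit. Roughly (paths and keys are placeholders):

    import os
    from baselines import logger

    os.environ["OPENAI_LOG_FORMAT"] = "stdout,csv"  # comma-separated names from make_output_format
    logger.configure(dir="/tmp/experiment1")        # writes progress.csv there

    with logger.scoped_configure(dir="/tmp/eval", format_strs=["json"]):
        logger.logkv("eval_reward", 1.0)            # goes to /tmp/eval/progress.json
        logger.dumpkvs()
    # the previous logger is current again here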
@@ -294,14 +352,14 @@ def _demo():
     dir = "/tmp/testlogging"
     if os.path.exists(dir):
         shutil.rmtree(dir)
-    with session(dir=dir):
+    configure(dir=dir)
     logkv("a", 3)
     logkv("b", 2.5)
     dumpkvs()
     logkv("b", -2.5)
     logkv("a", 5.5)
     dumpkvs()
     info("^^^ should see a = 5.5")

     logkv("b", -2.5)
     dumpkvs()
@@ -310,5 +368,55 @@ def _demo():
     dumpkvs()


+# ================================================================
+# Readers
+# ================================================================
+
+def read_json(fname):
+    import pandas
+    ds = []
+    with open(fname, 'rt') as fh:
+        for line in fh:
+            ds.append(json.loads(line))
+    return pandas.DataFrame(ds)
+
+def read_csv(fname):
+    import pandas
+    return pandas.read_csv(fname, index_col=None, comment='#')
+
+def read_tb(path):
+    """
+    path : a tensorboard file OR a directory, where we will find all TB files
+           of the form events.*
+    """
+    import pandas
+    import numpy as np
+    from glob import glob
+    from collections import defaultdict
+    import tensorflow as tf
+    if osp.isdir(path):
+        fnames = glob(osp.join(path, "events.*"))
+    elif osp.basename(path).startswith("events."):
+        fnames = [path]
+    else:
+        raise NotImplementedError("Expected tensorboard file or directory containing them. Got %s"%path)
+    tag2pairs = defaultdict(list)
+    maxstep = 0
+    for fname in fnames:
+        for summary in tf.train.summary_iterator(fname):
+            if summary.step > 0:
+                for v in summary.summary.value:
+                    pair = (summary.step, v.simple_value)
+                    tag2pairs[v.tag].append(pair)
+                maxstep = max(summary.step, maxstep)
+    data = np.empty((maxstep, len(tag2pairs)))
+    data[:] = np.nan
+    tags = sorted(tag2pairs.keys())
+    for (colidx,tag) in enumerate(tags):
+        pairs = tag2pairs[tag]
+        for (step, value) in pairs:
+            data[step-1, colidx] = value
+    return pandas.DataFrame(data, columns=tags)

 if __name__ == "__main__":
     _demo()
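The new readers close the loop: whatever format training wrote, analysis can load it back as a pandas DataFrame with one column per logged key. A sketch (paths are placeholders; read_tb also accepts a single events.* file):

    from baselines import logger

    df_csv = logger.read_csv("/tmp/experiment1/progress.csv")
    df_json = logger.read_json("/tmp/eval/progress.json")
    df_tb = logger.read_tb("/tmp/experiment1/tb")  # directory containing events.* files
    print(df_csv.columns)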
baselines/ppo1/cnn_policy.py

@@ -1,4 +1,3 @@
-from baselines.common.mpi_running_mean_std import RunningMeanStd
 import baselines.common.tf_util as U
 import tensorflow as tf
 import gym
@@ -18,7 +17,7 @@ class CnnPolicy(object):
         sequence_length = None

         ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

         x = ob / 255.0
         if kind == 'small': # from A3C paper
             x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
baselines/ppo1/mlp_policy.py

@@ -18,7 +18,7 @@ class MlpPolicy(object):
         sequence_length = None

         ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

         with tf.variable_scope("obfilter"):
             self.ob_rms = RunningMeanStd(shape=ob_space.shape)

@@ -27,12 +27,12 @@ class MlpPolicy(object):
         for i in range(num_hid_layers):
             last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
         self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:,0]

         last_out = obz
         for i in range(num_hid_layers):
             last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
         if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
             mean = U.dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01))
             logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
             pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
         else:
baselines/ppo1/pposgd_simple.py

@@ -78,7 +78,7 @@ def add_vtarg_and_adv(seg, gamma, lam):
     seg["tdlamret"] = seg["adv"] + seg["vpred"]

 def learn(env, policy_func, *,
-        timesteps_per_batch, # timesteps per actor per update
+        timesteps_per_actorbatch, # timesteps per actor per update
         clip_param, entcoeff, # clipping parameter epsilon, entropy coeff
         optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers
         gamma, lam, # advantage estimation
@@ -130,7 +130,7 @@ def learn(env, policy_func, *,

     # Prepare for rollouts
     # ----------------------------------------
-    seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True)
+    seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True)

     episodes_so_far = 0
     timesteps_so_far = 0
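Since timesteps_per_actorbatch is keyword-only, every direct caller of pposgd_simple.learn() breaks loudly rather than silently; the runners below are updated accordingly. For reference, a standalone call after the rename might read as follows (a sketch mirroring run_mujoco.py below; hyperparameters are illustrative):

    import gym
    from baselines.common import tf_util as U
    from baselines.ppo1 import mlp_policy, pposgd_simple

    sess = U.single_threaded_session()
    sess.__enter__()
    env = gym.make("Hopper-v1")
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=2)
    pposgd_simple.learn(env, policy_fn,
        max_timesteps=10000,
        timesteps_per_actorbatch=2048,  # formerly timesteps_per_batch
        clip_param=0.2, entcoeff=0.0,
        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
        gamma=0.99, lam=0.95, schedule='linear')
    env.close()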
baselines/ppo1/run_atari.py

@@ -6,37 +6,34 @@ from baselines import bench
 import os.path as osp
 import gym, logging
 from baselines import logger
+from baselines.common.atari_wrappers import make_atari, wrap_deepmind

-def wrap_train(env):
-    from baselines.common.atari_wrappers import (wrap_deepmind, FrameStack)
-    env = wrap_deepmind(env, clip_rewards=True)
-    env = FrameStack(env, 4)
-    return env
-
-def train(env_id, num_frames, seed):
+def train(env_id, num_timesteps, seed):
     from baselines.ppo1 import pposgd_simple, cnn_policy
     import baselines.common.tf_util as U
     rank = MPI.COMM_WORLD.Get_rank()
     sess = U.single_threaded_session()
     sess.__enter__()
-    if rank != 0: logger.set_level(logger.DISABLED)
+    if rank == 0:
+        logger.configure()
+    else:
+        logger.configure(format_strs=[])
     workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
     set_global_seeds(workerseed)
-    env = gym.make(env_id)
+    env = make_atari(env_id)
     def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613
         return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)
     env = bench.Monitor(env, logger.get_dir() and
-        osp.join(logger.get_dir(), "%i.monitor.json" % rank))
+        osp.join(logger.get_dir(), str(rank)))
     env.seed(workerseed)
     gym.logger.setLevel(logging.WARN)

-    env = wrap_train(env)
-    num_timesteps = int(num_frames / 4 * 1.1)
+    env = wrap_deepmind(env)
     env.seed(workerseed)

     pposgd_simple.learn(env, policy_fn,
-        max_timesteps=num_timesteps,
-        timesteps_per_batch=256,
+        max_timesteps=int(num_timesteps * 1.1),
+        timesteps_per_actorbatch=256,
         clip_param=0.2, entcoeff=0.01,
         optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
         gamma=0.99, lam=0.95,
@@ -49,8 +46,9 @@ def main():
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument('--env', help='environment ID', default='PongNoFrameskip-v4')
     parser.add_argument('--seed', help='RNG seed', type=int, default=0)
+    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
     args = parser.parse_args()
-    train(args.env, num_frames=40e6, seed=args.seed)
+    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)

 if __name__ == '__main__':
     main()
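The runner above settles on one env-construction idiom: make_atari() builds the raw NoFrameskip env, bench.Monitor records episode stats, and wrap_deepmind() applies the DeepMind-style preprocessing that this commit reimplements with OpenCV. Isolated, the idiom is roughly:

    from baselines import bench, logger
    from baselines.common.atari_wrappers import make_atari, wrap_deepmind

    env = make_atari("PongNoFrameskip-v4")
    env.seed(0)
    env = bench.Monitor(env, logger.get_dir())  # a None directory disables the stats file
    env = wrap_deepmind(env)                    # grayscale + resize, now via cv2
    obs = env.reset()
    print(obs.shape)                            # preprocessed frame, e.g. (84, 84, 1)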
baselines/ppo1/run_mujoco.py

@@ -1,10 +1,8 @@
 #!/usr/bin/env python
 from baselines.common import set_global_seeds, tf_util as U
 from baselines import bench
-import os.path as osp
 import gym, logging
 from baselines import logger
-import sys

 def train(env_id, num_timesteps, seed):
     from baselines.ppo1 import mlp_policy, pposgd_simple
@@ -14,13 +12,12 @@ def train(env_id, num_timesteps, seed):
     def policy_fn(name, ob_space, ac_space):
         return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
             hid_size=64, num_hid_layers=2)
-    env = bench.Monitor(env, logger.get_dir() and
-        osp.join(logger.get_dir(), "monitor.json"))
+    env = bench.Monitor(env, logger.get_dir())
     env.seed(seed)
     gym.logger.setLevel(logging.WARN)
     pposgd_simple.learn(env, policy_fn,
             max_timesteps=num_timesteps,
-            timesteps_per_batch=2048,
+            timesteps_per_actorbatch=2048,
             clip_param=0.2, entcoeff=0.0,
             optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
             gamma=0.99, lam=0.95, schedule='linear',
@@ -32,8 +29,10 @@ def main():
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument('--env', help='environment ID', default='Hopper-v1')
     parser.add_argument('--seed', help='RNG seed', type=int, default=0)
+    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
     args = parser.parse_args()
-    train(args.env, num_timesteps=1e6, seed=args.seed)
+    logger.configure()
+    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)


 if __name__ == '__main__':
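With main() now calling logger.configure() and taking --num-timesteps from the command line, the MuJoCo runner can also be driven programmatically; a sketch equivalent to `python -m baselines.ppo1.run_mujoco --env Hopper-v1 --num-timesteps 50000 --seed 0` (assuming the module is executed with -m, as these scripts usually are):

    from baselines import logger
    from baselines.ppo1 import run_mujoco

    logger.configure()  # main() now does this before training
    run_mujoco.train("Hopper-v1", num_timesteps=50000, seed=0)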
baselines/trpo_mpi/nosharing_cnn_policy.py

@@ -1,4 +1,3 @@
-from baselines.common.mpi_running_mean_std import RunningMeanStd
 import baselines.common.tf_util as U
 import tensorflow as tf
 import gym
@@ -18,7 +17,7 @@ class CnnPolicy(object):
         sequence_length = None

         ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

         obscaled = ob / 255.0

         with tf.variable_scope("pol"):
@@ -42,7 +41,7 @@ class CnnPolicy(object):
         self.state_out = []

         stochastic = tf.placeholder(dtype=tf.bool, shape=())
-        ac = self.pd.sample() # XXX
+        ac = self.pd.sample()
         self._act = U.function([stochastic, ob], [ac, self.vpred])

     def act(self, stochastic, ob):
baselines/trpo_mpi/run_atari.py

@@ -1,45 +1,38 @@
 #!/usr/bin/env python
 from mpi4py import MPI
 from baselines.common import set_global_seeds
 import os.path as osp
 import gym, logging
 from baselines import logger
 from baselines import bench
-import sys
+from baselines.common.atari_wrappers import make_atari, wrap_deepmind

-def wrap_train(env):
-    from baselines.common.atari_wrappers import (wrap_deepmind, FrameStack)
-    env = wrap_deepmind(env, clip_rewards=False)
-    env = FrameStack(env, 3)
-    return env
-
-def train(env_id, num_frames, seed):
+def train(env_id, num_timesteps, seed):
     from baselines.trpo_mpi.nosharing_cnn_policy import CnnPolicy
     from baselines.trpo_mpi import trpo_mpi
     import baselines.common.tf_util as U
     rank = MPI.COMM_WORLD.Get_rank()
     sess = U.single_threaded_session()
     sess.__enter__()
-    if rank != 0:
-        logger.set_level(logger.DISABLED)
+    if rank == 0:
+        logger.configure()
+    else:
+        logger.configure(format_strs=[])

     workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
     set_global_seeds(workerseed)
-    env = gym.make(env_id)
+    env = make_atari(env_id)
     def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613
         return CnnPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space)
-    env = bench.Monitor(env, logger.get_dir() and
-        osp.join(logger.get_dir(), "%i.monitor.json"%rank))
+    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
     env.seed(workerseed)
     gym.logger.setLevel(logging.WARN)

-    env = wrap_train(env)
-    num_timesteps = int(num_frames / 4 * 1.1)
+    env = wrap_deepmind(env)
     env.seed(workerseed)

     trpo_mpi.learn(env, policy_fn, timesteps_per_batch=512, max_kl=0.001, cg_iters=10, cg_damping=1e-3,
-        max_timesteps=num_timesteps, gamma=0.98, lam=1.0, vf_iters=3, vf_stepsize=1e-4, entcoeff=0.00)
+        max_timesteps=int(num_timesteps * 1.1), gamma=0.98, lam=1.0, vf_iters=3, vf_stepsize=1e-4, entcoeff=0.00)
     env.close()

 def main():
@@ -47,8 +40,9 @@ def main():
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument('--env', help='environment ID', default='PongNoFrameskip-v4')
     parser.add_argument('--seed', help='RNG seed', type=int, default=0)
+    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
     args = parser.parse_args()
-    train(args.env, num_frames=40e6, seed=args.seed)
+    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)


 if __name__ == "__main__":
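The rank test replaces the old logger.set_level(logger.DISABLED) trick: rank 0 gets real output formats while every other MPI worker configures with format_strs=[], a logger that writes nothing. The pattern works in any script launched under mpirun (a sketch):

    from mpi4py import MPI
    from baselines import logger

    if MPI.COMM_WORLD.Get_rank() == 0:
        logger.configure()                  # stdout/log/csv for the master process
    else:
        logger.configure(format_strs=[])    # workers stay silent
    logger.log("only rank 0 prints this")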
baselines/trpo_mpi/run_mujoco.py

@@ -27,8 +27,8 @@ def train(env_id, num_timesteps, seed):
     def policy_fn(name, ob_space, ac_space):
         return MlpPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space,
             hid_size=32, num_hid_layers=2)
     env = bench.Monitor(env, logger.get_dir() and
-        osp.join(logger.get_dir(), "%i.monitor.json" % rank))
+        osp.join(logger.get_dir(), str(rank)))
     env.seed(workerseed)
     gym.logger.setLevel(logging.WARN)

@@ -41,8 +41,10 @@ def main():
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument('--env', help='environment ID', default='Hopper-v1')
     parser.add_argument('--seed', help='RNG seed', type=int, default=0)
+    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
     args = parser.parse_args()
-    train(args.env, num_timesteps=1e6, seed=args.seed)
+    logger.configure()
+    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)


 if __name__ == '__main__':
baselines/trpo_mpi/trpo_mpi.py

@@ -288,4 +288,4 @@ def learn(env, policy_func, *,
         logger.dump_tabular()

 def flatten_lists(listoflists):
     return [el for list_ in listoflists for el in list_]