Lots of cleanups

Fixes for new gym version
Add @olegklimov and @unixpickle to authors list
John Schulman
2018-01-25 18:33:48 -08:00
parent b5be53dc92
commit 9fa8e1baf1
62 changed files with 989 additions and 1604 deletions
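A minimal sketch of the new A2C training entry point after these changes, mirroring the updated baselines/a2c/run_atari.py shown further down; the environment id, num_env=16, seed, timestep count, and schedule are illustrative values taken from the argument defaults visible in this diff, not part of the commit itself:

# Sketch (assumptions noted above): frame stacking now happens in the
# VecFrameStack wrapper rather than inside the A2C Runner, so learn() no
# longer takes an nstack argument, and policies are imported from ppo2.policies.
from baselines import logger
from baselines.common.cmd_util import make_atari_env
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
from baselines.ppo2.policies import CnnPolicy
from baselines.a2c.a2c import learn

def main():
    logger.configure()
    env = VecFrameStack(make_atari_env('BreakoutNoFrameskip-v4', num_env=16, seed=0), 4)
    learn(CnnPolicy, env, seed=0, total_timesteps=int(10e6), lrschedule='constant')
    env.close()

if __name__ == '__main__':
    main()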

View File

@@ -26,7 +26,7 @@ pip install -e .
To cite this repository in publications:
@misc{baselines,
-author = {Dhariwal, Prafulla and Hesse, Christopher and Plappert, Matthias and Radford, Alec and Schulman, John and Sidor, Szymon and Wu, Yuhuai},
+author = {Dhariwal, Prafulla and Hesse, Christopher and Klimov, Oleg and Nichol, Alex and Plappert, Matthias and Radford, Alec and Schulman, John and Sidor, Szymon and Wu, Yuhuai},
title = {OpenAI Baselines},
year = {2017},
publisher = {GitHub},

View File

@@ -1,3 +1,4 @@
+import os
import os.path as osp
import gym
import time
@@ -10,22 +11,19 @@ from baselines import logger
from baselines.common import set_global_seeds, explained_variance
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
from baselines.common.atari_wrappers import wrap_deepmind
+from baselines.common import tf_util
from baselines.a2c.utils import discount_with_dones
from baselines.a2c.utils import Scheduler, make_path, find_trainable_variables
-from baselines.a2c.policies import CnnPolicy
from baselines.a2c.utils import cat_entropy, mse
class Model(object):
-def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs,
+def __init__(self, policy, ob_space, ac_space, nenvs, nsteps,
ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'):
-config = tf.ConfigProto(allow_soft_placement=True,
-intra_op_parallelism_threads=num_procs,
-inter_op_parallelism_threads=num_procs)
-config.gpu_options.allow_growth = True
-sess = tf.Session(config=config)
+sess = tf_util.make_session()
nact = ac_space.n
nbatch = nenvs*nsteps
@@ -34,8 +32,8 @@ class Model(object):
R = tf.placeholder(tf.float32, [nbatch])
LR = tf.placeholder(tf.float32, [])
-step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
-train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)
+step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
+train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True)
neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
pg_loss = tf.reduce_mean(ADV * neglogpac)
@@ -58,7 +56,7 @@ class Model(object):
for step in range(len(obs)):
cur_lr = lr.value()
td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr}
-if states != []:
+if states is not None:
td_map[train_model.S] = states
td_map[train_model.M] = masks
policy_loss, value_loss, policy_entropy, _ = sess.run(
@@ -91,32 +89,25 @@ class Model(object):
class Runner(object):
-def __init__(self, env, model, nsteps=5, nstack=4, gamma=0.99):
+def __init__(self, env, model, nsteps=5, gamma=0.99):
self.env = env
self.model = model
nh, nw, nc = env.observation_space.shape
nenv = env.num_envs
-self.batch_ob_shape = (nenv*nsteps, nh, nw, nc*nstack)
-self.obs = np.zeros((nenv, nh, nw, nc*nstack), dtype=np.uint8)
+self.batch_ob_shape = (nenv*nsteps, nh, nw, nc)
+self.obs = np.zeros((nenv, nh, nw, nc), dtype=np.uint8)
self.nc = nc
obs = env.reset()
-self.update_obs(obs)
self.gamma = gamma
self.nsteps = nsteps
self.states = model.initial_state
self.dones = [False for _ in range(nenv)]
-def update_obs(self, obs):
-# Do frame-stacking here instead of the FrameStack wrapper to reduce
-# IPC overhead
-self.obs = np.roll(self.obs, shift=-self.nc, axis=3)
-self.obs[:, :, :, -self.nc:] = obs
def run(self):
mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
mb_states = self.states
for n in range(self.nsteps):
-actions, values, states = self.model.step(self.obs, self.states, self.dones)
+actions, values, states, _ = self.model.step(self.obs, self.states, self.dones)
mb_obs.append(np.copy(self.obs))
mb_actions.append(actions)
mb_values.append(values)
@@ -127,7 +118,7 @@ class Runner(object):
for n, done in enumerate(dones):
if done:
self.obs[n] = self.obs[n]*0
-self.update_obs(obs)
+self.obs = obs
mb_rewards.append(rewards)
mb_dones.append(self.dones)
#batch of steps to batch of rollouts
@@ -154,17 +145,16 @@ class Runner(object):
mb_masks = mb_masks.flatten()
return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
-def learn(policy, env, seed, nsteps=5, nstack=4, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100):
+def learn(policy, env, seed, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100):
tf.reset_default_graph()
set_global_seeds(seed)
nenvs = env.num_envs
ob_space = env.observation_space
ac_space = env.action_space
-num_procs = len(env.remotes) # HACK
-model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, nstack=nstack, num_procs=num_procs, ent_coef=ent_coef, vf_coef=vf_coef,
+model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule)
-runner = Runner(env, model, nsteps=nsteps, nstack=nstack, gamma=gamma)
+runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
nbatch = nenvs*nsteps
tstart = time.time()
@@ -183,6 +173,3 @@ def learn(policy, env, seed, nsteps=5, nstack=4, total_timesteps=int(80e6), vf_c
logger.record_tabular("explained_variance", float(ev))
logger.dump_tabular()
env.close()
-if __name__ == '__main__':
-main()

View File

@@ -1,36 +1,48 @@
import numpy as np
import tensorflow as tf
-from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm, sample
+from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm
+from baselines.common.distributions import make_pdtype
+def nature_cnn(unscaled_images):
+"""
+CNN from Nature paper.
+"""
+scaled_images = tf.cast(unscaled_images, tf.float32) / 255.
+activ = tf.nn.relu
+h = activ(conv(scaled_images, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2)))
+h2 = activ(conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2)))
+h3 = activ(conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2)))
+h3 = conv_to_fc(h3)
+return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2)))
class LnLstmPolicy(object):
-def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=256, reuse=False):
-nbatch = nenv*nsteps
+def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
+nenv = nbatch // nsteps
nh, nw, nc = ob_space.shape
-ob_shape = (nbatch, nh, nw, nc*nstack)
+ob_shape = (nbatch, nh, nw, nc)
nact = ac_space.n
X = tf.placeholder(tf.uint8, ob_shape) #obs
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
with tf.variable_scope("model", reuse=reuse):
-h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
-h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
-h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
-h3 = conv_to_fc(h3)
-h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
-xs = batch_to_seq(h4, nenv, nsteps)
+h = nature_cnn(X)
+xs = batch_to_seq(h, nenv, nsteps)
ms = batch_to_seq(M, nenv, nsteps)
h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
h5 = seq_to_batch(h5)
-pi = fc(h5, 'pi', nact, act=lambda x:x)
-vf = fc(h5, 'v', 1, act=lambda x:x)
+pi = fc(h5, 'pi', nact)
+vf = fc(h5, 'v', 1)
+self.pdtype = make_pdtype(ac_space)
+self.pd = self.pdtype.pdfromflat(pi)
v0 = vf[:, 0]
-a0 = sample(pi)
+a0 = self.pd.sample()
+neglogp0 = self.pd.neglogp(a0)
self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
def step(ob, state, mask):
-a, v, s = sess.run([a0, v0, snew], {X:ob, S:state, M:mask})
-return a, v, s
+return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})
def value(ob, state, mask):
return sess.run(v0, {X:ob, S:state, M:mask})
@@ -45,34 +57,34 @@ class LnLstmPolicy(object):
class LstmPolicy(object):
-def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=256, reuse=False):
-nbatch = nenv*nsteps
+def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
+nenv = nbatch // nsteps
nh, nw, nc = ob_space.shape
-ob_shape = (nbatch, nh, nw, nc*nstack)
+ob_shape = (nbatch, nh, nw, nc)
nact = ac_space.n
X = tf.placeholder(tf.uint8, ob_shape) #obs
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
with tf.variable_scope("model", reuse=reuse):
-h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
-h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
-h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
-h3 = conv_to_fc(h3)
-h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
-xs = batch_to_seq(h4, nenv, nsteps)
+h = nature_cnn(X)
+xs = batch_to_seq(h, nenv, nsteps)
ms = batch_to_seq(M, nenv, nsteps)
h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
h5 = seq_to_batch(h5)
-pi = fc(h5, 'pi', nact, act=lambda x:x)
-vf = fc(h5, 'v', 1, act=lambda x:x)
+pi = fc(h5, 'pi', nact)
+vf = fc(h5, 'v', 1)
+self.pdtype = make_pdtype(ac_space)
+self.pd = self.pdtype.pdfromflat(pi)
v0 = vf[:, 0]
-a0 = sample(pi)
+a0 = self.pd.sample()
+neglogp0 = self.pd.neglogp(a0)
self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
def step(ob, state, mask):
-a, v, s = sess.run([a0, v0, snew], {X:ob, S:state, M:mask})
-return a, v, s
+return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})
def value(ob, state, mask):
return sess.run(v0, {X:ob, S:state, M:mask})
@@ -87,31 +99,67 @@ class LstmPolicy(object):
class CnnPolicy(object):
-def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
-nbatch = nenv*nsteps
+def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
nh, nw, nc = ob_space.shape
-ob_shape = (nbatch, nh, nw, nc*nstack)
+ob_shape = (nbatch, nh, nw, nc)
nact = ac_space.n
X = tf.placeholder(tf.uint8, ob_shape) #obs
with tf.variable_scope("model", reuse=reuse):
-h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
-h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
-h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
-h3 = conv_to_fc(h3)
-h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
-pi = fc(h4, 'pi', nact, act=lambda x:x)
-vf = fc(h4, 'v', 1, act=lambda x:x)
-v0 = vf[:, 0]
-a0 = sample(pi)
-self.initial_state = [] #not stateful
+h = nature_cnn(X)
+pi = fc(h, 'pi', nact, init_scale=0.01)
+vf = fc(h, 'v', 1)[:,0]
+self.pdtype = make_pdtype(ac_space)
+self.pd = self.pdtype.pdfromflat(pi)
+a0 = self.pd.sample()
+neglogp0 = self.pd.neglogp(a0)
+self.initial_state = None
def step(ob, *_args, **_kwargs):
-a, v = sess.run([a0, v0], {X:ob})
-return a, v, [] #dummy state
+a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
+return a, v, self.initial_state, neglogp
def value(ob, *_args, **_kwargs):
-return sess.run(v0, {X:ob})
+return sess.run(vf, {X:ob})
+self.X = X
+self.pi = pi
+self.vf = vf
+self.step = step
+self.value = value
+class MlpPolicy(object):
+def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
+ob_shape = (nbatch,) + ob_space.shape
+actdim = ac_space.shape[0]
+X = tf.placeholder(tf.float32, ob_shape, name='Ob') #obs
+with tf.variable_scope("model", reuse=reuse):
+activ = tf.tanh
+h1 = activ(fc(X, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
+h2 = activ(fc(h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
+pi = fc(h2, 'pi', actdim, init_scale=0.01)
+h1 = activ(fc(X, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
+h2 = activ(fc(h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
+vf = fc(h2, 'vf', 1)[:,0]
+logstd = tf.get_variable(name="logstd", shape=[1, actdim],
+initializer=tf.zeros_initializer())
+pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)
+self.pdtype = make_pdtype(ac_space)
+self.pd = self.pdtype.pdfromflat(pdparam)
+a0 = self.pd.sample()
+neglogp0 = self.pd.neglogp(a0)
+self.initial_state = None
+def step(ob, *_args, **_kwargs):
+a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
+return a, v, self.initial_state, neglogp
+def value(ob, *_args, **_kwargs):
+return sess.run(vf, {X:ob})
self.X = X
self.pi = pi
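For orientation, a hedged sketch of how these constructors are now called (as in the a2c.py Model above): policies take a single nbatch argument, the number of rows in the observation placeholder, instead of (nenv, nstack). The module path, session, and environment below are assumptions for illustration only.

import tensorflow as tf
import gym
from baselines.ppo2.policies import CnnPolicy  # assumed location, matching the updated run_atari.py import

env = gym.make('BreakoutNoFrameskip-v4')
sess = tf.Session()
nenvs, nsteps = 16, 5
# One row per env for sampling; nenvs*nsteps rows for the flattened training rollout.
step_model = CnnPolicy(sess, env.observation_space, env.action_space, nbatch=nenvs, nsteps=1, reuse=False)
train_model = CnnPolicy(sess, env.observation_space, env.action_space, nbatch=nenvs*nsteps, nsteps=nsteps, reuse=True)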

View File

@@ -1,45 +1,30 @@
#!/usr/bin/env python3
-import os, logging, gym
-from baselines import logger
-from baselines.common import set_global_seeds
-from baselines import bench
-from baselines.a2c.a2c import learn
-from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
-from baselines.common.atari_wrappers import make_atari, wrap_deepmind
-from baselines.a2c.policies import CnnPolicy, LstmPolicy, LnLstmPolicy
-def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu):
-def make_env(rank):
-def _thunk():
-env = make_atari(env_id)
-env.seed(seed + rank)
-env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
-gym.logger.setLevel(logging.WARN)
-return wrap_deepmind(env)
-return _thunk
-set_global_seeds(seed)
-env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
+from baselines import logger
+from baselines.common.cmd_util import make_atari_env, atari_arg_parser
+from baselines.common.vec_env.vec_frame_stack import VecFrameStack
+from baselines.a2c.a2c import learn
+from baselines.ppo2.policies import CnnPolicy, LstmPolicy, LnLstmPolicy
+def train(env_id, num_timesteps, seed, policy, lrschedule, num_env):
if policy == 'cnn':
policy_fn = CnnPolicy
elif policy == 'lstm':
policy_fn = LstmPolicy
elif policy == 'lnlstm':
policy_fn = LnLstmPolicy
+env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule)
env.close()
def main():
-import argparse
-parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
-parser.add_argument('--seed', help='RNG seed', type=int, default=0)
+parser = atari_arg_parser()
parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm'], default='cnn')
parser.add_argument('--lrschedule', help='Learning rate schedule', choices=['constant', 'linear'], default='constant')
-parser.add_argument('--num-timesteps', type=int, default=int(10e6))
args = parser.parse_args()
logger.configure()
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed,
-policy=args.policy, lrschedule=args.lrschedule, num_cpu=16)
+policy=args.policy, lrschedule=args.lrschedule, num_env=16)
if __name__ == '__main__':
main()

View File

@@ -39,23 +39,19 @@ def ortho_init(scale=1.0):
return (scale * q[:shape[0], :shape[1]]).astype(np.float32)
return _ortho_init
-def conv(x, scope, nf, rf, stride, pad='VALID', act=tf.nn.relu, init_scale=1.0):
+def conv(x, scope, *, nf, rf, stride, pad='VALID', init_scale=1.0):
with tf.variable_scope(scope):
nin = x.get_shape()[3].value
w = tf.get_variable("w", [rf, rf, nin, nf], initializer=ortho_init(init_scale))
b = tf.get_variable("b", [nf], initializer=tf.constant_initializer(0.0))
-z = tf.nn.conv2d(x, w, strides=[1, stride, stride, 1], padding=pad)+b
-h = act(z)
-return h
+return tf.nn.conv2d(x, w, strides=[1, stride, stride, 1], padding=pad)+b
-def fc(x, scope, nh, act=tf.nn.relu, init_scale=1.0):
+def fc(x, scope, nh, *, init_scale=1.0, init_bias=0.0):
with tf.variable_scope(scope):
nin = x.get_shape()[1].value
w = tf.get_variable("w", [nin, nh], initializer=ortho_init(init_scale))
-b = tf.get_variable("b", [nh], initializer=tf.constant_initializer(0.0))
-z = tf.matmul(x, w)+b
-h = act(z)
-return h
+b = tf.get_variable("b", [nh], initializer=tf.constant_initializer(init_bias))
+return tf.matmul(x, w)+b
def batch_to_seq(h, nbatch, nsteps, flat=False):
if flat:
@@ -162,9 +158,34 @@ def constant(p):
def linear(p):
return 1-p
+def middle_drop(p):
+eps = 0.75
+if 1-p<eps:
+return eps*0.1
+return 1-p
+def double_linear_con(p):
+p *= 2
+eps = 0.125
+if 1-p<eps:
+return eps
+return 1-p
+def double_middle_drop(p):
+eps1 = 0.75
+eps2 = 0.25
+if 1-p<eps1:
+if 1-p<eps2:
+return eps2*0.5
+return eps1*0.1
+return 1-p
schedules = {
'linear':linear,
-'constant':constant
+'constant':constant,
+'double_linear_con': double_linear_con,
+'middle_drop': middle_drop,
+'double_middle_drop': double_middle_drop
}
class Scheduler(object):
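The new schedule names plug into the existing Scheduler class in this same module (its signature, Scheduler(v, nvalues, schedule), is visible in the acktr utils code removed further down). A small illustrative sketch; the 7e-4 and 80e6 values are only borrowed from a2c.learn's defaults:

# 'middle_drop': multiplier decays as 1-p over the first quarter of training,
# then stays at 0.75*0.1 = 0.075. Each value() call advances the step counter.
lr = Scheduler(v=7e-4, nvalues=int(80e6), schedule='middle_drop')
cur_lr = lr.value()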

View File

@@ -1,6 +1,7 @@
import numpy as np
import tensorflow as tf
-from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm, sample, check_shape
+from baselines.ppo2.policies import nature_cnn
+from baselines.a2c.utils import fc, batch_to_seq, seq_to_batch, lstm, sample
class AcerCnnPolicy(object):
@@ -12,14 +13,10 @@ class AcerCnnPolicy(object):
nact = ac_space.n
X = tf.placeholder(tf.uint8, ob_shape) # obs
with tf.variable_scope("model", reuse=reuse):
-h = conv(tf.cast(X, tf.float32) / 255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
-h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
-h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
-h3 = conv_to_fc(h3)
-h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
-pi_logits = fc(h4, 'pi', nact, act=lambda x: x, init_scale=0.01)
+h = nature_cnn(X)
+pi_logits = fc(h, 'pi', nact, init_scale=0.01)
pi = tf.nn.softmax(pi_logits)
-q = fc(h4, 'q', nact, act=lambda x: x)
+q = fc(h, 'q', nact)
a = sample(pi_logits) # could change this to use self.pi instead
self.initial_state = [] # not stateful
@@ -54,14 +51,10 @@ class AcerLstmPolicy(object):
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
with tf.variable_scope("model", reuse=reuse):
-h = conv(tf.cast(X, tf.float32) / 255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
-h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
-h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
-h3 = conv_to_fc(h3)
-h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
+h = nature_cnn(X)
# lstm
-xs = batch_to_seq(h4, nenv, nsteps)
+xs = batch_to_seq(h, nenv, nsteps)
ms = batch_to_seq(M, nenv, nsteps)
h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
h5 = seq_to_batch(h5)

View File

@@ -1,24 +1,11 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
-import os, logging, gym
from baselines import logger
-from baselines.common import set_global_seeds
-from baselines import bench
from baselines.acer.acer_simple import learn
-from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
-from baselines.common.atari_wrappers import make_atari, wrap_deepmind
from baselines.acer.policies import AcerCnnPolicy, AcerLstmPolicy
+from baselines.common.cmd_util import make_atari_env, atari_arg_parser
def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu):
-def make_env(rank):
-def _thunk():
-env = make_atari(env_id)
-env.seed(seed + rank)
-env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
-gym.logger.setLevel(logging.WARN)
-return wrap_deepmind(env)
-return _thunk
-set_global_seeds(seed)
-env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
+env = make_atari_env(env_id, num_cpu, seed)
if policy == 'cnn':
policy_fn = AcerCnnPolicy
elif policy == 'lstm':
@@ -30,16 +17,12 @@ def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu):
env.close()
def main():
-import argparse
-parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
-parser.add_argument('--seed', help='RNG seed', type=int, default=0)
+parser = atari_arg_parser()
parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm'], default='cnn')
parser.add_argument('--lrschedule', help='Learning rate schedule', choices=['constant', 'linear'], default='constant')
-parser.add_argument('--logdir', help ='Directory for logging', default='./log')
+parser.add_argument('--logdir', help ='Directory for logging')
-parser.add_argument('--num-timesteps', type=int, default=int(10e6))
args = parser.parse_args()
-logger.configure(os.path.abspath(args.logdir))
+logger.configure(args.logdir)
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed,
policy=args.policy, lrschedule=args.lrschedule, num_cpu=16)

View File

@@ -1,10 +1,10 @@
import numpy as np
import tensorflow as tf
from baselines import logger
-from baselines import common
+import baselines.common as common
from baselines.common import tf_util as U
from baselines.acktr import kfac
-from baselines.acktr.filters import ZFilter
+from baselines.common.filters import ZFilter
def pathlength(path):
return path["reward"].shape[0]# Loss function that we'll differentiate to get the policy gradient
@@ -70,7 +70,7 @@ def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps,
coord = tf.train.Coordinator()
for qr in [q_runner, vf.q_runner]:
assert (qr != None)
-enqueue_threads.extend(qr.create_threads(U.get_session(), coord=coord, start=True))
+enqueue_threads.extend(qr.create_threads(tf.get_default_session(), coord=coord, start=True))
i = 0
timesteps_so_far = 0
@@ -122,10 +122,10 @@ def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps,
kl = policy.compute_kl(ob_no, oldac_dist)
if kl > desired_kl * 2:
logger.log("kl too high")
-U.eval(tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)))
+tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval()
elif kl < desired_kl / 2:
logger.log("kl too low")
-U.eval(tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)))
+tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval()
else:
logger.log("kl just right!")

View File

@@ -7,16 +7,17 @@ from baselines import logger
from baselines.common import set_global_seeds, explained_variance
-from baselines.acktr.utils import discount_with_dones
-from baselines.acktr.utils import Scheduler, find_trainable_variables
-from baselines.acktr.utils import cat_entropy, mse
+from baselines.a2c.a2c import Runner
+from baselines.a2c.utils import discount_with_dones
+from baselines.a2c.utils import Scheduler, find_trainable_variables
+from baselines.a2c.utils import cat_entropy, mse
from baselines.acktr import kfac
class Model(object):
def __init__(self, policy, ob_space, ac_space, nenvs,total_timesteps, nprocs=32, nsteps=20,
-nstack=4, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
+ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
kfac_clip=0.001, lrschedule='linear'):
config = tf.ConfigProto(allow_soft_placement=True,
intra_op_parallelism_threads=nprocs,
@@ -31,8 +32,8 @@ class Model(object):
PG_LR = tf.placeholder(tf.float32, [])
VF_LR = tf.placeholder(tf.float32, [])
-self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
-self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)
+self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
+self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True)
logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
self.logits = logits = train_model.pi
@@ -71,7 +72,7 @@ class Model(object):
cur_lr = self.lr.value()
td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, PG_LR:cur_lr}
-if states != []:
+if states is not None:
td_map[train_model.S] = states
td_map[train_model.M] = masks
@@ -104,70 +105,8 @@ class Model(object):
self.initial_state = step_model.initial_state
tf.global_variables_initializer().run(session=sess)
-class Runner(object):
-def __init__(self, env, model, nsteps, nstack, gamma):
-self.env = env
-self.model = model
-nh, nw, nc = env.observation_space.shape
-nenv = env.num_envs
-self.batch_ob_shape = (nenv*nsteps, nh, nw, nc*nstack)
-self.obs = np.zeros((nenv, nh, nw, nc*nstack), dtype=np.uint8)
-obs = env.reset()
-self.update_obs(obs)
-self.gamma = gamma
-self.nsteps = nsteps
-self.states = model.initial_state
-self.dones = [False for _ in range(nenv)]
-def update_obs(self, obs):
-self.obs = np.roll(self.obs, shift=-1, axis=3)
-self.obs[:, :, :, -1] = obs[:, :, :, 0]
-def run(self):
-mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
-mb_states = self.states
-for n in range(self.nsteps):
-actions, values, states = self.model.step(self.obs, self.states, self.dones)
-mb_obs.append(np.copy(self.obs))
-mb_actions.append(actions)
-mb_values.append(values)
-mb_dones.append(self.dones)
-obs, rewards, dones, _ = self.env.step(actions)
-self.states = states
-self.dones = dones
-for n, done in enumerate(dones):
-if done:
-self.obs[n] = self.obs[n]*0
-self.update_obs(obs)
-mb_rewards.append(rewards)
-mb_dones.append(self.dones)
-#batch of steps to batch of rollouts
-mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
-mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
-mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
-mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
-mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
-mb_masks = mb_dones[:, :-1]
-mb_dones = mb_dones[:, 1:]
-last_values = self.model.value(self.obs, self.states, self.dones).tolist()
-#discount/bootstrap off value fn
-for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
-rewards = rewards.tolist()
-dones = dones.tolist()
-if dones[-1] == 0:
-rewards = discount_with_dones(rewards+[value], dones+[0], self.gamma)[:-1]
-else:
-rewards = discount_with_dones(rewards, dones, self.gamma)
-mb_rewards[n] = rewards
-mb_rewards = mb_rewards.flatten()
-mb_actions = mb_actions.flatten()
-mb_values = mb_values.flatten()
-mb_masks = mb_masks.flatten()
-return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20,
-nstack=4, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
+ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
kfac_clip=0.001, save_interval=None, lrschedule='linear'):
tf.reset_default_graph()
set_global_seeds(seed)
@@ -176,7 +115,7 @@ def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval
ob_space = env.observation_space
ac_space = env.action_space
make_model = lambda : Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs, nsteps
-=nsteps, nstack=nstack, ent_coef=ent_coef, vf_coef=vf_coef, vf_fisher_coef=
+=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, vf_fisher_coef=
vf_fisher_coef, lr=lr, max_grad_norm=max_grad_norm, kfac_clip=kfac_clip,
lrschedule=lrschedule)
if save_interval and logger.get_dir():
@@ -185,7 +124,7 @@ def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval
fh.write(cloudpickle.dumps(make_model))
model = make_model()
-runner = Runner(env, model, nsteps=nsteps, nstack=nstack, gamma=gamma)
+runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
nbatch = nenvs*nsteps
tstart = time.time()
coord = tf.train.Coordinator()

View File

@@ -1,4 +1,4 @@
-from baselines.acktr.running_stat import RunningStat
+from .running_stat import RunningStat
from collections import deque
import numpy as np

View File

@@ -1,93 +1,55 @@
import tensorflow as tf
-import numpy as np
def gmatmul(a, b, transpose_a=False, transpose_b=False, reduce_dim=None):
-if reduce_dim == None:
-# general batch matmul
-if len(a.get_shape()) == 3 and len(b.get_shape()) == 3:
-return tf.batch_matmul(a, b, adj_x=transpose_a, adj_y=transpose_b)
-elif len(a.get_shape()) == 3 and len(b.get_shape()) == 2:
-if transpose_b:
-N = b.get_shape()[0].value
-else:
-N = b.get_shape()[1].value
-B = a.get_shape()[0].value
-if transpose_a:
-K = a.get_shape()[1].value
-a = tf.reshape(tf.transpose(a, [0, 2, 1]), [-1, K])
-else:
-K = a.get_shape()[-1].value
-a = tf.reshape(a, [-1, K])
-result = tf.matmul(a, b, transpose_b=transpose_b)
-result = tf.reshape(result, [B, -1, N])
-return result
-elif len(a.get_shape()) == 2 and len(b.get_shape()) == 3:
-if transpose_a:
-M = a.get_shape()[1].value
-else:
-M = a.get_shape()[0].value
-B = b.get_shape()[0].value
-if transpose_b:
-K = b.get_shape()[-1].value
-b = tf.transpose(tf.reshape(b, [-1, K]), [1, 0])
-else:
-K = b.get_shape()[1].value
-b = tf.transpose(tf.reshape(
-tf.transpose(b, [0, 2, 1]), [-1, K]), [1, 0])
-result = tf.matmul(a, b, transpose_a=transpose_a)
-result = tf.transpose(tf.reshape(result, [M, B, -1]), [1, 0, 2])
-return result
-else:
-return tf.matmul(a, b, transpose_a=transpose_a, transpose_b=transpose_b)
-else:
+assert reduce_dim is not None
# weird batch matmul
if len(a.get_shape()) == 2 and len(b.get_shape()) > 2:
# reshape reduce_dim to the left most dim in b
b_shape = b.get_shape()
if reduce_dim != 0:
b_dims = list(range(len(b_shape)))
b_dims.remove(reduce_dim)
b_dims.insert(0, reduce_dim)
b = tf.transpose(b, b_dims)
b_t_shape = b.get_shape()
b = tf.reshape(b, [int(b_shape[reduce_dim]), -1])
result = tf.matmul(a, b, transpose_a=transpose_a,
transpose_b=transpose_b)
result = tf.reshape(result, b_t_shape)
if reduce_dim != 0:
b_dims = list(range(len(b_shape)))
b_dims.remove(0)
b_dims.insert(reduce_dim, 0)
result = tf.transpose(result, b_dims)
return result
elif len(a.get_shape()) > 2 and len(b.get_shape()) == 2:
# reshape reduce_dim to the right most dim in a
a_shape = a.get_shape()
outter_dim = len(a_shape) - 1
reduce_dim = len(a_shape) - reduce_dim - 1
if reduce_dim != outter_dim:
a_dims = list(range(len(a_shape)))
a_dims.remove(reduce_dim)
a_dims.insert(outter_dim, reduce_dim)
a = tf.transpose(a, a_dims)
a_t_shape = a.get_shape()
a = tf.reshape(a, [-1, int(a_shape[reduce_dim])])
result = tf.matmul(a, b, transpose_a=transpose_a,
transpose_b=transpose_b)
result = tf.reshape(result, a_t_shape)
if reduce_dim != outter_dim:
a_dims = list(range(len(a_shape)))
a_dims.remove(outter_dim)
a_dims.insert(reduce_dim, outter_dim)
result = tf.transpose(result, a_dims)
return result
elif len(a.get_shape()) == 2 and len(b.get_shape()) == 2:
return tf.matmul(a, b, transpose_a=transpose_a, transpose_b=transpose_b)
assert False, 'something went wrong'
def clipoutNeg(vec, threshold=1e-6):

View File

@@ -1,43 +1,8 @@
import numpy as np
import tensorflow as tf
-from baselines.acktr.utils import conv, fc, dense, conv_to_fc, sample, kl_div
+from baselines.acktr.utils import dense, kl_div
import baselines.common.tf_util as U
-class CnnPolicy(object):
-def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
-nbatch = nenv*nsteps
-nh, nw, nc = ob_space.shape
-ob_shape = (nbatch, nh, nw, nc*nstack)
-nact = ac_space.n
-X = tf.placeholder(tf.uint8, ob_shape) #obs
-with tf.variable_scope("model", reuse=reuse):
-h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
-h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
-h3 = conv(h2, 'c3', nf=32, rf=3, stride=1, init_scale=np.sqrt(2))
-h3 = conv_to_fc(h3)
-h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
-pi = fc(h4, 'pi', nact, act=lambda x:x)
-vf = fc(h4, 'v', 1, act=lambda x:x)
-v0 = vf[:, 0]
-a0 = sample(pi)
-self.initial_state = [] #not stateful
-def step(ob, *_args, **_kwargs):
-a, v = sess.run([a0, v0], {X:ob})
-return a, v, [] #dummy state
-def value(ob, *_args, **_kwargs):
-return sess.run(v0, {X:ob})
-self.X = X
-self.pi = pi
-self.vf = vf
-self.step = step
-self.value = value
class GaussianMlpPolicy(object):
def __init__(self, ob_dim, ac_dim):
# Here we'll construct a bunch of expressions, which will be used in two places:
@@ -60,12 +25,12 @@ class GaussianMlpPolicy(object):
std_na = tf.tile(std_1a, [tf.shape(mean_na)[0], 1])
ac_dist = tf.concat([tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(std_na, [-1, ac_dim])], 1)
sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:,ac_dim:])) * ac_dist[:,ac_dim:] + ac_dist[:,:ac_dim] # This is the sampled action we'll perform.
-logprobsampled_n = - U.sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * U.sum(tf.square(ac_dist[:,:ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of sampled action
+logprobsampled_n = - tf.reduce_sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * tf.reduce_sum(tf.square(ac_dist[:,:ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of sampled action
-logprob_n = - U.sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * U.sum(tf.square(ac_dist[:,:ac_dim] - oldac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy)
+logprob_n = - tf.reduce_sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * tf.reduce_sum(tf.square(ac_dist[:,:ac_dim] - oldac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy)
-kl = U.mean(kl_div(oldac_dist, ac_dist, ac_dim))
+kl = tf.reduce_mean(kl_div(oldac_dist, ac_dist, ac_dim))
-#kl = .5 * U.mean(tf.square(logprob_n - oldlogprob_n)) # Approximation of KL divergence between old policy used to generate actions, and new policy used to compute logprob_n
+#kl = .5 * tf.reduce_mean(tf.square(logprob_n - oldlogprob_n)) # Approximation of KL divergence between old policy used to generate actions, and new policy used to compute logprob_n
-surr = - U.mean(adv_n * logprob_n) # Loss function that we'll differentiate to get the policy gradient
+surr = - tf.reduce_mean(adv_n * logprob_n) # Loss function that we'll differentiate to get the policy gradient
-surr_sampled = - U.mean(logprob_n) # Sampled loss of the policy
+surr_sampled = - tf.reduce_mean(logprob_n) # Sampled loss of the policy
self._act = U.function([ob_no], [sampled_ac_na, ac_dist, logprobsampled_n]) # Generate a new action and its logprob
#self.compute_kl = U.function([ob_no, oldac_na, oldlogprob_n], kl) # Compute (approximate) KL divergence between old policy and new policy
self.compute_kl = U.function([ob_no, oldac_dist], kl)

View File

@@ -1,38 +1,21 @@
#!/usr/bin/env python3
-import os, logging, gym
from baselines import logger
-from baselines.common import set_global_seeds
-from baselines import bench
from baselines.acktr.acktr_disc import learn
-from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
-from baselines.common.atari_wrappers import make_atari, wrap_deepmind
-from baselines.acktr.policies import CnnPolicy
+from baselines.common.cmd_util import make_atari_env, atari_arg_parser
+from baselines.common.vec_env.vec_frame_stack import VecFrameStack
+from baselines.ppo2.policies import CnnPolicy
def train(env_id, num_timesteps, seed, num_cpu):
-def make_env(rank):
-def _thunk():
-env = make_atari(env_id)
-env.seed(seed + rank)
-env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
-gym.logger.setLevel(logging.WARN)
-return wrap_deepmind(env)
-return _thunk
-set_global_seeds(seed)
-env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
+env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
policy_fn = CnnPolicy
learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu)
env.close()
def main():
-import argparse
-parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
-parser.add_argument('--seed', help='RNG seed', type=int, default=0)
-parser.add_argument('--num-timesteps', type=int, default=int(10e6))
-args = parser.parse_args()
+args = atari_arg_parser().parse_args()
logger.configure()
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, num_cpu=32)
if __name__ == '__main__':
main()

View File

@@ -1,22 +1,14 @@
#!/usr/bin/env python3
-import argparse
-import logging
-import os
import tensorflow as tf
-import gym
from baselines import logger
-from baselines.common import set_global_seeds
-from baselines import bench
+from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser
from baselines.acktr.acktr_cont import learn
from baselines.acktr.policies import GaussianMlpPolicy
from baselines.acktr.value_functions import NeuralNetValueFunction
def train(env_id, num_timesteps, seed):
-env=gym.make(env_id)
-env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
-set_global_seeds(seed)
-env.seed(seed)
-gym.logger.setLevel(logging.WARN)
+env = make_mujoco_env(env_id, seed)
with tf.Session(config=tf.ConfigProto()):
ob_dim = env.observation_space.shape[0]
@@ -33,11 +25,10 @@ def train(env_id, num_timesteps, seed):
env.close()
-if __name__ == "__main__":
-parser = argparse.ArgumentParser(description='Run Mujoco benchmark.')
-parser.add_argument('--seed', help='RNG seed', type=int, default=0)
-parser.add_argument('--env', help='environment ID', type=str, default="Reacher-v1")
-parser.add_argument('--num-timesteps', type=int, default=int(1e6))
-args = parser.parse_args()
+def main():
+args = mujoco_arg_parser().parse_args()
logger.configure()
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
+if __name__ == "__main__":
+main()

View File

@@ -1,69 +1,8 @@
-import os
-import numpy as np
import tensorflow as tf
-import baselines.common.tf_util as U
-from collections import deque
-def sample(logits):
-noise = tf.random_uniform(tf.shape(logits))
-return tf.argmax(logits - tf.log(-tf.log(noise)), 1)
-def std(x):
-mean = tf.reduce_mean(x)
-var = tf.reduce_mean(tf.square(x-mean))
-return tf.sqrt(var)
-def cat_entropy(logits):
-a0 = logits - tf.reduce_max(logits, 1, keep_dims=True)
-ea0 = tf.exp(a0)
-z0 = tf.reduce_sum(ea0, 1, keep_dims=True)
-p0 = ea0 / z0
-return tf.reduce_sum(p0 * (tf.log(z0) - a0), 1)
-def cat_entropy_softmax(p0):
-return - tf.reduce_sum(p0 * tf.log(p0 + 1e-6), axis = 1)
-def mse(pred, target):
-return tf.square(pred-target)/2.
-def ortho_init(scale=1.0):
-def _ortho_init(shape, dtype, partition_info=None):
-#lasagne ortho init for tf
-shape = tuple(shape)
-if len(shape) == 2:
-flat_shape = shape
-elif len(shape) == 4: # assumes NHWC
-flat_shape = (np.prod(shape[:-1]), shape[-1])
-else:
-raise NotImplementedError
-a = np.random.normal(0.0, 1.0, flat_shape)
-u, _, v = np.linalg.svd(a, full_matrices=False)
-q = u if u.shape == flat_shape else v # pick the one with the correct shape
-q = q.reshape(shape)
-return (scale * q[:shape[0], :shape[1]]).astype(np.float32)
-return _ortho_init
-def conv(x, scope, nf, rf, stride, pad='VALID', act=tf.nn.relu, init_scale=1.0):
-with tf.variable_scope(scope):
-nin = x.get_shape()[3].value
-w = tf.get_variable("w", [rf, rf, nin, nf], initializer=ortho_init(init_scale))
-b = tf.get_variable("b", [nf], initializer=tf.constant_initializer(0.0))
-z = tf.nn.conv2d(x, w, strides=[1, stride, stride, 1], padding=pad)+b
-h = act(z)
-return h
-def fc(x, scope, nh, act=tf.nn.relu, init_scale=1.0):
-with tf.variable_scope(scope):
-nin = x.get_shape()[1].value
-w = tf.get_variable("w", [nin, nh], initializer=ortho_init(init_scale))
-b = tf.get_variable("b", [nh], initializer=tf.constant_initializer(0.0))
-z = tf.matmul(x, w)+b
-h = act(z)
-return h
def dense(x, size, name, weight_init=None, bias_init=0, weight_loss_dict=None, reuse=None):
with tf.variable_scope(name, reuse=reuse):
-assert (len(U.scope_name().split('/')) == 2)
+assert (len(tf.get_variable_scope().name.split('/')) == 2)
w = tf.get_variable("w", [x.get_shape()[1], size], initializer=weight_init)
b = tf.get_variable("b", [size], initializer=tf.constant_initializer(bias_init))
@@ -75,15 +14,10 @@ def dense(x, size, name, weight_init=None, bias_init=0, weight_loss_dict=None, r
weight_loss_dict[w] = weight_decay_fc
weight_loss_dict[b] = 0.0
-tf.add_to_collection(U.scope_name().split('/')[0] + '_' + 'losses', weight_decay)
+tf.add_to_collection(tf.get_variable_scope().name.split('/')[0] + '_' + 'losses', weight_decay)
return tf.nn.bias_add(tf.matmul(x, w), b)
-def conv_to_fc(x):
-nh = np.prod([v.value for v in x.get_shape()[1:]])
-x = tf.reshape(x, [-1, nh])
-return x
def kl_div(action_dist1, action_dist2, action_size):
mean1, std1 = action_dist1[:, :action_size], action_dist1[:, action_size:]
mean2, std2 = action_dist2[:, :action_size], action_dist2[:, action_size:]
@@ -92,109 +26,3 @@ def kl_div(action_dist1, action_dist2, action_size):
denominator = 2 * tf.square(std2) + 1e-8
return tf.reduce_sum(
numerator/denominator + tf.log(std2) - tf.log(std1),reduction_indices=-1)
-def discount_with_dones(rewards, dones, gamma):
-discounted = []
-r = 0
-for reward, done in zip(rewards[::-1], dones[::-1]):
-r = reward + gamma*r*(1.-done) # fixed off by one bug
-discounted.append(r)
-return discounted[::-1]
-def find_trainable_variables(key):
-with tf.variable_scope(key):
-return tf.trainable_variables()
-def make_path(f):
-return os.makedirs(f, exist_ok=True)
-def constant(p):
-return 1
-def linear(p):
-return 1-p
-def middle_drop(p):
-eps = 0.75
-if 1-p<eps:
-return eps*0.1
-return 1-p
-def double_linear_con(p):
-p *= 2
-eps = 0.125
-if 1-p<eps:
-return eps
-return 1-p
-def double_middle_drop(p):
-eps1 = 0.75
-eps2 = 0.25
-if 1-p<eps1:
-if 1-p<eps2:
-return eps2*0.5
-return eps1*0.1
-return 1-p
-schedules = {
-'linear':linear,
-'constant':constant,
-'double_linear_con':double_linear_con,
-'middle_drop':middle_drop,
-'double_middle_drop':double_middle_drop
-}
-class Scheduler(object):
-def __init__(self, v, nvalues, schedule):
-self.n = 0.
-self.v = v
-self.nvalues = nvalues
-self.schedule = schedules[schedule]
-def value(self):
-current_value = self.v*self.schedule(self.n/self.nvalues)
-self.n += 1.
-return current_value
-def value_steps(self, steps):
-return self.v*self.schedule(steps/self.nvalues)
-class EpisodeStats:
-def __init__(self, nsteps, nenvs):
-self.episode_rewards = []
-for i in range(nenvs):
-self.episode_rewards.append([])
-self.lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths
-self.rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards
-self.nsteps = nsteps
-self.nenvs = nenvs
-def feed(self, rewards, masks):
-rewards = np.reshape(rewards, [self.nenvs, self.nsteps])
-masks = np.reshape(masks, [self.nenvs, self.nsteps])
-for i in range(0, self.nenvs):
-for j in range(0, self.nsteps):
-self.episode_rewards[i].append(rewards[i][j])
-if masks[i][j]:
-l = len(self.episode_rewards[i])
-s = sum(self.episode_rewards[i])
-self.lenbuffer.append(l)
-self.rewbuffer.append(s)
-self.episode_rewards[i] = []
-def mean_length(self):
-if self.lenbuffer:
-return np.mean(self.lenbuffer)
-else:
-return 0 # on the first params dump, no episodes are finished
-def mean_reward(self):
-if self.rewbuffer:
-return np.mean(self.rewbuffer)
-else:
-return 0

View File

@@ -1,6 +1,6 @@
from baselines import logger
import numpy as np
-from baselines import common
+import baselines.common as common
from baselines.common import tf_util as U
import tensorflow as tf
from baselines.acktr import kfac
@@ -16,8 +16,8 @@ class NeuralNetValueFunction(object):
vpred_n = dense(h2, 1, "hfinal", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)[:,0]
sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
wd_loss = tf.get_collection("vf_losses", None)
-loss = U.mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
+loss = tf.reduce_mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
-loss_sampled = U.mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))
+loss_sampled = tf.reduce_mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))
self._predict = U.function([X], vpred_n)
optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9, \
clip_kl=0.3, epsilon=0.1, stats_decay=0.95, \

View File

@@ -1,15 +1,24 @@
+import re
import os.path as osp
+import os
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
_atari7 = ['BeamRider', 'Breakout', 'Enduro', 'Pong', 'Qbert', 'Seaquest', 'SpaceInvaders']
_atariexpl7 = ['Freeway', 'Gravitar', 'MontezumaRevenge', 'Pitfall', 'PrivateEye', 'Solaris', 'Venture']
_BENCHMARKS = []
+remove_version_re = re.compile(r'-v\d+$')
def register_benchmark(benchmark):
for b in _BENCHMARKS:
if b['name'] == benchmark['name']:
raise ValueError('Benchmark with name %s already registered!' % b['name'])
+# automatically add a description if it is not present
+if 'tasks' in benchmark:
+for t in benchmark['tasks']:
+if 'desc' not in t:
+t['desc'] = remove_version_re.sub('', t['env_id'])
_BENCHMARKS.append(benchmark)
@@ -42,30 +51,28 @@ _ATARI_SUFFIX = 'NoFrameskip-v4'
register_benchmark({ register_benchmark({
'name': 'Atari50M', 'name': 'Atari50M',
'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 50M timesteps', 'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 50M timesteps',
'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(50e6)} for _game in _atari7] 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(50e6)} for _game in _atari7]
}) })
register_benchmark({ register_benchmark({
'name': 'Atari10M', 'name': 'Atari10M',
'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 10M timesteps', 'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 10M timesteps',
'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atari7] 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atari7]
}) })
register_benchmark({ register_benchmark({
'name': 'Atari1Hr', 'name': 'Atari1Hr',
'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 1 hour of walltime', 'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 1 hour of walltime',
'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_seconds': 60 * 60} for _game in _atari7] 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_seconds': 60 * 60} for _game in _atari7]
}) })
register_benchmark({ register_benchmark({
'name': 'AtariExploration10M', 'name': 'AtariExploration10M',
'description': '7 Atari games emphasizing exploration, with pixel observations, 10M timesteps', 'description': '7 Atari games emphasizing exploration, with pixel observations, 10M timesteps',
'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atariexpl7] 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atariexpl7]
}) })
# MuJoCo # MuJoCo
_mujocosmall = [ _mujocosmall = [
@@ -128,5 +135,6 @@ _atari50 = [ # actually 47
register_benchmark({ register_benchmark({
'name': 'Atari50_10M', 'name': 'Atari50_10M',
'description': '47 Atari games from Mnih et al. (2013), with pixel observations, 10M timesteps', 'description': '47 Atari games from Mnih et al. (2013), with pixel observations, 10M timesteps',
'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atari50] 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atari50]
}) })
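
A sketch of the new auto-description behaviour: when a task omits 'desc', register_benchmark now derives it from the env id with the version suffix stripped. The benchmark below is hypothetical, not one registered by this commit:

register_benchmark({
    'name': 'MyAtari2M',   # hypothetical benchmark name
    'description': 'two Atari games, 2M timesteps',
    'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 1, 'num_timesteps': int(2e6)}
              for _game in ['Pong', 'Breakout']]
})
# each task ends up with desc == 'PongNoFrameskip' / 'BreakoutNoFrameskip'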

View File

@@ -25,8 +25,7 @@ class Monitor(Wrapper):
else: else:
filename = filename + "." + Monitor.EXT filename = filename + "." + Monitor.EXT
self.f = open(filename, "wt") self.f = open(filename, "wt")
self.f.write('#%s\n'%json.dumps({"t_start": self.tstart, "gym_version": gym.__version__, self.f.write('#%s\n'%json.dumps({"t_start": self.tstart, 'env_id' : env.spec and env.spec.id}))
"env_id": env.spec.id if env.spec else 'Unknown'}))
self.logger = csv.DictWriter(self.f, fieldnames=('r', 'l', 't')+reset_keywords) self.logger = csv.DictWriter(self.f, fieldnames=('r', 'l', 't')+reset_keywords)
self.logger.writeheader() self.logger.writeheader()
@@ -36,10 +35,11 @@ class Monitor(Wrapper):
self.needs_reset = True self.needs_reset = True
self.episode_rewards = [] self.episode_rewards = []
self.episode_lengths = [] self.episode_lengths = []
self.episode_times = []
self.total_steps = 0 self.total_steps = 0
self.current_reset_info = {} # extra info about the current episode, that was passed in during reset() self.current_reset_info = {} # extra info about the current episode, that was passed in during reset()
def _reset(self, **kwargs): def reset(self, **kwargs):
if not self.allow_early_resets and not self.needs_reset: if not self.allow_early_resets and not self.needs_reset:
raise RuntimeError("Tried to reset an environment before done. If you want to allow early resets, wrap your env with Monitor(env, path, allow_early_resets=True)") raise RuntimeError("Tried to reset an environment before done. If you want to allow early resets, wrap your env with Monitor(env, path, allow_early_resets=True)")
self.rewards = [] self.rewards = []
@@ -51,7 +51,7 @@ class Monitor(Wrapper):
self.current_reset_info[k] = v self.current_reset_info[k] = v
return self.env.reset(**kwargs) return self.env.reset(**kwargs)
def _step(self, action): def step(self, action):
if self.needs_reset: if self.needs_reset:
raise RuntimeError("Tried to step environment that needs reset") raise RuntimeError("Tried to step environment that needs reset")
ob, rew, done, info = self.env.step(action) ob, rew, done, info = self.env.step(action)
@@ -61,12 +61,13 @@ class Monitor(Wrapper):
eprew = sum(self.rewards) eprew = sum(self.rewards)
eplen = len(self.rewards) eplen = len(self.rewards)
epinfo = {"r": round(eprew, 6), "l": eplen, "t": round(time.time() - self.tstart, 6)} epinfo = {"r": round(eprew, 6), "l": eplen, "t": round(time.time() - self.tstart, 6)}
self.episode_rewards.append(eprew)
self.episode_lengths.append(eplen)
self.episode_times.append(time.time() - self.tstart)
epinfo.update(self.current_reset_info) epinfo.update(self.current_reset_info)
if self.logger: if self.logger:
self.logger.writerow(epinfo) self.logger.writerow(epinfo)
self.f.flush() self.f.flush()
self.episode_rewards.append(eprew)
self.episode_lengths.append(eplen)
info['episode'] = epinfo info['episode'] = epinfo
self.total_steps += 1 self.total_steps += 1
return (ob, rew, done, info) return (ob, rew, done, info)
@@ -84,6 +85,9 @@ class Monitor(Wrapper):
def get_episode_lengths(self): def get_episode_lengths(self):
return self.episode_lengths return self.episode_lengths
def get_episode_times(self):
return self.episode_times
class LoadMonitorResultsError(Exception): class LoadMonitorResultsError(Exception):
pass pass
@@ -92,7 +96,9 @@ def get_monitor_files(dir):
def load_results(dir): def load_results(dir):
import pandas import pandas
monitor_files = glob(osp.join(dir, "*monitor.*")) # get both csv and (old) json files monitor_files = (
glob(osp.join(dir, "*monitor.json")) +
glob(osp.join(dir, "*monitor.csv"))) # get both csv and (old) json files
if not monitor_files: if not monitor_files:
raise LoadMonitorResultsError("no monitor files of the form *%s found in %s" % (Monitor.EXT, dir)) raise LoadMonitorResultsError("no monitor files of the form *%s found in %s" % (Monitor.EXT, dir))
dfs = [] dfs = []
@@ -114,10 +120,37 @@ def load_results(dir):
episode = json.loads(line) episode = json.loads(line)
episodes.append(episode) episodes.append(episode)
df = pandas.DataFrame(episodes) df = pandas.DataFrame(episodes)
df['t'] += header['t_start'] else:
assert 0, 'unreachable'
df['t'] += header['t_start']
dfs.append(df) dfs.append(df)
df = pandas.concat(dfs) df = pandas.concat(dfs)
df.sort_values('t', inplace=True) df.sort_values('t', inplace=True)
df.reset_index(inplace=True)
df['t'] -= min(header['t_start'] for header in headers) df['t'] -= min(header['t_start'] for header in headers)
df.headers = headers # HACK to preserve backwards compatibility df.headers = headers # HACK to preserve backwards compatibility
return df return df
def test_monitor():
env = gym.make("CartPole-v1")
env.seed(0)
mon_file = "/tmp/baselines-test-%s.monitor.csv" % uuid.uuid4()
menv = Monitor(env, mon_file)
menv.reset()
for _ in range(1000):
_, _, done, _ = menv.step(0)
if done:
menv.reset()
f = open(mon_file, 'rt')
firstline = f.readline()
assert firstline.startswith('#')
metadata = json.loads(firstline[1:])
assert metadata['env_id'] == "CartPole-v1"
assert set(metadata.keys()) == {'env_id', 'gym_version', 't_start'}, "Incorrect keys in monitor metadata"
last_logline = pandas.read_csv(f, index_col=None)
assert set(last_logline.keys()) == {'l', 't', 'r'}, "Incorrect keys in monitor logline"
f.close()
os.remove(mon_file)
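
A sketch of reading the monitor output back with the updated load_results, assuming a directory of *.monitor.csv files written by the Monitor wrapper (the path and the module path baselines.bench.monitor are assumptions):

from baselines.bench.monitor import load_results

df = load_results("/tmp/my-experiment-logs")    # hypothetical log directory
print(df[['r', 'l', 't']].tail())               # per-episode reward, length, elapsed time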

View File

@@ -3,6 +3,7 @@ from collections import deque
import gym import gym
from gym import spaces from gym import spaces
import cv2 import cv2
cv2.ocl.setUseOpenCL(False)
class NoopResetEnv(gym.Wrapper): class NoopResetEnv(gym.Wrapper):
def __init__(self, env, noop_max=30): def __init__(self, env, noop_max=30):
@@ -15,7 +16,7 @@ class NoopResetEnv(gym.Wrapper):
self.noop_action = 0 self.noop_action = 0
assert env.unwrapped.get_action_meanings()[0] == 'NOOP' assert env.unwrapped.get_action_meanings()[0] == 'NOOP'
def _reset(self, **kwargs): def reset(self, **kwargs):
""" Do no-op action for a number of steps in [1, noop_max].""" """ Do no-op action for a number of steps in [1, noop_max]."""
self.env.reset(**kwargs) self.env.reset(**kwargs)
if self.override_num_noops is not None: if self.override_num_noops is not None:
@@ -30,6 +31,9 @@ class NoopResetEnv(gym.Wrapper):
obs = self.env.reset(**kwargs) obs = self.env.reset(**kwargs)
return obs return obs
def step(self, ac):
return self.env.step(ac)
class FireResetEnv(gym.Wrapper): class FireResetEnv(gym.Wrapper):
def __init__(self, env): def __init__(self, env):
"""Take action on reset for environments that are fixed until firing.""" """Take action on reset for environments that are fixed until firing."""
@@ -37,7 +41,7 @@ class FireResetEnv(gym.Wrapper):
assert env.unwrapped.get_action_meanings()[1] == 'FIRE' assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
assert len(env.unwrapped.get_action_meanings()) >= 3 assert len(env.unwrapped.get_action_meanings()) >= 3
def _reset(self, **kwargs): def reset(self, **kwargs):
self.env.reset(**kwargs) self.env.reset(**kwargs)
obs, _, done, _ = self.env.step(1) obs, _, done, _ = self.env.step(1)
if done: if done:
@@ -47,6 +51,9 @@ class FireResetEnv(gym.Wrapper):
self.env.reset(**kwargs) self.env.reset(**kwargs)
return obs return obs
def step(self, ac):
return self.env.step(ac)
class EpisodicLifeEnv(gym.Wrapper): class EpisodicLifeEnv(gym.Wrapper):
def __init__(self, env): def __init__(self, env):
"""Make end-of-life == end-of-episode, but only reset on true game over. """Make end-of-life == end-of-episode, but only reset on true game over.
@@ -56,7 +63,7 @@ class EpisodicLifeEnv(gym.Wrapper):
self.lives = 0 self.lives = 0
self.was_real_done = True self.was_real_done = True
def _step(self, action): def step(self, action):
obs, reward, done, info = self.env.step(action) obs, reward, done, info = self.env.step(action)
self.was_real_done = done self.was_real_done = done
# check current lives, make loss of life terminal, # check current lives, make loss of life terminal,
@@ -70,7 +77,7 @@ class EpisodicLifeEnv(gym.Wrapper):
self.lives = lives self.lives = lives
return obs, reward, done, info return obs, reward, done, info
def _reset(self, **kwargs): def reset(self, **kwargs):
"""Reset only when lives are exhausted. """Reset only when lives are exhausted.
This way all states are still reachable even though lives are episodic, This way all states are still reachable even though lives are episodic,
and the learner need not know about any of this behind-the-scenes. and the learner need not know about any of this behind-the-scenes.
@@ -88,10 +95,13 @@ class MaxAndSkipEnv(gym.Wrapper):
"""Return only every `skip`-th frame""" """Return only every `skip`-th frame"""
gym.Wrapper.__init__(self, env) gym.Wrapper.__init__(self, env)
# most recent raw observations (for max pooling across time steps) # most recent raw observations (for max pooling across time steps)
self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype='uint8') self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8)
self._skip = skip self._skip = skip
def _step(self, action): def reset(self):
return self.env.reset()
def step(self, action):
"""Repeat action, sum reward, and max over last observations.""" """Repeat action, sum reward, and max over last observations."""
total_reward = 0.0 total_reward = 0.0
done = None done = None
@@ -108,8 +118,14 @@ class MaxAndSkipEnv(gym.Wrapper):
return max_frame, total_reward, done, info return max_frame, total_reward, done, info
def reset(self, **kwargs):
return self.env.reset(**kwargs)
class ClipRewardEnv(gym.RewardWrapper): class ClipRewardEnv(gym.RewardWrapper):
def _reward(self, reward): def __init__(self, env):
gym.RewardWrapper.__init__(self, env)
def reward(self, reward):
"""Bin reward to {+1, 0, -1} by its sign.""" """Bin reward to {+1, 0, -1} by its sign."""
return np.sign(reward) return np.sign(reward)
@@ -119,9 +135,10 @@ class WarpFrame(gym.ObservationWrapper):
gym.ObservationWrapper.__init__(self, env) gym.ObservationWrapper.__init__(self, env)
self.width = 84 self.width = 84
self.height = 84 self.height = 84
self.observation_space = spaces.Box(low=0, high=255, shape=(self.height, self.width, 1)) self.observation_space = spaces.Box(low=0, high=255,
shape=(self.height, self.width, 1), dtype=np.uint8)
def _observation(self, frame): def observation(self, frame):
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA) frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA)
return frame[:, :, None] return frame[:, :, None]
@@ -140,15 +157,15 @@ class FrameStack(gym.Wrapper):
self.k = k self.k = k
self.frames = deque([], maxlen=k) self.frames = deque([], maxlen=k)
shp = env.observation_space.shape shp = env.observation_space.shape
self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k)) self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=np.uint8)
def _reset(self): def reset(self):
ob = self.env.reset() ob = self.env.reset()
for _ in range(self.k): for _ in range(self.k):
self.frames.append(ob) self.frames.append(ob)
return self._get_ob() return self._get_ob()
def _step(self, action): def step(self, action):
ob, reward, done, info = self.env.step(action) ob, reward, done, info = self.env.step(action)
self.frames.append(ob) self.frames.append(ob)
return self._get_ob(), reward, done, info return self._get_ob(), reward, done, info
@@ -158,7 +175,10 @@ class FrameStack(gym.Wrapper):
return LazyFrames(list(self.frames)) return LazyFrames(list(self.frames))
class ScaledFloatFrame(gym.ObservationWrapper): class ScaledFloatFrame(gym.ObservationWrapper):
def _observation(self, observation): def __init__(self, env):
gym.ObservationWrapper.__init__(self, env)
def observation(self, observation):
# careful! This undoes the memory optimization, use # careful! This undoes the memory optimization, use
# with smaller replay buffers only. # with smaller replay buffers only.
return np.array(observation).astype(np.float32) / 255.0 return np.array(observation).astype(np.float32) / 255.0
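
A sketch of how the updated wrappers are typically combined, mirroring the make_atari + wrap_deepmind pipeline used elsewhere in this commit; the env id is illustrative and the exact wrapper composition is an assumption about the helper functions:

from baselines.common.atari_wrappers import make_atari, wrap_deepmind

env = make_atari('BreakoutNoFrameskip-v4')   # NoopReset + MaxAndSkip around the raw env
env = wrap_deepmind(env)                     # episodic life, 84x84 grayscale, clipped rewards
obs = env.reset()
print(env.observation_space.shape)           # (84, 84, 1)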

View File

@@ -0,0 +1,64 @@
"""
Helpers for scripts like run_atari.py.
"""
import os
import gym
from baselines import logger
from baselines.bench import Monitor
from baselines.common import set_global_seeds
from baselines.common.atari_wrappers import make_atari, wrap_deepmind
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
from mpi4py import MPI
def make_atari_env(env_id, num_env, seed, wrapper_kwargs=None, start_index=0):
"""
Create a wrapped, monitored SubprocVecEnv for Atari.
"""
if wrapper_kwargs is None: wrapper_kwargs = {}
def make_env(rank): # pylint: disable=C0111
def _thunk():
env = make_atari(env_id)
env.seed(seed + rank)
env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
return wrap_deepmind(env, **wrapper_kwargs)
return _thunk
set_global_seeds(seed)
return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)])
def make_mujoco_env(env_id, seed):
"""
Create a wrapped, monitored gym.Env for MuJoCo.
"""
set_global_seeds(seed)
env = gym.make(env_id)
env = Monitor(env, logger.get_dir())
env.seed(seed)
return env
def arg_parser():
"""
Create an empty argparse.ArgumentParser.
"""
import argparse
return argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
def atari_arg_parser():
"""
Create an argparse.ArgumentParser for run_atari.py.
"""
parser = arg_parser()
parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
parser.add_argument('--num-timesteps', type=int, default=int(10e6))
return parser
def mujoco_arg_parser():
"""
Create an argparse.ArgumentParser for run_mujoco.py.
"""
parser = arg_parser()
parser.add_argument('--env', help='environment ID', type=str, default="Reacher-v1")
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
parser.add_argument('--num-timesteps', type=int, default=int(1e6))
return parser
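
A sketch of how these helpers are meant to compose in a run script; the env id, seed, and num_env below are illustrative:

args = atari_arg_parser().parse_args(['--env', 'PongNoFrameskip-v4', '--seed', '0'])
env = make_atari_env(args.env, num_env=8, seed=args.seed)   # monitored SubprocVecEnv
# ... hand `env` to a learner, then:
env.close()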

View File

@@ -57,14 +57,12 @@ class CategoricalPdType(PdType):
class MultiCategoricalPdType(PdType): class MultiCategoricalPdType(PdType):
def __init__(self, low, high): def __init__(self, nvec):
self.low = low self.ncats = nvec
self.high = high
self.ncats = high - low + 1
def pdclass(self): def pdclass(self):
return MultiCategoricalPd return MultiCategoricalPd
def pdfromflat(self, flat): def pdfromflat(self, flat):
return MultiCategoricalPd(self.low, self.high, flat) return MultiCategoricalPd(self.ncats, flat)
def param_shape(self): def param_shape(self):
return [sum(self.ncats)] return [sum(self.ncats)]
def sample_shape(self): def sample_shape(self):
@@ -125,7 +123,7 @@ class CategoricalPd(Pd):
def flatparam(self): def flatparam(self):
return self.logits return self.logits
def mode(self): def mode(self):
return U.argmax(self.logits, axis=-1) return tf.argmax(self.logits, axis=-1)
def neglogp(self, x): def neglogp(self, x):
# return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x) # return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x)
# Note: we can't use sparse_softmax_cross_entropy_with_logits because # Note: we can't use sparse_softmax_cross_entropy_with_logits because
@@ -135,20 +133,20 @@ class CategoricalPd(Pd):
logits=self.logits, logits=self.logits,
labels=one_hot_actions) labels=one_hot_actions)
def kl(self, other): def kl(self, other):
a0 = self.logits - U.max(self.logits, axis=-1, keepdims=True) a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keep_dims=True)
a1 = other.logits - U.max(other.logits, axis=-1, keepdims=True) a1 = other.logits - tf.reduce_max(other.logits, axis=-1, keep_dims=True)
ea0 = tf.exp(a0) ea0 = tf.exp(a0)
ea1 = tf.exp(a1) ea1 = tf.exp(a1)
z0 = U.sum(ea0, axis=-1, keepdims=True) z0 = tf.reduce_sum(ea0, axis=-1, keep_dims=True)
z1 = U.sum(ea1, axis=-1, keepdims=True) z1 = tf.reduce_sum(ea1, axis=-1, keep_dims=True)
p0 = ea0 / z0 p0 = ea0 / z0
return U.sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=-1) return tf.reduce_sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=-1)
def entropy(self): def entropy(self):
a0 = self.logits - U.max(self.logits, axis=-1, keepdims=True) a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keep_dims=True)
ea0 = tf.exp(a0) ea0 = tf.exp(a0)
z0 = U.sum(ea0, axis=-1, keepdims=True) z0 = tf.reduce_sum(ea0, axis=-1, keep_dims=True)
p0 = ea0 / z0 p0 = ea0 / z0
return U.sum(p0 * (tf.log(z0) - a0), axis=-1) return tf.reduce_sum(p0 * (tf.log(z0) - a0), axis=-1)
def sample(self): def sample(self):
u = tf.random_uniform(tf.shape(self.logits)) u = tf.random_uniform(tf.shape(self.logits))
return tf.argmax(self.logits - tf.log(-tf.log(u)), axis=-1) return tf.argmax(self.logits - tf.log(-tf.log(u)), axis=-1)
@@ -157,24 +155,21 @@ class CategoricalPd(Pd):
return cls(flat) return cls(flat)
class MultiCategoricalPd(Pd): class MultiCategoricalPd(Pd):
def __init__(self, low, high, flat): def __init__(self, nvec, flat):
self.flat = flat self.flat = flat
self.low = tf.constant(low, dtype=tf.int32) self.categoricals = list(map(CategoricalPd, tf.split(flat, nvec, axis=-1)))
self.categoricals = list(map(CategoricalPd, tf.split(flat, high - low + 1, axis=len(flat.get_shape()) - 1)))
def flatparam(self): def flatparam(self):
return self.flat return self.flat
def mode(self): def mode(self):
return self.low + tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1), tf.int32) return tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1), tf.int32)
def neglogp(self, x): def neglogp(self, x):
return tf.add_n([p.neglogp(px) for p, px in zip(self.categoricals, tf.unstack(x - self.low, axis=len(x.get_shape()) - 1))]) return tf.add_n([p.neglogp(px) for p, px in zip(self.categoricals, tf.unstack(x, axis=-1))])
def kl(self, other): def kl(self, other):
return tf.add_n([ return tf.add_n([p.kl(q) for p, q in zip(self.categoricals, other.categoricals)])
p.kl(q) for p, q in zip(self.categoricals, other.categoricals)
])
def entropy(self): def entropy(self):
return tf.add_n([p.entropy() for p in self.categoricals]) return tf.add_n([p.entropy() for p in self.categoricals])
def sample(self): def sample(self):
return self.low + tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1), tf.int32) return tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1), tf.int32)
@classmethod @classmethod
def fromflat(cls, flat): def fromflat(cls, flat):
raise NotImplementedError raise NotImplementedError
@@ -191,14 +186,14 @@ class DiagGaussianPd(Pd):
def mode(self): def mode(self):
return self.mean return self.mean
def neglogp(self, x): def neglogp(self, x):
return 0.5 * U.sum(tf.square((x - self.mean) / self.std), axis=-1) \ return 0.5 * tf.reduce_sum(tf.square((x - self.mean) / self.std), axis=-1) \
+ 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[-1]) \ + 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[-1]) \
+ U.sum(self.logstd, axis=-1) + tf.reduce_sum(self.logstd, axis=-1)
def kl(self, other): def kl(self, other):
assert isinstance(other, DiagGaussianPd) assert isinstance(other, DiagGaussianPd)
return U.sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / (2.0 * tf.square(other.std)) - 0.5, axis=-1) return tf.reduce_sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / (2.0 * tf.square(other.std)) - 0.5, axis=-1)
def entropy(self): def entropy(self):
return U.sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), axis=-1) return tf.reduce_sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), axis=-1)
def sample(self): def sample(self):
return self.mean + self.std * tf.random_normal(tf.shape(self.mean)) return self.mean + self.std * tf.random_normal(tf.shape(self.mean))
@classmethod @classmethod
@@ -214,11 +209,11 @@ class BernoulliPd(Pd):
def mode(self): def mode(self):
return tf.round(self.ps) return tf.round(self.ps)
def neglogp(self, x): def neglogp(self, x):
return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=-1) return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=-1)
def kl(self, other): def kl(self, other):
return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=-1) - U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1) return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=-1) - tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1)
def entropy(self): def entropy(self):
return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1) return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1)
def sample(self): def sample(self):
u = tf.random_uniform(tf.shape(self.ps)) u = tf.random_uniform(tf.shape(self.ps))
return tf.to_float(math_ops.less(u, self.ps)) return tf.to_float(math_ops.less(u, self.ps))
@@ -234,7 +229,7 @@ def make_pdtype(ac_space):
elif isinstance(ac_space, spaces.Discrete): elif isinstance(ac_space, spaces.Discrete):
return CategoricalPdType(ac_space.n) return CategoricalPdType(ac_space.n)
elif isinstance(ac_space, spaces.MultiDiscrete): elif isinstance(ac_space, spaces.MultiDiscrete):
return MultiCategoricalPdType(ac_space.low, ac_space.high) return MultiCategoricalPdType(ac_space.nvec)
elif isinstance(ac_space, spaces.MultiBinary): elif isinstance(ac_space, spaces.MultiBinary):
return BernoulliPdType(ac_space.n) return BernoulliPdType(ac_space.n)
else: else:
@@ -259,6 +254,11 @@ def test_probtypes():
categorical = CategoricalPdType(pdparam_categorical.size) #pylint: disable=E1101 categorical = CategoricalPdType(pdparam_categorical.size) #pylint: disable=E1101
validate_probtype(categorical, pdparam_categorical) validate_probtype(categorical, pdparam_categorical)
nvec = [1,2,3]
pdparam_multicategorical = np.array([-.2, .3, .5, .1, 1, -.1])
multicategorical = MultiCategoricalPdType(nvec) #pylint: disable=E1101
validate_probtype(multicategorical, pdparam_multicategorical)
pdparam_bernoulli = np.array([-.2, .3, .5]) pdparam_bernoulli = np.array([-.2, .3, .5])
bernoulli = BernoulliPdType(pdparam_bernoulli.size) #pylint: disable=E1101 bernoulli = BernoulliPdType(pdparam_bernoulli.size) #pylint: disable=E1101
validate_probtype(bernoulli, pdparam_bernoulli) validate_probtype(bernoulli, pdparam_bernoulli)
@@ -270,10 +270,10 @@ def validate_probtype(probtype, pdparam):
Mval = np.repeat(pdparam[None, :], N, axis=0) Mval = np.repeat(pdparam[None, :], N, axis=0)
M = probtype.param_placeholder([N]) M = probtype.param_placeholder([N])
X = probtype.sample_placeholder([N]) X = probtype.sample_placeholder([N])
pd = probtype.pdclass()(M) pd = probtype.pdfromflat(M)
calcloglik = U.function([X, M], pd.logp(X)) calcloglik = U.function([X, M], pd.logp(X))
calcent = U.function([M], pd.entropy()) calcent = U.function([M], pd.entropy())
Xval = U.eval(pd.sample(), feed_dict={M:Mval}) Xval = tf.get_default_session().run(pd.sample(), feed_dict={M:Mval})
logliks = calcloglik(Xval, Mval) logliks = calcloglik(Xval, Mval)
entval_ll = - logliks.mean() #pylint: disable=E1101 entval_ll = - logliks.mean() #pylint: disable=E1101
entval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101 entval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101
@@ -282,7 +282,7 @@ def validate_probtype(probtype, pdparam):
# Check to see if kldiv[p,q] = - ent[p] - E_p[log q] # Check to see if kldiv[p,q] = - ent[p] - E_p[log q]
M2 = probtype.param_placeholder([N]) M2 = probtype.param_placeholder([N])
pd2 = probtype.pdclass()(M2) pd2 = probtype.pdfromflat(M2)
q = pdparam + np.random.randn(pdparam.size) * 0.1 q = pdparam + np.random.randn(pdparam.size) * 0.1
Mval2 = np.repeat(q[None, :], N, axis=0) Mval2 = np.repeat(q[None, :], N, axis=0)
calckl = U.function([M, M2], pd.kl(pd2)) calckl = U.function([M, M2], pd.kl(pd2))
@@ -291,3 +291,5 @@ def validate_probtype(probtype, pdparam):
klval_ll = - entval - logliks.mean() #pylint: disable=E1101 klval_ll = - entval - logliks.mean() #pylint: disable=E1101
klval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101 klval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101
assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr # within 3 sigmas assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr # within 3 sigmas
print('ok on', probtype, pdparam)
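
A sketch of the nvec-based MultiDiscrete path introduced above: the flat logits are split per sub-action, so a space with nvec = [2, 3] expects sum(nvec) = 5 logits. The logit values are arbitrary and the module path baselines.common.distributions is assumed:

import tensorflow as tf
from gym import spaces
from baselines.common.distributions import make_pdtype

ac_space = spaces.MultiDiscrete([2, 3])            # two sub-actions with 2 and 3 choices
pdtype = make_pdtype(ac_space)                     # -> MultiCategoricalPdType(ac_space.nvec)
flat = tf.constant([[0.1, -0.2, 0.3, 0.0, -0.1]])  # batch of 1, sum(nvec) = 5 logits
pd = pdtype.pdfromflat(flat)
with tf.Session() as sess:
    print(sess.run([pd.mode(), pd.sample(), pd.entropy()]))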

View File

@@ -53,7 +53,7 @@ class MpiAdam(object):
def test_MpiAdam(): def test_MpiAdam():
np.random.seed(0) np.random.seed(0)
tf.set_random_seed(0) tf.set_random_seed(0)
a = tf.Variable(np.random.randn(3).astype('float32')) a = tf.Variable(np.random.randn(3).astype('float32'))
b = tf.Variable(np.random.randn(2,5).astype('float32')) b = tf.Variable(np.random.randn(2,5).astype('float32'))
loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b)) loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b))

View File

@@ -2,29 +2,41 @@ from mpi4py import MPI
import numpy as np import numpy as np
from baselines.common import zipsame from baselines.common import zipsame
def mpi_moments(x, axis=0): def mpi_mean(x, axis=0, comm=None, keepdims=False):
x = np.asarray(x, dtype='float64') x = np.asarray(x)
newshape = list(x.shape) assert x.ndim > 0
newshape.pop(axis) if comm is None: comm = MPI.COMM_WORLD
n = np.prod(newshape,dtype=int) xsum = x.sum(axis=axis, keepdims=keepdims)
totalvec = np.zeros(n*2+1, 'float64') n = xsum.size
addvec = np.concatenate([x.sum(axis=axis).ravel(), localsum = np.zeros(n+1, x.dtype)
np.square(x).sum(axis=axis).ravel(), localsum[:n] = xsum.ravel()
np.array([x.shape[axis]],dtype='float64')]) localsum[n] = x.shape[axis]
MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM) globalsum = np.zeros_like(localsum)
sum = totalvec[:n] comm.Allreduce(localsum, globalsum, op=MPI.SUM)
sumsq = totalvec[n:2*n] return globalsum[:n].reshape(xsum.shape) / globalsum[n], globalsum[n]
count = totalvec[2*n]
if count == 0: def mpi_moments(x, axis=0, comm=None, keepdims=False):
mean = np.empty(newshape); mean[:] = np.nan x = np.asarray(x)
std = np.empty(newshape); std[:] = np.nan assert x.ndim > 0
else: mean, count = mpi_mean(x, axis=axis, comm=comm, keepdims=True)
mean = sum/count sqdiffs = np.square(x - mean)
std = np.sqrt(np.maximum(sumsq/count - np.square(mean),0)) meansqdiff, count1 = mpi_mean(sqdiffs, axis=axis, comm=comm, keepdims=True)
assert count1 == count
std = np.sqrt(meansqdiff)
if not keepdims:
newshape = mean.shape[:axis] + mean.shape[axis+1:]
mean = mean.reshape(newshape)
std = std.reshape(newshape)
return mean, std, count return mean, std, count
def test_runningmeanstd(): def test_runningmeanstd():
import subprocess
subprocess.check_call(['mpirun', '-np', '3',
'python','-c',
'from baselines.common.mpi_moments import _helper_runningmeanstd; _helper_runningmeanstd()'])
def _helper_runningmeanstd():
comm = MPI.COMM_WORLD comm = MPI.COMM_WORLD
np.random.seed(0) np.random.seed(0)
for (triple,axis) in [ for (triple,axis) in [
@@ -45,6 +57,3 @@ def test_runningmeanstd():
assert np.allclose(a1, a2) assert np.allclose(a1, a2)
print("ok!") print("ok!")
if __name__ == "__main__":
#mpirun -np 3 python <script>
test_runningmeanstd()
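
A sketch of the refactored mpi_moments in use: each rank contributes its own samples and every rank gets back the pooled mean, std, and count (run under mpirun; the array contents are arbitrary):

from mpi4py import MPI
import numpy as np
from baselines.common.mpi_moments import mpi_moments

comm = MPI.COMM_WORLD
x = np.random.RandomState(comm.rank).randn(10, 3)   # different data on every rank
mean, std, count = mpi_moments(x, axis=0)
if comm.rank == 0:
    print(mean.shape, std.shape, count)              # (3,), (3,), 10 * comm.size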

View File

@@ -57,7 +57,7 @@ def test_runningmeanstd():
rms.update(x1) rms.update(x1)
rms.update(x2) rms.update(x2)
rms.update(x3) rms.update(x3)
ms2 = U.eval([rms.mean, rms.std]) ms2 = [rms.mean.eval(), rms.std.eval()]
assert np.allclose(ms1, ms2) assert np.allclose(ms1, ms2)
@@ -94,11 +94,11 @@ def test_dist():
assert checkallclose( assert checkallclose(
bigvec.mean(axis=0), bigvec.mean(axis=0),
U.eval(rms.mean) rms.mean.eval(),
) )
assert checkallclose( assert checkallclose(
bigvec.std(axis=0), bigvec.std(axis=0),
U.eval(rms.std) rms.std.eval(),
) )

View File

@@ -6,12 +6,13 @@ class RunningMeanStd(object):
self.var = np.ones(shape, 'float64') self.var = np.ones(shape, 'float64')
self.count = epsilon self.count = epsilon
def update(self, x): def update(self, x):
batch_mean = np.mean(x, axis=0) batch_mean = np.mean(x, axis=0)
batch_var = np.var(x, axis=0) batch_var = np.var(x, axis=0)
batch_count = x.shape[0] batch_count = x.shape[0]
self.update_from_moments(batch_mean, batch_var, batch_count)
def update_from_moments(self, batch_mean, batch_var, batch_count):
delta = batch_mean - self.mean delta = batch_mean - self.mean
tot_count = self.count + batch_count tot_count = self.count + batch_count
@@ -25,4 +26,21 @@ class RunningMeanStd(object):
self.mean = new_mean self.mean = new_mean
self.var = new_var self.var = new_var
self.count = new_count self.count = new_count
def test_runningmeanstd():
for (x1, x2, x3) in [
(np.random.randn(3), np.random.randn(4), np.random.randn(5)),
(np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)),
]:
rms = RunningMeanStd(epsilon=0.0, shape=x1.shape[1:])
x = np.concatenate([x1, x2, x3], axis=0)
ms1 = [x.mean(axis=0), x.var(axis=0)]
rms.update(x1)
rms.update(x2)
rms.update(x3)
ms2 = [rms.mean, rms.var]
assert np.allclose(ms1, ms2)
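
A sketch of the new update_from_moments entry point, which merges precomputed batch statistics into the running estimate without passing the raw batch (shapes and sizes are illustrative):

import numpy as np

rms = RunningMeanStd(shape=(3,))
batch = np.random.randn(64, 3)
# equivalent to rms.update(batch), but fed with the batch moments directly
rms.update_from_moments(batch.mean(axis=0), batch.var(axis=0), batch.shape[0])
print(rms.mean, np.sqrt(rms.var))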

View File

@@ -3,30 +3,10 @@ import tensorflow as tf
from baselines.common.tf_util import ( from baselines.common.tf_util import (
function, function,
initialize, initialize,
set_value,
single_threaded_session single_threaded_session
) )
def test_set_value():
a = tf.Variable(42.)
with single_threaded_session():
set_value(a, 5)
assert a.eval() == 5
g = tf.get_default_graph()
g.finalize()
set_value(a, 6)
assert a.eval() == 6
# test the test
try:
assert a.eval() == 7
except AssertionError:
pass
else:
assert False, "assertion should have failed"
def test_function(): def test_function():
tf.reset_default_graph() tf.reset_default_graph()
x = tf.placeholder(tf.int32, (), name="x") x = tf.placeholder(tf.int32, (), name="x")
@@ -38,9 +18,7 @@ def test_function():
initialize() initialize()
assert lin(2) == 6 assert lin(2) == 6
assert lin(x=3) == 9
assert lin(2, 2) == 10 assert lin(2, 2) == 10
assert lin(x=2, y=3) == 12
def test_multikwargs(): def test_multikwargs():
@@ -56,14 +34,8 @@ def test_multikwargs():
assert lin(2) == 6 assert lin(2) == 6
assert lin(2, 2) == 10 assert lin(2, 2) == 10
expt_caught = False expt_caught = False
try:
lin(x=2)
except AssertionError:
expt_caught = True
assert expt_caught
if __name__ == '__main__': if __name__ == '__main__':
test_set_value()
test_function() test_function()
test_multikwargs() test_multikwargs()

View File

@@ -1,45 +1,10 @@
import numpy as np import numpy as np
import tensorflow as tf # pylint: ignore-module import tensorflow as tf # pylint: ignore-module
import builtins
import functools
import copy import copy
import os import os
import functools
import collections import collections
import multiprocessing
# ================================================================
# Make consistent with numpy
# ================================================================
clip = tf.clip_by_value
def sum(x, axis=None, keepdims=False):
axis = None if axis is None else [axis]
return tf.reduce_sum(x, axis=axis, keep_dims=keepdims)
def mean(x, axis=None, keepdims=False):
axis = None if axis is None else [axis]
return tf.reduce_mean(x, axis=axis, keep_dims=keepdims)
def var(x, axis=None, keepdims=False):
meanx = mean(x, axis=axis, keepdims=keepdims)
return mean(tf.square(x - meanx), axis=axis, keepdims=keepdims)
def std(x, axis=None, keepdims=False):
return tf.sqrt(var(x, axis=axis, keepdims=keepdims))
def max(x, axis=None, keepdims=False):
axis = None if axis is None else [axis]
return tf.reduce_max(x, axis=axis, keep_dims=keepdims)
def min(x, axis=None, keepdims=False):
axis = None if axis is None else [axis]
return tf.reduce_min(x, axis=axis, keep_dims=keepdims)
def concatenate(arrs, axis=0):
return tf.concat(axis=axis, values=arrs)
def argmax(x, axis=None):
return tf.argmax(x, axis=axis)
def switch(condition, then_expression, else_expression): def switch(condition, then_expression, else_expression):
"""Switches between two operations depending on a scalar value (int or bool). """Switches between two operations depending on a scalar value (int or bool).
@@ -62,105 +27,11 @@ def switch(condition, then_expression, else_expression):
# Extras # Extras
# ================================================================ # ================================================================
def l2loss(params):
if len(params) == 0:
return tf.constant(0.0)
else:
return tf.add_n([sum(tf.square(p)) for p in params])
def lrelu(x, leak=0.2): def lrelu(x, leak=0.2):
f1 = 0.5 * (1 + leak) f1 = 0.5 * (1 + leak)
f2 = 0.5 * (1 - leak) f2 = 0.5 * (1 - leak)
return f1 * x + f2 * abs(x) return f1 * x + f2 * abs(x)
def categorical_sample_logits(X):
# https://github.com/tensorflow/tensorflow/issues/456
U = tf.random_uniform(tf.shape(X))
return argmax(X - tf.log(-tf.log(U)), axis=1)
# ================================================================
# Inputs
# ================================================================
def is_placeholder(x):
return type(x) is tf.Tensor and len(x.op.inputs) == 0
class TfInput(object):
def __init__(self, name="(unnamed)"):
"""Generalized Tensorflow placeholder. The main differences are:
- possibly uses multiple placeholders internally and returns multiple values
- can apply light postprocessing to the value feed to placeholder.
"""
self.name = name
def get(self):
"""Return the tf variable(s) representing the possibly postprocessed value
of placeholder(s).
"""
raise NotImplemented()
def make_feed_dict(data):
"""Given data input it to the placeholder(s)."""
raise NotImplemented()
class PlacholderTfInput(TfInput):
def __init__(self, placeholder):
"""Wrapper for regular tensorflow placeholder."""
super().__init__(placeholder.name)
self._placeholder = placeholder
def get(self):
return self._placeholder
def make_feed_dict(self, data):
return {self._placeholder: data}
class BatchInput(PlacholderTfInput):
def __init__(self, shape, dtype=tf.float32, name=None):
"""Creates a placeholder for a batch of tensors of a given shape and dtype
Parameters
----------
shape: [int]
shape of a single elemenet of the batch
dtype: tf.dtype
number representation used for tensor contents
name: str
name of the underlying placeholder
"""
super().__init__(tf.placeholder(dtype, [None] + list(shape), name=name))
class Uint8Input(PlacholderTfInput):
def __init__(self, shape, name=None):
"""Takes input in uint8 format which is cast to float32 and divided by 255
before passing it to the model.
On GPU this ensures lower data transfer times.
Parameters
----------
shape: [int]
shape of the tensor.
name: str
name of the underlying placeholder
"""
super().__init__(tf.placeholder(tf.uint8, [None] + list(shape), name=name))
self._shape = shape
self._output = tf.cast(super().get(), tf.float32) / 255.0
def get(self):
return self._output
def ensure_tf_input(thing):
"""Takes either tf.placeholder of TfInput and outputs equivalent TfInput"""
if isinstance(thing, TfInput):
return thing
elif is_placeholder(thing):
return PlacholderTfInput(thing)
else:
raise ValueError("Must be a placeholder or TfInput")
# ================================================================ # ================================================================
# Mathematical utils # Mathematical utils
# ================================================================ # ================================================================
@@ -173,96 +44,42 @@ def huber_loss(x, delta=1.0):
delta * (tf.abs(x) - 0.5 * delta) delta * (tf.abs(x) - 0.5 * delta)
) )
def logsigmoid(a):
'''Equivalent to tf.log(tf.sigmoid(a))'''
return -tf.nn.softplus(-a)
""" Reference: https://github.com/openai/imitation/blob/99fbccf3e060b6e6c739bdf209758620fcdefd3c/policyopt/thutil.py#L48-L51"""
def logit_bernoulli_entropy(logits):
ent = (1.-tf.nn.sigmoid(logits))*logits - logsigmoid(logits)
return ent
# ================================================================
# Optimizer utils
# ================================================================
def minimize_and_clip(optimizer, objective, var_list, clip_val=10):
"""Minimized `objective` using `optimizer` w.r.t. variables in
`var_list` while ensure the norm of the gradients for each
variable is clipped to `clip_val`
"""
gradients = optimizer.compute_gradients(objective, var_list=var_list)
for i, (grad, var) in enumerate(gradients):
if grad is not None:
gradients[i] = (tf.clip_by_norm(grad, clip_val), var)
return optimizer.apply_gradients(gradients)
# ================================================================ # ================================================================
# Global session # Global session
# ================================================================ # ================================================================
def get_session(): def make_session(num_cpu=None, make_default=False):
"""Returns recently made Tensorflow session"""
return tf.get_default_session()
def make_session(num_cpu):
"""Returns a session that will use <num_cpu> CPU's only""" """Returns a session that will use <num_cpu> CPU's only"""
if num_cpu is None:
num_cpu = int(os.getenv('RCALL_NUM_CPU', multiprocessing.cpu_count()))
tf_config = tf.ConfigProto( tf_config = tf.ConfigProto(
inter_op_parallelism_threads=num_cpu, inter_op_parallelism_threads=num_cpu,
intra_op_parallelism_threads=num_cpu) intra_op_parallelism_threads=num_cpu)
return tf.Session(config=tf_config) tf_config.gpu_options.allocator_type = 'BFC'
if make_default:
return tf.InteractiveSession(config=tf_config)
else:
return tf.Session(config=tf_config)
def single_threaded_session(): def single_threaded_session():
"""Returns a session which will only use a single CPU""" """Returns a session which will only use a single CPU"""
return make_session(1) return make_session(num_cpu=1)
def in_session(f):
@functools.wraps(f)
def newfunc(*args, **kwargs):
with tf.Session():
f(*args, **kwargs)
return newfunc
ALREADY_INITIALIZED = set() ALREADY_INITIALIZED = set()
def initialize(): def initialize():
"""Initialize all the uninitialized variables in the global scope.""" """Initialize all the uninitialized variables in the global scope."""
new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED
get_session().run(tf.variables_initializer(new_variables)) tf.get_default_session().run(tf.variables_initializer(new_variables))
ALREADY_INITIALIZED.update(new_variables) ALREADY_INITIALIZED.update(new_variables)
def eval(expr, feed_dict=None):
if feed_dict is None:
feed_dict = {}
return get_session().run(expr, feed_dict=feed_dict)
VALUE_SETTERS = collections.OrderedDict()
def set_value(v, val):
global VALUE_SETTERS
if v in VALUE_SETTERS:
set_op, set_endpoint = VALUE_SETTERS[v]
else:
set_endpoint = tf.placeholder(v.dtype)
set_op = v.assign(set_endpoint)
VALUE_SETTERS[v] = (set_op, set_endpoint)
get_session().run(set_op, feed_dict={set_endpoint: val})
# ================================================================
# Save tensorflow summary
# ================================================================
def file_writer(dir_path):
os.makedirs(dir_path, exist_ok=True)
return tf.summary.FileWriter(dir_path, get_session().graph)
# ================================================================
# Saving variables
# ================================================================
def load_state(fname, var_list=None):
saver = tf.train.Saver(var_list=var_list)
saver.restore(get_session(), fname)
def save_state(fname, var_list=None):
os.makedirs(os.path.dirname(fname), exist_ok=True)
saver = tf.train.Saver(var_list=var_list)
saver.save(get_session(), fname)
# ================================================================ # ================================================================
# Model components # Model components
# ================================================================ # ================================================================
@@ -303,36 +120,6 @@ def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME",
return tf.nn.conv2d(x, w, stride_shape, pad) + b return tf.nn.conv2d(x, w, stride_shape, pad) + b
def dense(x, size, name, weight_init=None, bias=True):
w = tf.get_variable(name + "/w", [x.get_shape()[1], size], initializer=weight_init)
ret = tf.matmul(x, w)
if bias:
b = tf.get_variable(name + "/b", [size], initializer=tf.zeros_initializer())
return ret + b
else:
return ret
def wndense(x, size, name, init_scale=1.0):
v = tf.get_variable(name + "/V", [int(x.get_shape()[1]), size],
initializer=tf.random_normal_initializer(0, 0.05))
g = tf.get_variable(name + "/g", [size], initializer=tf.constant_initializer(init_scale))
b = tf.get_variable(name + "/b", [size], initializer=tf.constant_initializer(0.0))
# use weight normalization (Salimans & Kingma, 2016)
x = tf.matmul(x, v)
scaler = g / tf.sqrt(sum(tf.square(v), axis=0, keepdims=True))
return tf.reshape(scaler, [1, size]) * x + tf.reshape(b, [1, size])
def densenobias(x, size, name, weight_init=None):
return dense(x, size, name, weight_init=weight_init, bias=False)
def dropout(x, pkeep, phase=None, mask=None):
mask = tf.floor(pkeep + tf.random_uniform(tf.shape(x))) if mask is None else mask
if phase is None:
return mask * x
else:
return switch(phase, mask * x, pkeep * x)
# ================================================================ # ================================================================
# Theano-like Function # Theano-like Function
# ================================================================ # ================================================================
@@ -362,7 +149,7 @@ def function(inputs, outputs, updates=None, givens=None):
Parameters Parameters
---------- ----------
inputs: [tf.placeholder or TfInput] inputs: [tf.placeholder, tf.constant, or object with make_feed_dict method]
list of input arguments list of input arguments
outputs: [tf.Variable] or tf.Variable outputs: [tf.Variable] or tf.Variable
list of outputs or a single output to be returned from function. Returned list of outputs or a single output to be returned from function. Returned
@@ -377,183 +164,36 @@ def function(inputs, outputs, updates=None, givens=None):
f = _Function(inputs, [outputs], updates, givens=givens) f = _Function(inputs, [outputs], updates, givens=givens)
return lambda *args, **kwargs: f(*args, **kwargs)[0] return lambda *args, **kwargs: f(*args, **kwargs)[0]
class _Function(object): class _Function(object):
def __init__(self, inputs, outputs, updates, givens, check_nan=False): def __init__(self, inputs, outputs, updates, givens):
for inpt in inputs: for inpt in inputs:
if not issubclass(type(inpt), TfInput): if not hasattr(inpt, 'make_feed_dict') and not (type(inpt) is tf.Tensor and len(inpt.op.inputs) == 0):
assert len(inpt.op.inputs) == 0, "inputs should all be placeholders of baselines.common.TfInput" assert False, "inputs should all be placeholders, constants, or have a make_feed_dict method"
self.inputs = inputs self.inputs = inputs
updates = updates or [] updates = updates or []
self.update_group = tf.group(*updates) self.update_group = tf.group(*updates)
self.outputs_update = list(outputs) + [self.update_group] self.outputs_update = list(outputs) + [self.update_group]
self.givens = {} if givens is None else givens self.givens = {} if givens is None else givens
self.check_nan = check_nan
def _feed_input(self, feed_dict, inpt, value): def _feed_input(self, feed_dict, inpt, value):
if issubclass(type(inpt), TfInput): if hasattr(inpt, 'make_feed_dict'):
feed_dict.update(inpt.make_feed_dict(value)) feed_dict.update(inpt.make_feed_dict(value))
elif is_placeholder(inpt): else:
feed_dict[inpt] = value feed_dict[inpt] = value
def __call__(self, *args, **kwargs): def __call__(self, *args):
assert len(args) <= len(self.inputs), "Too many arguments provided" assert len(args) <= len(self.inputs), "Too many arguments provided"
feed_dict = {} feed_dict = {}
# Update the args # Update the args
for inpt, value in zip(self.inputs, args): for inpt, value in zip(self.inputs, args):
self._feed_input(feed_dict, inpt, value) self._feed_input(feed_dict, inpt, value)
# Update the kwargs
kwargs_passed_inpt_names = set()
for inpt in self.inputs[len(args):]:
inpt_name = inpt.name.split(':')[0]
inpt_name = inpt_name.split('/')[-1]
assert inpt_name not in kwargs_passed_inpt_names, \
"this function has two arguments with the same name \"{}\", so kwargs cannot be used.".format(inpt_name)
if inpt_name in kwargs:
kwargs_passed_inpt_names.add(inpt_name)
self._feed_input(feed_dict, inpt, kwargs.pop(inpt_name))
else:
assert inpt in self.givens, "Missing argument " + inpt_name
assert len(kwargs) == 0, "Function got extra arguments " + str(list(kwargs.keys()))
# Update feed dict with givens. # Update feed dict with givens.
for inpt in self.givens: for inpt in self.givens:
feed_dict[inpt] = feed_dict.get(inpt, self.givens[inpt]) feed_dict[inpt] = feed_dict.get(inpt, self.givens[inpt])
results = get_session().run(self.outputs_update, feed_dict=feed_dict)[:-1] results = tf.get_default_session().run(self.outputs_update, feed_dict=feed_dict)[:-1]
if self.check_nan:
if any(np.isnan(r).any() for r in results):
raise RuntimeError("Nan detected")
return results return results
def mem_friendly_function(nondata_inputs, data_inputs, outputs, batch_size):
if isinstance(outputs, list):
return _MemFriendlyFunction(nondata_inputs, data_inputs, outputs, batch_size)
else:
f = _MemFriendlyFunction(nondata_inputs, data_inputs, [outputs], batch_size)
return lambda *inputs: f(*inputs)[0]
class _MemFriendlyFunction(object):
def __init__(self, nondata_inputs, data_inputs, outputs, batch_size):
self.nondata_inputs = nondata_inputs
self.data_inputs = data_inputs
self.outputs = list(outputs)
self.batch_size = batch_size
def __call__(self, *inputvals):
assert len(inputvals) == len(self.nondata_inputs) + len(self.data_inputs)
nondata_vals = inputvals[0:len(self.nondata_inputs)]
data_vals = inputvals[len(self.nondata_inputs):]
feed_dict = dict(zip(self.nondata_inputs, nondata_vals))
n = data_vals[0].shape[0]
for v in data_vals[1:]:
assert v.shape[0] == n
for i_start in range(0, n, self.batch_size):
slice_vals = [v[i_start:builtins.min(i_start + self.batch_size, n)] for v in data_vals]
for (var, val) in zip(self.data_inputs, slice_vals):
feed_dict[var] = val
results = tf.get_default_session().run(self.outputs, feed_dict=feed_dict)
if i_start == 0:
sum_results = results
else:
for i in range(len(results)):
sum_results[i] = sum_results[i] + results[i]
for i in range(len(results)):
sum_results[i] = sum_results[i] / n
return sum_results
# ================================================================
# Modules
# ================================================================
class Module(object):
def __init__(self, name):
self.name = name
self.first_time = True
self.scope = None
self.cache = {}
def __call__(self, *args):
if args in self.cache:
print("(%s) retrieving value from cache" % (self.name,))
return self.cache[args]
with tf.variable_scope(self.name, reuse=not self.first_time):
scope = tf.get_variable_scope().name
if self.first_time:
self.scope = scope
print("(%s) running function for the first time" % (self.name,))
else:
assert self.scope == scope, "Tried calling function with a different scope"
print("(%s) running function on new inputs" % (self.name,))
self.first_time = False
out = self._call(*args)
self.cache[args] = out
return out
def _call(self, *args):
raise NotImplementedError
@property
def trainable_variables(self):
assert self.scope is not None, "need to call module once before getting variables"
return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
@property
def variables(self):
assert self.scope is not None, "need to call module once before getting variables"
return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)
def module(name):
@functools.wraps
def wrapper(f):
class WrapperModule(Module):
def _call(self, *args):
return f(*args)
return WrapperModule(name)
return wrapper
# ================================================================
# Graph traversal
# ================================================================
VARIABLES = {}
def get_parents(node):
return node.op.inputs
def topsorted(outputs):
"""
Topological sort via non-recursive depth-first search
"""
assert isinstance(outputs, (list, tuple))
marks = {}
out = []
stack = [] # pylint: disable=W0621
# i: node
# jidx = number of children visited so far from that node
# marks: state of each node, which is one of
# 0: haven't visited
# 1: have visited, but not done visiting children
# 2: done visiting children
for x in outputs:
stack.append((x, 0))
while stack:
(i, jidx) = stack.pop()
if jidx == 0:
m = marks.get(i, 0)
if m == 0:
marks[i] = 1
elif m == 1:
raise ValueError("not a dag")
else:
continue
ps = get_parents(i)
if jidx == len(ps):
marks[i] = 2
out.append(i)
else:
stack.append((i, jidx + 1))
j = ps[jidx]
stack.append((j, 0))
return out
# ================================================================ # ================================================================
# Flat vectors # Flat vectors
# ================================================================ # ================================================================
@@ -595,88 +235,14 @@ class SetFromFlat(object):
self.op = tf.group(*assigns) self.op = tf.group(*assigns)
def __call__(self, theta): def __call__(self, theta):
get_session().run(self.op, feed_dict={self.theta: theta}) tf.get_default_session().run(self.op, feed_dict={self.theta: theta})
class GetFlat(object): class GetFlat(object):
def __init__(self, var_list): def __init__(self, var_list):
self.op = tf.concat(axis=0, values=[tf.reshape(v, [numel(v)]) for v in var_list]) self.op = tf.concat(axis=0, values=[tf.reshape(v, [numel(v)]) for v in var_list])
def __call__(self): def __call__(self):
return get_session().run(self.op) return tf.get_default_session().run(self.op)
# ================================================================
# Misc
# ================================================================
def fancy_slice_2d(X, inds0, inds1):
"""
like numpy X[inds0, inds1]
XXX this implementation is bad
"""
inds0 = tf.cast(inds0, tf.int64)
inds1 = tf.cast(inds1, tf.int64)
shape = tf.cast(tf.shape(X), tf.int64)
ncols = shape[1]
Xflat = tf.reshape(X, [-1])
return tf.gather(Xflat, inds0 * ncols + inds1)
# ================================================================
# Scopes
# ================================================================
def scope_vars(scope, trainable_only=False):
"""
Get variables inside a scope
The scope can be specified as a string
Parameters
----------
scope: str or VariableScope
scope in which the variables reside.
trainable_only: bool
whether or not to return only the variables that were marked as trainable.
Returns
-------
vars: [tf.Variable]
list of variables in `scope`.
"""
return tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES if trainable_only else tf.GraphKeys.GLOBAL_VARIABLES,
scope=scope if isinstance(scope, str) else scope.name
)
def scope_name():
"""Returns the name of current scope as a string, e.g. deepq/q_func"""
return tf.get_variable_scope().name
def absolute_scope_name(relative_scope_name):
"""Appends parent scope name to `relative_scope_name`"""
return scope_name() + "/" + relative_scope_name
def lengths_to_mask(lengths_b, max_length):
"""
Turns a vector of lengths into a boolean mask
Args:
lengths_b: an integer vector of lengths
max_length: maximum length to fill the mask
Returns:
a boolean array of shape (batch_size, max_length)
row[i] consists of True repeated lengths_b[i] times, followed by False
"""
lengths_b = tf.convert_to_tensor(lengths_b)
assert lengths_b.get_shape().ndims == 1
mask_bt = tf.expand_dims(tf.range(max_length), 0) < tf.expand_dims(lengths_b, 1)
return mask_bt
def in_session(f):
@functools.wraps(f)
def newfunc(*args, **kwargs):
with tf.Session():
f(*args, **kwargs)
return newfunc
_PLACEHOLDER_CACHE = {} # name -> (placeholder, dtype, shape) _PLACEHOLDER_CACHE = {} # name -> (placeholder, dtype, shape)
@@ -695,10 +261,3 @@ def get_placeholder_cached(name):
def flattenallbut0(x): def flattenallbut0(x):
return tf.reshape(x, [-1, intprod(x.get_shape().as_list()[1:])]) return tf.reshape(x, [-1, intprod(x.get_shape().as_list()[1:])])
def reset():
global _PLACEHOLDER_CACHE
global VARIABLES
_PLACEHOLDER_CACHE = {}
VARIABLES = {}
tf.reset_default_graph()
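
A sketch of the slimmed-down tf_util surface after this change: make_session plus the Theano-style function helper, now called with positional arguments only (the toy graph is made up):

import tensorflow as tf
from baselines.common import tf_util as U

x = tf.placeholder(tf.float32, (), name="x")
y = tf.placeholder(tf.float32, (), name="y")
f = U.function([x, y], 2 * x + y)
with U.make_session(num_cpu=1):
    U.initialize()
    print(f(3.0, 1.0))   # -> 7.0; keyword calls like f(x=3.0) were removed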

View File

@@ -1,19 +1,119 @@
from abc import ABC, abstractmethod
from baselines import logger

class AlreadySteppingError(Exception):
    """
    Raised when an asynchronous step is running while
    step_async() is called again.
    """
    def __init__(self):
        msg = 'already running an async step'
        Exception.__init__(self, msg)

class NotSteppingError(Exception):
    """
    Raised when an asynchronous step is not running but
    step_wait() is called.
    """
    def __init__(self):
        msg = 'not running an async step'
        Exception.__init__(self, msg)

class VecEnv(ABC):
    """
    An abstract asynchronous, vectorized environment.
    """
    def __init__(self, num_envs, observation_space, action_space):
        self.num_envs = num_envs
        self.observation_space = observation_space
        self.action_space = action_space

    @abstractmethod
    def reset(self):
        """
        Reset all the environments and return an array of
        observations.
        If step_async is still doing work, that work will
        be cancelled and step_wait() should not be called
        until step_async() is invoked again.
        """
        pass

    @abstractmethod
    def step_async(self, actions):
        """
        Tell all the environments to start taking a step
        with the given actions.
        Call step_wait() to get the results of the step.
        You should not call this if a step_async run is
        already pending.
        """
        pass

    @abstractmethod
    def step_wait(self):
        """
        Wait for the step taken with step_async().
        Returns (obs, rews, dones, infos):
         - obs: an array of observations
         - rews: an array of rewards
         - dones: an array of "episode done" booleans
         - infos: an array of info objects
        """
        pass

    @abstractmethod
    def close(self):
        """
        Clean up the environments' resources.
        """
        pass

    def step(self, actions):
        self.step_async(actions)
        return self.step_wait()

    def render(self):
        logger.warn('Render not defined for %s' % self)

class VecEnvWrapper(VecEnv):
    def __init__(self, venv, observation_space=None, action_space=None):
        self.venv = venv
        VecEnv.__init__(self,
            num_envs=venv.num_envs,
            observation_space=observation_space or venv.observation_space,
            action_space=action_space or venv.action_space)

    def step_async(self, actions):
        self.venv.step_async(actions)

    @abstractmethod
    def reset(self):
        pass

    @abstractmethod
    def step_wait(self):
        pass

    def close(self):
        return self.venv.close()

    def render(self):
        self.venv.render()

class CloudpickleWrapper(object):
    """
    Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle)
    """
    def __init__(self, x):
        self.x = x
    def __getstate__(self):
        import cloudpickle
        return cloudpickle.dumps(self.x)
    def __setstate__(self, ob):
        import pickle
        self.x = pickle.loads(ob)
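A minimal sketch of why the wrapper exists: the standard pickle module cannot serialize closures such as the env_fns handed to SubprocVecEnv, but a CloudpickleWrapper around them can be pickled (this assumes cloudpickle is installed; the lambda is purely illustrative).

import pickle

wrapped = CloudpickleWrapper(lambda: 'hello')
blob = pickle.dumps(wrapped)      # works: __getstate__ serializes the lambda with cloudpickle
restored = pickle.loads(blob)
print(restored.x())               # -> 'hello'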

View File

@@ -4,22 +4,28 @@ from . import VecEnv
class DummyVecEnv(VecEnv):
    def __init__(self, env_fns):
        self.envs = [fn() for fn in env_fns]
        env = self.envs[0]
        VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space)
        self.ts = np.zeros(len(self.envs), dtype='int')
        self.actions = None

    def step_async(self, actions):
        self.actions = actions

    def step_wait(self):
        results = [env.step(a) for (a, env) in zip(self.actions, self.envs)]
        obs, rews, dones, infos = map(np.array, zip(*results))
        self.ts += 1
        for (i, done) in enumerate(dones):
            if done:
                obs[i] = self.envs[i].reset()
                self.ts[i] = 0
        self.actions = None
        return np.array(obs), np.array(rews), np.array(dones), infos

    def reset(self):
        results = [env.reset() for env in self.envs]
        return np.array(results)

    def close(self):
        return
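A hypothetical usage sketch of the new split API (environment id, batch size, and module path are illustrative): the inherited VecEnv.step() drives step_async()/step_wait() under the hood.

import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

venv = DummyVecEnv([lambda: gym.make('CartPole-v0') for _ in range(4)])
obs = venv.reset()                                    # shape (4,) + observation_space.shape
actions = [venv.action_space.sample() for _ in range(4)]
obs, rews, dones, infos = venv.step(actions)          # == step_async(actions); step_wait()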

View File

@@ -1,6 +1,6 @@
import numpy as np
from multiprocessing import Process, Pipe
from baselines.common.vec_env import VecEnv, CloudpickleWrapper


def worker(remote, parent_remote, env_fn_wrapper):
@@ -23,30 +23,17 @@ def worker(remote, parent_remote, env_fn_wrapper):
            remote.close()
            break
        elif cmd == 'get_spaces':
            remote.send((env.observation_space, env.action_space))
        else:
            raise NotImplementedError


class SubprocVecEnv(VecEnv):
    def __init__(self, env_fns, spaces=None):
        """
        envs: list of gym environments to run in subprocesses
        """
        self.waiting = False
        self.closed = False
        nenvs = len(env_fns)
        self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)])
@@ -59,13 +46,17 @@ class SubprocVecEnv(VecEnv):
            remote.close()

        self.remotes[0].send(('get_spaces', None))
        observation_space, action_space = self.remotes[0].recv()
        VecEnv.__init__(self, len(env_fns), observation_space, action_space)

    def step_async(self, actions):
        for remote, action in zip(self.remotes, actions):
            remote.send(('step', action))
        self.waiting = True

    def step_wait(self):
        results = [remote.recv() for remote in self.remotes]
        self.waiting = False
        obs, rews, dones, infos = zip(*results)
        return np.stack(obs), np.stack(rews), np.stack(dones), infos
@@ -82,13 +73,11 @@ class SubprocVecEnv(VecEnv):
    def close(self):
        if self.closed:
            return
        if self.waiting:
            for remote in self.remotes:
                remote.recv()
        for remote in self.remotes:
            remote.send(('close', None))
        for p in self.ps:
            p.join()
        self.closed = True
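A sketch of driving the subprocess version directly, with the async split made explicit (env id and count are illustrative; the __main__ guard is needed because of multiprocessing):

import gym
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

def make_env(seed):
    def _thunk():
        env = gym.make('CartPole-v0')
        env.seed(seed)
        return env
    return _thunk

if __name__ == '__main__':
    venv = SubprocVecEnv([make_env(i) for i in range(4)])
    obs = venv.reset()
    venv.step_async([venv.action_space.sample() for _ in range(4)])
    # other work can happen here while the worker processes step
    obs, rews, dones, infos = venv.step_wait()
    venv.close()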

View File

@@ -1,8 +1,8 @@
from baselines.common.vec_env import VecEnvWrapper
import numpy as np
from gym import spaces


class VecFrameStack(VecEnvWrapper):
    """
    Vectorized environment base class
    """
@@ -13,22 +13,18 @@ class VecFrameStack(VecEnv):
        low = np.repeat(wos.low, self.nstack, axis=-1)
        high = np.repeat(wos.high, self.nstack, axis=-1)
        self.stackedobs = np.zeros((venv.num_envs,) + low.shape, low.dtype)
        observation_space = spaces.Box(low=low, high=high, dtype=venv.observation_space.dtype)
        VecEnvWrapper.__init__(self, venv, observation_space=observation_space)

    def step_wait(self):
        obs, rews, news, infos = self.venv.step_wait()
        self.stackedobs = np.roll(self.stackedobs, shift=-1, axis=-1)
        for (i, new) in enumerate(news):
            if new:
                self.stackedobs[i] = 0
        self.stackedobs[..., -obs.shape[-1]:] = obs
        return self.stackedobs, rews, news, infos

    def reset(self):
        """
        Reset all environments
@@ -37,14 +33,6 @@ class VecFrameStack(VecEnv):
        self.stackedobs[...] = 0
        self.stackedobs[..., -obs.shape[-1]:] = obs
        return self.stackedobs

    def close(self):
        self.venv.close()
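The stacking bookkeeping in isolation (numpy only, shapes are illustrative): the newest frame always occupies the last channels, and older frames are shifted left by np.roll.

import numpy as np

nstack, nchan = 4, 1
stackedobs = np.zeros((1, 84, 84, nstack * nchan))
new_frame = np.ones((1, 84, 84, nchan))
stackedobs = np.roll(stackedobs, shift=-nchan, axis=-1)
stackedobs[..., -nchan:] = new_frame          # channels now hold [oldest, ..., newest]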

View File

@@ -1,104 +1,47 @@
from baselines.common.vec_env import VecEnvWrapper
from baselines.common.running_mean_std import RunningMeanStd
import numpy as np


class VecNormalize(VecEnvWrapper):
    """
    Vectorized environment base class
    """
    def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        """
        Apply sequence of actions to sequence of environments
        actions -> (observations, rewards, news)
        where 'news' is a boolean vector indicating whether each element is new.
        """
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew)
        return obs, rews, news, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob)
            return obs
        else:
            return obs

    def reset(self):
        """
        Reset all environments
        """
        obs = self.venv.reset()
        return self._obfilt(obs)
@property
def action_space(self):
return self._action_space
@property
def observation_space(self):
return self._observation_space
def close(self):
self.venv.close()
@property
def num_envs(self):
return self.venv.num_envs
class RunningMeanStd(object):
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
def __init__(self, epsilon=1e-4, shape=()):
self.mean = np.zeros(shape, 'float64')
self.var = np.zeros(shape, 'float64')
self.count = epsilon
def update(self, x):
batch_mean = np.mean(x, axis=0)
batch_var = np.var(x, axis=0)
batch_count = x.shape[0]
delta = batch_mean - self.mean
tot_count = self.count + batch_count
new_mean = self.mean + delta * batch_count / tot_count
m_a = self.var * (self.count)
m_b = batch_var * (batch_count)
M2 = m_a + m_b + np.square(delta) * self.count * batch_count / (self.count + batch_count)
new_var = M2 / (self.count + batch_count)
new_count = batch_count + self.count
self.mean = new_mean
self.var = new_var
self.count = new_count
def test_runningmeanstd():
for (x1, x2, x3) in [
(np.random.randn(3), np.random.randn(4), np.random.randn(5)),
(np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)),
]:
rms = RunningMeanStd(epsilon=0.0, shape=x1.shape[1:])
x = np.concatenate([x1, x2, x3], axis=0)
ms1 = [x.mean(axis=0), x.var(axis=0)]
rms.update(x1)
rms.update(x2)
rms.update(x3)
ms2 = [rms.mean, rms.var]
assert np.allclose(ms1, ms2)
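For intuition, the return normalization performed by VecNormalize.step_wait above can be sketched in plain numpy (the rewards and the simple np.var stand-in for ret_rms are illustrative, not taken from the code):

import numpy as np

gamma, epsilon = 0.99, 1e-8
ret = np.zeros(2)                      # one running discounted return per env
for rews in [np.array([1.0, 0.0]), np.array([0.5, 2.0])]:
    ret = ret * gamma + rews
    ret_var = np.var(ret)              # stands in for ret_rms.update(ret) / ret_rms.var
    scaled = np.clip(rews / np.sqrt(ret_var + epsilon), -10., 10.)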

View File

@@ -9,8 +9,7 @@ from baselines import logger
from baselines.common.mpi_adam import MpiAdam
import baselines.common.tf_util as U
from baselines.common.mpi_running_mean_std import RunningMeanStd
from mpi4py import MPI

def normalize(x, stats):
    if stats is None:
@@ -23,6 +22,13 @@ def denormalize(x, stats):
        return x
    return x * stats.std + stats.mean

def reduce_std(x, axis=None, keepdims=False):
    return tf.sqrt(reduce_var(x, axis=axis, keepdims=keepdims))

def reduce_var(x, axis=None, keepdims=False):
    m = tf.reduce_mean(x, axis=axis, keep_dims=True)
    devs_squared = tf.square(x - m)
    return tf.reduce_mean(devs_squared, axis=axis, keep_dims=keepdims)

def get_target_updates(vars, target_vars, tau):
    logger.info('setting up target updates ...')
@@ -198,7 +204,7 @@ class DDPG(object):
new_std = self.ret_rms.std new_std = self.ret_rms.std
self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
new_mean = self.ret_rms.mean new_mean = self.ret_rms.mean
self.renormalize_Q_outputs_op = [] self.renormalize_Q_outputs_op = []
for vs in [self.critic.output_vars, self.target_critic.output_vars]: for vs in [self.critic.output_vars, self.target_critic.output_vars]:
assert len(vs) == 2 assert len(vs) == 2
@@ -213,15 +219,15 @@ class DDPG(object):
def setup_stats(self): def setup_stats(self):
ops = [] ops = []
names = [] names = []
if self.normalize_returns: if self.normalize_returns:
ops += [self.ret_rms.mean, self.ret_rms.std] ops += [self.ret_rms.mean, self.ret_rms.std]
names += ['ret_rms_mean', 'ret_rms_std'] names += ['ret_rms_mean', 'ret_rms_std']
if self.normalize_observations: if self.normalize_observations:
ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)] ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)]
names += ['obs_rms_mean', 'obs_rms_std'] names += ['obs_rms_mean', 'obs_rms_std']
ops += [tf.reduce_mean(self.critic_tf)] ops += [tf.reduce_mean(self.critic_tf)]
names += ['reference_Q_mean'] names += ['reference_Q_mean']
ops += [reduce_std(self.critic_tf)] ops += [reduce_std(self.critic_tf)]
@@ -231,7 +237,7 @@ class DDPG(object):
names += ['reference_actor_Q_mean'] names += ['reference_actor_Q_mean']
ops += [reduce_std(self.critic_with_actor_tf)] ops += [reduce_std(self.critic_with_actor_tf)]
names += ['reference_actor_Q_std'] names += ['reference_actor_Q_std']
ops += [tf.reduce_mean(self.actor_tf)] ops += [tf.reduce_mean(self.actor_tf)]
names += ['reference_action_mean'] names += ['reference_action_mean']
ops += [reduce_std(self.actor_tf)] ops += [reduce_std(self.actor_tf)]
@@ -347,7 +353,7 @@ class DDPG(object):
    def adapt_param_noise(self):
        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={
@@ -358,7 +364,7 @@ class DDPG(object):
            self.param_noise_stddev: self.param_noise.current_stddev,
        })
        mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
        self.param_noise.adapt(mean_distance)
        return mean_distance
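The allreduce-based mean that replaces mpi_mean(), shown on its own (runs under mpiexec with mpi4py; the local value is illustrative):

from mpi4py import MPI

comm = MPI.COMM_WORLD
local_value = float(comm.Get_rank())
mean_value = comm.allreduce(local_value, op=MPI.SUM) / comm.Get_size()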

View File

@@ -25,7 +25,6 @@ def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
# Create envs. # Create envs.
env = gym.make(env_id) env = gym.make(env_id)
env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
gym.logger.setLevel(logging.WARN)
if evaluation and rank==0: if evaluation and rank==0:
eval_env = gym.make(env_id) eval_env = gym.make(env_id)

View File

@@ -4,7 +4,6 @@ from collections import deque
import pickle import pickle
from baselines.ddpg.ddpg import DDPG from baselines.ddpg.ddpg import DDPG
from baselines.ddpg.util import mpi_mean, mpi_std, mpi_max, mpi_sum
import baselines.common.tf_util as U import baselines.common.tf_util as U
from baselines import logger from baselines import logger
@@ -35,7 +34,7 @@ def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, pa
saver = tf.train.Saver() saver = tf.train.Saver()
else: else:
saver = None saver = None
step = 0 step = 0
episode = 0 episode = 0
eval_episode_rewards_history = deque(maxlen=100) eval_episode_rewards_history = deque(maxlen=100)
@@ -138,42 +137,46 @@ def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, pa
                eval_episode_rewards_history.append(eval_episode_reward)
                eval_episode_reward = 0.

        mpi_size = MPI.COMM_WORLD.Get_size()
        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        stats = agent.get_stats()
        combined_stats = stats.copy()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)
        # Evaluation statistics.
        if eval_env is not None:
            combined_stats['eval/return'] = eval_episode_rewards
            combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
            combined_stats['eval/Q'] = eval_qs
            combined_stats['eval/episodes'] = len(eval_episode_rewards)
        def as_scalar(x):
            if isinstance(x, np.ndarray):
                assert x.size == 1
                return x[0]
            elif np.isscalar(x):
                return x
            else:
                raise ValueError('expected scalar, got %s' % x)
        combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()]))
        combined_stats = {k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums)}

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])
        logger.dump_tabular()
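The same pattern applied to a stats dict, as a standalone sketch (keys and values are made up): every rank stacks its scalars into one array, a single allreduce sums them, and dividing by the world size gives the average.

import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
stats = {'rollout/return': 1.0 * comm.Get_rank(), 'total/steps': 100.0}
sums = comm.allreduce(np.array(list(stats.values())))        # default op is MPI.SUM
averaged = {k: v / comm.Get_size() for k, v in zip(stats.keys(), sums)}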

View File

@@ -1,44 +0,0 @@
import numpy as np
import tensorflow as tf
from mpi4py import MPI
from baselines.common.mpi_moments import mpi_moments
def reduce_var(x, axis=None, keepdims=False):
m = tf.reduce_mean(x, axis=axis, keep_dims=True)
devs_squared = tf.square(x - m)
return tf.reduce_mean(devs_squared, axis=axis, keep_dims=keepdims)
def reduce_std(x, axis=None, keepdims=False):
return tf.sqrt(reduce_var(x, axis=axis, keepdims=keepdims))
def mpi_mean(value):
if value == []:
value = [0.]
if not isinstance(value, list):
value = [value]
return mpi_moments(np.array(value))[0][0]
def mpi_std(value):
if value == []:
value = [0.]
if not isinstance(value, list):
value = [value]
return mpi_moments(np.array(value))[1][0]
def mpi_max(value):
global_max = np.zeros(1, dtype='float64')
local_max = np.max(value).astype('float64')
MPI.COMM_WORLD.Reduce(local_max, global_max, op=MPI.MAX)
return global_max[0]
def mpi_sum(value):
global_sum = np.zeros(1, dtype='float64')
local_sum = np.sum(np.array(value)).astype('float64')
MPI.COMM_WORLD.Reduce(local_sum, global_sum, op=MPI.SUM)
return global_sum[0]

View File

@@ -143,7 +143,7 @@ def build_act(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None):
    ` See the top of the file for details.
    """
    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = make_obs_ph("observation")
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")
@@ -159,10 +159,12 @@ def build_act(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None):
        output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions)
        update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))
        _act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph],
                          outputs=output_actions,
                          givens={update_eps_ph: -1.0, stochastic_ph: True},
                          updates=[update_eps_expr])
        def act(ob, stochastic=True, update_eps=-1):
            return _act(ob, stochastic, update_eps)
        return act
@@ -203,7 +205,7 @@ def build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope="deepq",
        param_noise_filter_func = default_param_noise_filter

    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = make_obs_ph("observation")
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")
        update_param_noise_threshold_ph = tf.placeholder(tf.float32, (), name="update_param_noise_threshold")
@@ -342,20 +344,20 @@ def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=
    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = make_obs_ph("obs_tp1")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")

        # q network evaluation
        q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)  # reuse parameters from act
        q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func")

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func")

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)
@@ -363,7 +365,7 @@ def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=
        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
@@ -379,10 +381,11 @@ def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=
        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)
            optimize_expr = optimizer.apply_gradients(gradients)
        else:
            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)
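The clip-per-gradient pattern used above, isolated into a self-contained sketch (the variable and loss are hypothetical; grad_norm_clipping plays the same role as the build_train argument):

import tensorflow as tf

grad_norm_clipping = 10.0
w = tf.Variable([1.0, 2.0])
loss = tf.reduce_sum(tf.square(w))
optimizer = tf.train.AdamOptimizer(1e-3)
gradients = optimizer.compute_gradients(loss, var_list=[w])
for i, (grad, var) in enumerate(gradients):
    if grad is not None:
        gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)
optimize_expr = optimizer.apply_gradients(gradients)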

View File

@@ -14,6 +14,7 @@ from baselines.common.misc_util import (
from baselines import bench from baselines import bench
from baselines.common.atari_wrappers_deprecated import wrap_dqn from baselines.common.atari_wrappers_deprecated import wrap_dqn
from baselines.deepq.experiments.atari.model import model, dueling_model from baselines.deepq.experiments.atari.model import model, dueling_model
from baselines.deepq.utils import Uint8Input, load_state
def parse_args(): def parse_args():
@@ -63,8 +64,8 @@ if __name__ == '__main__':
    args = parse_args()
    env = make_env(args.env)
    act = deepq.build_act(
        make_obs_ph=lambda name: Uint8Input(env.observation_space.shape, name=name),
        q_func=dueling_model if args.dueling else model,
        num_actions=env.action_space.n)
    load_state(os.path.join(args.model_dir, "saved"))
    play(env, act, args.stochastic, args.video)

View File

@@ -2,14 +2,7 @@ import tensorflow as tf
import tensorflow.contrib.layers as layers


def model(img_in, num_actions, scope, reuse=False):
    """As described in https://storage.googleapis.com/deepmind-data/assets/papers/DeepMindNature14236Paper.pdf"""
    with tf.variable_scope(scope, reuse=reuse):
        out = img_in
@@ -22,15 +15,12 @@ def model(img_in, num_actions, scope, reuse=False, layer_norm=False):
with tf.variable_scope("action_value"): with tf.variable_scope("action_value"):
value_out = layers.fully_connected(conv_out, num_outputs=512, activation_fn=None) value_out = layers.fully_connected(conv_out, num_outputs=512, activation_fn=None)
if layer_norm: value_out = tf.nn.relu(value_out)
value_out = layer_norm_fn(value_out, relu=True)
else:
value_out = tf.nn.relu(value_out)
value_out = layers.fully_connected(value_out, num_outputs=num_actions, activation_fn=None) value_out = layers.fully_connected(value_out, num_outputs=num_actions, activation_fn=None)
return value_out return value_out
def dueling_model(img_in, num_actions, scope, reuse=False, layer_norm=False): def dueling_model(img_in, num_actions, scope, reuse=False):
"""As described in https://arxiv.org/abs/1511.06581""" """As described in https://arxiv.org/abs/1511.06581"""
with tf.variable_scope(scope, reuse=reuse): with tf.variable_scope(scope, reuse=reuse):
out = img_in out = img_in
@@ -43,17 +33,11 @@ def dueling_model(img_in, num_actions, scope, reuse=False, layer_norm=False):
with tf.variable_scope("state_value"): with tf.variable_scope("state_value"):
state_hidden = layers.fully_connected(conv_out, num_outputs=512, activation_fn=None) state_hidden = layers.fully_connected(conv_out, num_outputs=512, activation_fn=None)
if layer_norm: state_hidden = tf.nn.relu(state_hidden)
state_hidden = layer_norm_fn(state_hidden, relu=True)
else:
state_hidden = tf.nn.relu(state_hidden)
state_score = layers.fully_connected(state_hidden, num_outputs=1, activation_fn=None) state_score = layers.fully_connected(state_hidden, num_outputs=1, activation_fn=None)
with tf.variable_scope("action_value"): with tf.variable_scope("action_value"):
actions_hidden = layers.fully_connected(conv_out, num_outputs=512, activation_fn=None) actions_hidden = layers.fully_connected(conv_out, num_outputs=512, activation_fn=None)
if layer_norm: actions_hidden = tf.nn.relu(actions_hidden)
actions_hidden = layer_norm_fn(actions_hidden, relu=True)
else:
actions_hidden = tf.nn.relu(actions_hidden)
action_scores = layers.fully_connected(actions_hidden, num_outputs=num_actions, activation_fn=None) action_scores = layers.fully_connected(actions_hidden, num_outputs=num_actions, activation_fn=None)
action_scores_mean = tf.reduce_mean(action_scores, 1) action_scores_mean = tf.reduce_mean(action_scores, 1)
action_scores = action_scores - tf.expand_dims(action_scores_mean, 1) action_scores = action_scores - tf.expand_dims(action_scores_mean, 1)

View File

@@ -25,6 +25,7 @@ from baselines import bench
from baselines.common.atari_wrappers_deprecated import wrap_dqn from baselines.common.atari_wrappers_deprecated import wrap_dqn
from baselines.common.azure_utils import Container from baselines.common.azure_utils import Container
from .model import model, dueling_model from .model import model, dueling_model
from baselines.deepq.utils import Uint8Input, load_state, save_state
def parse_args(): def parse_args():
@@ -73,7 +74,7 @@ def maybe_save_model(savedir, container, state):
        return
    start_time = time.time()
    model_dir = "model-{}".format(state["num_iters"])
    save_state(os.path.join(savedir, model_dir, "saved"))
    if container is not None:
        container.put(os.path.join(savedir, model_dir), model_dir)
    relatively_safe_pickle_dump(state, os.path.join(savedir, 'training_state.pkl.zip'), compression=True)
@@ -101,14 +102,14 @@ def maybe_load_model(savedir, container):
model_dir = "model-{}".format(state["num_iters"]) model_dir = "model-{}".format(state["num_iters"])
if container is not None: if container is not None:
container.get(savedir, model_dir) container.get(savedir, model_dir)
U.load_state(os.path.join(savedir, model_dir, "saved")) load_state(os.path.join(savedir, model_dir, "saved"))
logger.log("Loaded models checkpoint at {} iterations".format(state["num_iters"])) logger.log("Loaded models checkpoint at {} iterations".format(state["num_iters"]))
return state return state
if __name__ == '__main__': if __name__ == '__main__':
args = parse_args() args = parse_args()
# Parse savedir and azure container. # Parse savedir and azure container.
savedir = args.save_dir savedir = args.save_dir
if savedir is None: if savedir is None:
@@ -143,7 +144,7 @@ if __name__ == '__main__':
            actual_model = dueling_model if args.dueling else model
            return actual_model(img_in, num_actions, scope, layer_norm=args.layer_norm, **kwargs)

        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=lambda name: Uint8Input(env.observation_space.shape, name=name),
            q_func=model_wrapper,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr, epsilon=1e-4),

View File

@@ -9,6 +9,7 @@ from baselines import deepq, bench
from baselines.common.misc_util import get_wrapper_by_name, boolean_flag, set_global_seeds from baselines.common.misc_util import get_wrapper_by_name, boolean_flag, set_global_seeds
from baselines.common.atari_wrappers_deprecated import wrap_dqn from baselines.common.atari_wrappers_deprecated import wrap_dqn
from baselines.deepq.experiments.atari.model import model, dueling_model from baselines.deepq.experiments.atari.model import model, dueling_model
from baselines.deepq.utils import Uint8Input, load_state
def make_env(game_name): def make_env(game_name):
@@ -69,11 +70,11 @@ def main():
    with U.make_session(4):  # noqa
        _, env = make_env(args.env)
        act = deepq.build_act(
            make_obs_ph=lambda name: Uint8Input(env.observation_space.shape, name=name),
            q_func=dueling_model if args.dueling else model,
            num_actions=env.action_space.n)
        load_state(os.path.join(args.model_dir, "saved"))
        wang2015_eval(args.env, act, stochastic=args.stochastic)

View File

@@ -9,6 +9,7 @@ import baselines.common.tf_util as U
from baselines import logger from baselines import logger
from baselines import deepq from baselines import deepq
from baselines.deepq.replay_buffer import ReplayBuffer from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.deepq.utils import BatchInput
from baselines.common.schedules import LinearSchedule from baselines.common.schedules import LinearSchedule
@@ -27,7 +28,7 @@ if __name__ == '__main__':
env = gym.make("CartPole-v0") env = gym.make("CartPole-v0")
# Create all the functions necessary to train the model # Create all the functions necessary to train the model
act, train, update_target, debug = deepq.build_train( act, train, update_target, debug = deepq.build_train(
make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape, name=name), make_obs_ph=lambda name: BatchInput(env.observation_space.shape, name=name),
q_func=model, q_func=model,
num_actions=env.action_space.n, num_actions=env.action_space.n,
optimizer=tf.train.AdamOptimizer(learning_rate=5e-4), optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),

View File

@@ -1,5 +1,3 @@
import gym
from baselines import deepq from baselines import deepq
from baselines.common import set_global_seeds from baselines.common import set_global_seeds
from baselines import bench from baselines import bench

View File

@@ -3,7 +3,7 @@ import gym
from baselines import deepq


def callback(lcl, _glb):
    # stop training if reward exceeds 199
    is_solved = lcl['t'] > 100 and sum(lcl['episode_rewards'][-101:-1]) / 100 >= 199
    return is_solved

View File

@@ -12,6 +12,7 @@ from baselines import logger
from baselines.common.schedules import LinearSchedule from baselines.common.schedules import LinearSchedule
from baselines import deepq from baselines import deepq
from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer
from baselines.deepq.utils import BatchInput, load_state, save_state
class ActWrapper(object): class ActWrapper(object):
@@ -32,7 +33,7 @@ class ActWrapper(object):
                f.write(model_data)

            zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td)
            load_state(os.path.join(td, "model"))

        return ActWrapper(act, act_params)
@@ -45,7 +46,7 @@ class ActWrapper(object):
            path = os.path.join(logger.get_dir(), "model.pkl")

        with tempfile.TemporaryDirectory() as td:
            save_state(os.path.join(td, "model"))
            arc_name = os.path.join(td, "packed.zip")
            with zipfile.ZipFile(arc_name, 'w') as zipf:
                for root, dirs, files in os.walk(td):
@@ -171,7 +172,7 @@ def learn(env,
    # by cloudpickle when serializing make_obs_ph
    observation_space_shape = env.observation_space.shape
    def make_obs_ph(name):
        return BatchInput(observation_space_shape, name=name)

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
@@ -283,12 +284,12 @@ def learn(env,
                if print_freq is not None:
                    logger.log("Saving model due to mean reward increase: {} -> {}".format(
                               saved_mean_reward, mean_100ep_reward))
                save_state(model_file)
                model_saved = True
                saved_mean_reward = mean_100ep_reward
    if model_saved:
        if print_freq is not None:
            logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
        load_state(model_file)
    return act

88
baselines/deepq/utils.py Normal file
View File

@@ -0,0 +1,88 @@
import os
import tensorflow as tf
# ================================================================
# Saving variables
# ================================================================
def load_state(fname):
saver = tf.train.Saver()
saver.restore(tf.get_default_session(), fname)
def save_state(fname):
os.makedirs(os.path.dirname(fname), exist_ok=True)
saver = tf.train.Saver()
saver.save(tf.get_default_session(), fname)
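A hypothetical round trip with the two helpers (the checkpoint path is illustrative); both rely on the default TensorFlow session being set.

import tensorflow as tf

with tf.Session() as sess:
    v = tf.Variable(3.0, name='v')
    sess.run(tf.global_variables_initializer())
    save_state('/tmp/deepq_demo/model')    # creates the directory if needed
    load_state('/tmp/deepq_demo/model')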
# ================================================================
# Placeholders
# ================================================================
class TfInput(object):
def __init__(self, name="(unnamed)"):
"""Generalized Tensorflow placeholder. The main differences are:
- possibly uses multiple placeholders internally and returns multiple values
- can apply light postprocessing to the value feed to placeholder.
"""
self.name = name
def get(self):
"""Return the tf variable(s) representing the possibly postprocessed value
of placeholder(s).
"""
raise NotImplementedError()
def make_feed_dict(self, data):
"""Given data input it to the placeholder(s)."""
raise NotImplementedError()
class PlaceholderTfInput(TfInput):
def __init__(self, placeholder):
"""Wrapper for regular tensorflow placeholder."""
super().__init__(placeholder.name)
self._placeholder = placeholder
def get(self):
return self._placeholder
def make_feed_dict(self, data):
return {self._placeholder: data}
class BatchInput(PlaceholderTfInput):
def __init__(self, shape, dtype=tf.float32, name=None):
"""Creates a placeholder for a batch of tensors of a given shape and dtype
Parameters
----------
shape: [int]
shape of a single element of the batch
dtype: tf.dtype
number representation used for tensor contents
name: str
name of the underlying placeholder
"""
super().__init__(tf.placeholder(dtype, [None] + list(shape), name=name))
class Uint8Input(PlaceholderTfInput):
def __init__(self, shape, name=None):
"""Takes input in uint8 format which is cast to float32 and divided by 255
before passing it to the model.
On GPU this ensures lower data transfer times.
Parameters
----------
shape: [int]
shape of the tensor.
name: str
name of the underlying placeholder
"""
super().__init__(tf.placeholder(tf.uint8, [None] + list(shape), name=name))
self._shape = shape
self._output = tf.cast(super().get(), tf.float32) / 255.0
def get(self):
return self._output
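Sketch of how the uint8 input is meant to be used, assuming the classes above are in scope (shapes are illustrative): frames are fed as uint8 and read back as a float32 tensor scaled into [0, 1].

import numpy as np

obs_ph = Uint8Input([84, 84, 4], name='obs')
frames = np.zeros((2, 84, 84, 4), dtype=np.uint8)
feed = obs_ph.make_feed_dict(frames)   # {placeholder: frames}
scaled = obs_ph.get()                  # float32 tensor, already divided by 255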

View File

@@ -8,6 +8,14 @@ import numpy as np
from baselines.common.mpi_running_mean_std import RunningMeanStd from baselines.common.mpi_running_mean_std import RunningMeanStd
from baselines.common import tf_util as U from baselines.common import tf_util as U
def logsigmoid(a):
'''Equivalent to tf.log(tf.sigmoid(a))'''
return -tf.nn.softplus(-a)
""" Reference: https://github.com/openai/imitation/blob/99fbccf3e060b6e6c739bdf209758620fcdefd3c/policyopt/thutil.py#L48-L51"""
def logit_bernoulli_entropy(logits):
ent = (1.-tf.nn.sigmoid(logits))*logits - logsigmoid(logits)
return ent
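A quick numerical check of the identity these two helpers rely on, in plain numpy (not part of the module): with p = sigmoid(l), the Bernoulli entropy -p*log(p) - (1-p)*log(1-p) equals (1-p)*l - log(sigmoid(l)).

import numpy as np

l = np.linspace(-5.0, 5.0, 11)
p = 1.0 / (1.0 + np.exp(-l))
lhs = -p * np.log(p) - (1.0 - p) * np.log(1.0 - p)
rhs = (1.0 - p) * l - np.log(p)
assert np.allclose(lhs, rhs)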
class TransitionClassifier(object): class TransitionClassifier(object):
def __init__(self, env, hidden_size, entcoeff=0.001, lr_rate=1e-3, scope="adversary"): def __init__(self, env, hidden_size, entcoeff=0.001, lr_rate=1e-3, scope="adversary"):

View File

@@ -130,14 +130,14 @@ def learn(env, policy_func, reward_giver, expert_dataset, rank,
    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf_util.reduce_mean(kloldnew)
    meanent = tf_util.reduce_mean(ent)
    entbonus = entcoeff * meanent

    vferr = tf_util.reduce_mean(tf.square(pi.vpred - ret))

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf_util.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]

View File

@@ -6,7 +6,6 @@ import json
import time import time
import datetime import datetime
import tempfile import tempfile
from mpi4py import MPI
LOG_OUTPUT_FORMATS = ['stdout', 'log', 'csv'] LOG_OUTPUT_FORMATS = ['stdout', 'log', 'csv']
# Also valid: json, tensorboard # Also valid: json, tensorboard
@@ -170,6 +169,7 @@ class TensorBoardOutputFormat(KVWriter):
self.writer = None self.writer = None
def make_output_format(format, ev_dir): def make_output_format(format, ev_dir):
from mpi4py import MPI
os.makedirs(ev_dir, exist_ok=True) os.makedirs(ev_dir, exist_ok=True)
rank = MPI.COMM_WORLD.Get_rank() rank = MPI.COMM_WORLD.Get_rank()
if format == 'stdout': if format == 'stdout':

View File

@@ -17,25 +17,25 @@ class CnnPolicy(object):
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        x = ob / 255.0
        if kind == 'small':  # from A3C paper
            x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0)))
        elif kind == 'large':  # Nature DQN
            x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(tf.layers.dense(x, 512, name='lin', kernel_initializer=U.normc_initializer(1.0)))
        else:
            raise NotImplementedError

        logits = tf.layers.dense(x, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)
        self.vpred = tf.layers.dense(x, 1, name='value', kernel_initializer=U.normc_initializer(1.0))[:,0]

        self.state_in = []
        self.state_out = []

View File

@@ -18,25 +18,25 @@ class MlpPolicy(object):
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="vffc%i"%(i+1), kernel_initializer=U.normc_initializer(1.0)))
        self.vpred = tf.layers.dense(last_out, 1, name='vffinal', kernel_initializer=U.normc_initializer(1.0))[:,0]

        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='polfc%i'%(i+1), kernel_initializer=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = tf.layers.dense(last_out, pdtype.param_shape()[0]//2, name='polfinal', kernel_initializer=U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='polfinal', kernel_initializer=U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

View File

@@ -77,7 +77,7 @@ def add_vtarg_and_adv(seg, gamma, lam):
gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
seg["tdlamret"] = seg["adv"] + seg["vpred"] seg["tdlamret"] = seg["adv"] + seg["vpred"]
def learn(env, policy_fn, *,
        timesteps_per_actorbatch,  # timesteps per actor per update
        clip_param, entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs, optim_stepsize, optim_batchsize,  # optimization hypers
@@ -91,8 +91,8 @@ def learn(env, policy_func, *,
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
@@ -104,15 +104,15 @@ def learn(env, policy_func, *,
    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]
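The pessimistic clipped surrogate in isolation, as a numpy sketch with made-up ratios and advantages:

import numpy as np

clip_param = 0.2
ratio = np.array([0.8, 1.0, 1.3])          # pi_new / pi_old per sample
atarg = np.array([1.0, -0.5, 2.0])         # advantage estimates
surr1 = ratio * atarg
surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
pol_surr = -np.mean(np.minimum(surr1, surr2))   # loss to minimize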
@@ -181,7 +181,7 @@ def learn(env, policy_func, *,
losses = [] # list of tuples, each of which gives the loss for a minibatch losses = [] # list of tuples, each of which gives the loss for a minibatch
for batch in d.iterate_once(optim_batchsize): for batch in d.iterate_once(optim_batchsize):
*newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult)
adam.update(g, optim_stepsize * cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult)
losses.append(newlosses) losses.append(newlosses)
logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log(fmt_row(13, np.mean(losses, axis=0)))
@@ -189,7 +189,7 @@ def learn(env, policy_func, *,
losses = [] losses = []
for batch in d.iterate_once(optim_batchsize): for batch in d.iterate_once(optim_batchsize):
newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult)
losses.append(newlosses) losses.append(newlosses)
meanlosses,_,_ = mpi_moments(losses, axis=0) meanlosses,_,_ = mpi_moments(losses, axis=0)
logger.log(fmt_row(13, meanlosses)) logger.log(fmt_row(13, meanlosses))
for (lossval, name) in zipsame(meanlosses, loss_names): for (lossval, name) in zipsame(meanlosses, loss_names):

View File

@@ -4,9 +4,9 @@ from mpi4py import MPI
from baselines.common import set_global_seeds from baselines.common import set_global_seeds
from baselines import bench from baselines import bench
import os.path as osp import os.path as osp
import gym, logging
from baselines import logger from baselines import logger
from baselines.common.atari_wrappers import make_atari, wrap_deepmind from baselines.common.atari_wrappers import make_atari, wrap_deepmind
from baselines.common.cmd_util import atari_arg_parser
def train(env_id, num_timesteps, seed): def train(env_id, num_timesteps, seed):
from baselines.ppo1 import pposgd_simple, cnn_policy from baselines.ppo1 import pposgd_simple, cnn_policy
@@ -26,7 +26,6 @@ def train(env_id, num_timesteps, seed):
env = bench.Monitor(env, logger.get_dir() and env = bench.Monitor(env, logger.get_dir() and
osp.join(logger.get_dir(), str(rank))) osp.join(logger.get_dir(), str(rank)))
env.seed(workerseed) env.seed(workerseed)
gym.logger.setLevel(logging.WARN)
env = wrap_deepmind(env) env = wrap_deepmind(env)
env.seed(workerseed) env.seed(workerseed)
@@ -42,12 +41,7 @@ def train(env_id, num_timesteps, seed):
env.close() env.close()
def main(): def main():
import argparse args = atari_arg_parser().parse_args()
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--env', help='environment ID', default='PongNoFrameskip-v4')
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
parser.add_argument('--num-timesteps', type=int, default=int(10e6))
args = parser.parse_args()
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
if __name__ == '__main__': if __name__ == '__main__':
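The shared atari_arg_parser replaces the argparse boilerplate deleted above. A hedged reconstruction from those deleted defaults follows; the actual baselines.common.cmd_util implementation may differ, for example in the default environment used by other scripts.

    # Hedged reconstruction of the shared Atari argument parser, based on the
    # argparse code deleted above; the real cmd_util version may differ.
    import argparse

    def atari_arg_parser():
        parser = argparse.ArgumentParser(
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        parser.add_argument('--env', help='environment ID',
                            default='PongNoFrameskip-v4')
        parser.add_argument('--seed', help='RNG seed', type=int, default=0)
        parser.add_argument('--num-timesteps', type=int, default=int(10e6))
        return parser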

View File

@@ -1,20 +1,16 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from baselines.common import set_global_seeds, tf_util as U
from baselines import bench from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser
import gym, logging from baselines.common import tf_util as U
from baselines import logger from baselines import logger
def train(env_id, num_timesteps, seed): def train(env_id, num_timesteps, seed):
from baselines.ppo1 import mlp_policy, pposgd_simple from baselines.ppo1 import mlp_policy, pposgd_simple
U.make_session(num_cpu=1).__enter__() U.make_session(num_cpu=1).__enter__()
set_global_seeds(seed)
env = gym.make(env_id)
def policy_fn(name, ob_space, ac_space): def policy_fn(name, ob_space, ac_space):
return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
hid_size=64, num_hid_layers=2) hid_size=64, num_hid_layers=2)
env = bench.Monitor(env, logger.get_dir()) env = make_mujoco_env(env_id, seed)
env.seed(seed)
gym.logger.setLevel(logging.WARN)
pposgd_simple.learn(env, policy_fn, pposgd_simple.learn(env, policy_fn,
max_timesteps=num_timesteps, max_timesteps=num_timesteps,
timesteps_per_actorbatch=2048, timesteps_per_actorbatch=2048,
@@ -25,15 +21,9 @@ def train(env_id, num_timesteps, seed):
env.close() env.close()
def main(): def main():
import argparse args = mujoco_arg_parser().parse_args()
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--env', help='environment ID', default='Hopper-v1')
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
parser.add_argument('--num-timesteps', type=int, default=int(1e6))
args = parser.parse_args()
logger.configure() logger.configure()
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
if __name__ == '__main__': if __name__ == '__main__':
main() main()
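make_mujoco_env folds the seeding, gym.make, and bench.Monitor lines deleted above into one helper. The sketch below is assembled from exactly those deleted pieces; the real cmd_util version may differ in details such as the monitor path.

    # Hedged sketch of make_mujoco_env, reconstructed from the setup code
    # removed above (set_global_seeds, gym.make, bench.Monitor, env.seed).
    import gym
    from baselines import bench, logger
    from baselines.common import set_global_seeds

    def make_mujoco_env(env_id, seed):
        set_global_seeds(seed)
        env = gym.make(env_id)
        env = bench.Monitor(env, logger.get_dir())
        env.seed(seed)
        return env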

View File

@@ -3,6 +3,18 @@ import tensorflow as tf
from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm
from baselines.common.distributions import make_pdtype from baselines.common.distributions import make_pdtype
def nature_cnn(unscaled_images):
"""
CNN from Nature paper.
"""
scaled_images = tf.cast(unscaled_images, tf.float32) / 255.
activ = tf.nn.relu
h = activ(conv(scaled_images, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2)))
h2 = activ(conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2)))
h3 = activ(conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2)))
h3 = conv_to_fc(h3)
return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2)))
class LnLstmPolicy(object): class LnLstmPolicy(object):
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False): def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
nenv = nbatch // nsteps nenv = nbatch // nsteps
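The new nature_cnn trunk is shared by every convolutional policy below. As a usage sketch, a feed-forward head can be attached to it like this, mirroring the CnnPolicy hunk further down; the import path assumes this file is baselines/ppo2/policies.py, and the observation shape and action count are illustrative.

    # Usage sketch: build the Nature-CNN features once, then attach linear
    # policy and value heads, as the CnnPolicy hunk below does.
    import tensorflow as tf
    from baselines.a2c.utils import fc
    from baselines.ppo2.policies import nature_cnn  # assumed module path

    X = tf.placeholder(tf.uint8, [None, 84, 84, 4])  # stacked Atari frames
    nact = 6                                         # e.g. Pong's action count
    with tf.variable_scope("model"):
        h = nature_cnn(X)                            # 512-d shared features
        pi = fc(h, 'pi', nact, init_scale=0.01)      # policy logits
        vf = fc(h, 'v', 1)[:, 0]                     # state-value estimate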
@@ -13,17 +25,13 @@ class LnLstmPolicy(object):
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
with tf.variable_scope("model", reuse=reuse): with tf.variable_scope("model", reuse=reuse):
h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2)) h = nature_cnn(X)
h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2)) xs = batch_to_seq(h, nenv, nsteps)
h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
h3 = conv_to_fc(h3)
h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
xs = batch_to_seq(h4, nenv, nsteps)
ms = batch_to_seq(M, nenv, nsteps) ms = batch_to_seq(M, nenv, nsteps)
h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm) h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
h5 = seq_to_batch(h5) h5 = seq_to_batch(h5)
pi = fc(h5, 'pi', nact, act=lambda x:x) pi = fc(h5, 'pi', nact)
vf = fc(h5, 'v', 1, act=lambda x:x) vf = fc(h5, 'v', 1)
self.pdtype = make_pdtype(ac_space) self.pdtype = make_pdtype(ac_space)
self.pd = self.pdtype.pdfromflat(pi) self.pd = self.pdtype.pdfromflat(pi)
@@ -59,17 +67,13 @@ class LstmPolicy(object):
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
with tf.variable_scope("model", reuse=reuse): with tf.variable_scope("model", reuse=reuse):
h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2)) h = nature_cnn(X)
h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2)) xs = batch_to_seq(h, nenv, nsteps)
h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
h3 = conv_to_fc(h3)
h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
xs = batch_to_seq(h4, nenv, nsteps)
ms = batch_to_seq(M, nenv, nsteps) ms = batch_to_seq(M, nenv, nsteps)
h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm) h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
h5 = seq_to_batch(h5) h5 = seq_to_batch(h5)
pi = fc(h5, 'pi', nact, act=lambda x:x) pi = fc(h5, 'pi', nact)
vf = fc(h5, 'v', 1, act=lambda x:x) vf = fc(h5, 'v', 1)
self.pdtype = make_pdtype(ac_space) self.pdtype = make_pdtype(ac_space)
self.pd = self.pdtype.pdfromflat(pi) self.pd = self.pdtype.pdfromflat(pi)
@@ -101,13 +105,9 @@ class CnnPolicy(object):
nact = ac_space.n nact = ac_space.n
X = tf.placeholder(tf.uint8, ob_shape) #obs X = tf.placeholder(tf.uint8, ob_shape) #obs
with tf.variable_scope("model", reuse=reuse): with tf.variable_scope("model", reuse=reuse):
h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2)) h = nature_cnn(X)
h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2)) pi = fc(h, 'pi', nact, init_scale=0.01)
h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2)) vf = fc(h, 'v', 1)[:,0]
h3 = conv_to_fc(h3)
h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
pi = fc(h4, 'pi', nact, act=lambda x:x, init_scale=0.01)
vf = fc(h4, 'v', 1, act=lambda x:x)[:,0]
self.pdtype = make_pdtype(ac_space) self.pdtype = make_pdtype(ac_space)
self.pd = self.pdtype.pdfromflat(pi) self.pd = self.pdtype.pdfromflat(pi)
@@ -135,13 +135,14 @@ class MlpPolicy(object):
actdim = ac_space.shape[0] actdim = ac_space.shape[0]
X = tf.placeholder(tf.float32, ob_shape, name='Ob') #obs X = tf.placeholder(tf.float32, ob_shape, name='Ob') #obs
with tf.variable_scope("model", reuse=reuse): with tf.variable_scope("model", reuse=reuse):
h1 = fc(X, 'pi_fc1', nh=64, init_scale=np.sqrt(2), act=tf.tanh) activ = tf.tanh
h2 = fc(h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2), act=tf.tanh) h1 = activ(fc(X, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
pi = fc(h2, 'pi', actdim, act=lambda x:x, init_scale=0.01) h2 = activ(fc(h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
h1 = fc(X, 'vf_fc1', nh=64, init_scale=np.sqrt(2), act=tf.tanh) pi = fc(h2, 'pi', actdim, init_scale=0.01)
h2 = fc(h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2), act=tf.tanh) h1 = activ(fc(X, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
vf = fc(h2, 'vf', 1, act=lambda x:x)[:,0] h2 = activ(fc(h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
logstd = tf.get_variable(name="logstd", shape=[1, actdim], vf = fc(h2, 'vf', 1)[:,0]
logstd = tf.get_variable(name="logstd", shape=[1, actdim],
initializer=tf.zeros_initializer()) initializer=tf.zeros_initializer())
pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1) pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)
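The pdparam built above concatenates the state-dependent mean with a state-independent logstd. A small numpy sketch of how a diagonal-Gaussian action is drawn from that flat parameter vector follows; it is illustrative, not the distributions module's own code.

    # Sketch of the diagonal-Gaussian parameterisation: pdparam = [mean, logstd],
    # and an action is sampled as mean + exp(logstd) * standard normal noise.
    import numpy as np

    def sample_diag_gaussian(pdparam, rng=np.random):
        mean, logstd = np.split(np.asarray(pdparam, dtype=np.float64), 2, axis=-1)
        return mean + np.exp(logstd) * rng.standard_normal(mean.shape)

    # e.g. a 3-dimensional action space:
    pdparam = np.array([0.1, -0.2, 0.0,     # mean
                        -0.5, -0.5, -0.5])  # logstd
    action = sample_diag_gaussian(pdparam)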
@@ -164,4 +165,4 @@ class MlpPolicy(object):
self.pi = pi self.pi = pi
self.vf = vf self.vf = vf
self.step = step self.step = step
self.value = value self.value = value

View File

@@ -51,7 +51,7 @@ class Model(object):
def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None): def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
advs = returns - values advs = returns - values
advs = (advs - advs.mean()) / (advs.std() + 1e-8) advs = (advs - advs.mean()) / (advs.std() + 1e-8)
td_map = {train_model.X:obs, A:actions, ADV:advs, R:returns, LR:lr, td_map = {train_model.X:obs, A:actions, ADV:advs, R:returns, LR:lr,
CLIPRANGE:cliprange, OLDNEGLOGPAC:neglogpacs, OLDVPRED:values} CLIPRANGE:cliprange, OLDNEGLOGPAC:neglogpacs, OLDVPRED:values}
if states is not None: if states is not None:
td_map[train_model.S] = states td_map[train_model.S] = states
@@ -107,7 +107,7 @@ class Runner(object):
mb_actions.append(actions) mb_actions.append(actions)
mb_values.append(values) mb_values.append(values)
mb_neglogpacs.append(neglogpacs) mb_neglogpacs.append(neglogpacs)
mb_dones.append(self.dones) mb_dones.append(self.dones)
self.obs[:], rewards, self.dones, infos = self.env.step(actions) self.obs[:], rewards, self.dones, infos = self.env.step(actions)
for info in infos: for info in infos:
maybeepinfo = info.get('episode') maybeepinfo = info.get('episode')
@@ -124,7 +124,7 @@ class Runner(object):
#discount/bootstrap off value fn #discount/bootstrap off value fn
mb_returns = np.zeros_like(mb_rewards) mb_returns = np.zeros_like(mb_rewards)
mb_advs = np.zeros_like(mb_rewards) mb_advs = np.zeros_like(mb_rewards)
lastgaelam = 0 lastgaelam = 0
for t in reversed(range(self.nsteps)): for t in reversed(range(self.nsteps)):
if t == self.nsteps - 1: if t == self.nsteps - 1:
nextnonterminal = 1.0 - self.dones nextnonterminal = 1.0 - self.dones
@@ -135,7 +135,7 @@ class Runner(object):
delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t] delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t]
mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam
mb_returns = mb_advs + mb_values mb_returns = mb_advs + mb_values
return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs)), return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs)),
mb_states, epinfos) mb_states, epinfos)
# obs, returns, masks, actions, values, neglogpacs, states = runner.run() # obs, returns, masks, actions, values, neglogpacs, states = runner.run()
def sf01(arr): def sf01(arr):
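The Runner hunk above implements GAE(lambda). The same recursion as a standalone single-environment numpy function, using the identical update delta + gamma * lam * nextnonterminal * lastgaelam:

    # Generalized advantage estimation for one environment's rollout.
    import numpy as np

    def gae(rewards, values, dones, last_value, last_done, gamma=0.99, lam=0.95):
        nsteps = len(rewards)
        advs = np.zeros(nsteps, dtype=np.float64)
        lastgaelam = 0.0
        for t in reversed(range(nsteps)):
            if t == nsteps - 1:
                nextnonterminal = 1.0 - last_done
                nextvalue = last_value
            else:
                nextnonterminal = 1.0 - dones[t + 1]
                nextvalue = values[t + 1]
            delta = rewards[t] + gamma * nextvalue * nextnonterminal - values[t]
            advs[t] = lastgaelam = delta + gamma * lam * nextnonterminal * lastgaelam
        return advs, advs + values   # advantages and lambda-returns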
@@ -150,8 +150,8 @@ def constfn(val):
return val return val
return f return f
def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr, def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr,
vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95,
log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
save_interval=0): save_interval=0):
@@ -167,7 +167,7 @@ def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr,
nbatch = nenvs * nsteps nbatch = nenvs * nsteps
nbatch_train = nbatch // nminibatches nbatch_train = nbatch // nminibatches
make_model = lambda : Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, make_model = lambda : Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
max_grad_norm=max_grad_norm) max_grad_norm=max_grad_norm)
if save_interval and logger.get_dir(): if save_interval and logger.get_dir():
@@ -214,7 +214,7 @@ def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr,
mbflatinds = flatinds[mbenvinds].ravel() mbflatinds = flatinds[mbenvinds].ravel()
slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
mbstates = states[mbenvinds] mbstates = states[mbenvinds]
mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates)) mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))
lossvals = np.mean(mblossvals, axis=0) lossvals = np.mean(mblossvals, axis=0)
tnow = time.time() tnow = time.time()
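The recurrent branch above samples minibatches by environment rather than by timestep, so the sliced LSTM states stay aligned with their trajectories. A sketch of that index bookkeeping, with illustrative shapes:

    # Env-wise minibatching: keep each environment's nsteps-long trajectory
    # contiguous so states[mbenvinds] matches the sampled observations.
    import numpy as np

    nenvs, nsteps, nminibatches = 8, 128, 4
    envsperbatch = nenvs // nminibatches
    envinds = np.arange(nenvs)
    flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)

    np.random.shuffle(envinds)
    for start in range(0, nenvs, envsperbatch):
        mbenvinds = envinds[start:start + envsperbatch]
        mbflatinds = flatinds[mbenvinds].ravel()  # indices into the flat batch
        # obs[mbflatinds], returns[mbflatinds], ... feed one recurrent minibatch,
        # and states[mbenvinds] supplies the matching initial LSTM states.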

View File

@@ -1,40 +1,25 @@
#!/usr/bin/env python #!/usr/bin/env python3
import sys import sys
import argparse from baselines import logger
from baselines import bench, logger from baselines.common.cmd_util import make_atari_env, atari_arg_parser
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
from baselines.ppo2 import ppo2
from baselines.ppo2.policies import CnnPolicy, LstmPolicy, LnLstmPolicy
import multiprocessing
import tensorflow as tf
def train(env_id, num_timesteps, seed, policy): def train(env_id, num_timesteps, seed, policy):
from baselines.common import set_global_seeds
from baselines.common.atari_wrappers import make_atari, wrap_deepmind
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
from baselines.ppo2 import ppo2
from baselines.ppo2.policies import CnnPolicy, LstmPolicy, LnLstmPolicy
import gym
import logging
import multiprocessing
import os.path as osp
import tensorflow as tf
ncpu = multiprocessing.cpu_count() ncpu = multiprocessing.cpu_count()
if sys.platform == 'darwin': ncpu //= 2 if sys.platform == 'darwin': ncpu //= 2
config = tf.ConfigProto(allow_soft_placement=True, config = tf.ConfigProto(allow_soft_placement=True,
intra_op_parallelism_threads=ncpu, intra_op_parallelism_threads=ncpu,
inter_op_parallelism_threads=ncpu) inter_op_parallelism_threads=ncpu)
config.gpu_options.allow_growth = True #pylint: disable=E1101 config.gpu_options.allow_growth = True #pylint: disable=E1101
gym.logger.setLevel(logging.WARN)
tf.Session(config=config).__enter__() tf.Session(config=config).__enter__()
def make_env(rank): env = VecFrameStack(make_atari_env(env_id, 8, seed), 4)
def env_fn():
env = make_atari(env_id)
env.seed(seed + rank)
env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
return wrap_deepmind(env)
return env_fn
nenvs = 8
env = SubprocVecEnv([make_env(i) for i in range(nenvs)])
set_global_seeds(seed)
env = VecFrameStack(env, 4)
policy = {'cnn' : CnnPolicy, 'lstm' : LstmPolicy, 'lnlstm' : LnLstmPolicy}[policy] policy = {'cnn' : CnnPolicy, 'lstm' : LstmPolicy, 'lnlstm' : LnLstmPolicy}[policy]
ppo2.learn(policy=policy, env=env, nsteps=128, nminibatches=4, ppo2.learn(policy=policy, env=env, nsteps=128, nminibatches=4,
lam=0.95, gamma=0.99, noptepochs=4, log_interval=1, lam=0.95, gamma=0.99, noptepochs=4, log_interval=1,
@@ -44,11 +29,8 @@ def train(env_id, num_timesteps, seed, policy):
total_timesteps=int(num_timesteps * 1.1)) total_timesteps=int(num_timesteps * 1.1))
def main(): def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser = atari_arg_parser()
parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm'], default='cnn') parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm'], default='cnn')
parser.add_argument('--num-timesteps', type=int, default=int(10e6))
args = parser.parse_args() args = parser.parse_args()
logger.configure() logger.configure()
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, train(args.env, num_timesteps=args.num_timesteps, seed=args.seed,
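make_atari_env plus VecFrameStack replace the inline make_env/SubprocVecEnv code deleted above. The sketch below is a hedged reconstruction of the helper from exactly those deleted lines; the real cmd_util version may differ slightly in wrapper options.

    # Hedged reconstruction of make_atari_env from the per-rank setup removed
    # above: one monitored, DeepMind-wrapped Atari env per subprocess.
    import os.path as osp
    from baselines import bench, logger
    from baselines.common import set_global_seeds
    from baselines.common.atari_wrappers import make_atari, wrap_deepmind
    from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

    def make_atari_env(env_id, num_env, seed):
        def make_env(rank):
            def _thunk():
                env = make_atari(env_id)
                env.seed(seed + rank)
                env = bench.Monitor(env, logger.get_dir() and
                                    osp.join(logger.get_dir(), str(rank)))
                return wrap_deepmind(env)
            return _thunk
        set_global_seeds(seed)
        return SubprocVecEnv([make_env(i) for i in range(num_env)])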

View File

@@ -1,5 +1,6 @@
#!/usr/bin/env python #!/usr/bin/env python3
import argparse import argparse
from baselines.common.cmd_util import mujoco_arg_parser
from baselines import bench, logger from baselines import bench, logger
def train(env_id, num_timesteps, seed): def train(env_id, num_timesteps, seed):
@@ -33,15 +34,10 @@ def train(env_id, num_timesteps, seed):
def main(): def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) args = mujoco_arg_parser().parse_args()
parser.add_argument('--env', help='environment ID', default='Hopper-v1')
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
parser.add_argument('--num-timesteps', type=int, default=int(1e6))
args = parser.parse_args()
logger.configure() logger.configure()
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
if __name__ == '__main__': if __name__ == '__main__':
main() main()

View File

@@ -17,7 +17,7 @@ class CnnPolicy(object):
sequence_length = None sequence_length = None
ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
obscaled = ob / 255.0 obscaled = ob / 255.0
with tf.variable_scope("pol"): with tf.variable_scope("pol"):
@@ -25,16 +25,16 @@ class CnnPolicy(object):
x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID")) x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID")) x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
x = U.flattenallbut0(x) x = U.flattenallbut0(x)
x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0))) x = tf.nn.relu(tf.layers.dense(x, 128, name='lin', kernel_initializer=U.normc_initializer(1.0)))
logits = U.dense(x, pdtype.param_shape()[0], "logits", U.normc_initializer(0.01)) logits = tf.layers.dense(x, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01))
self.pd = pdtype.pdfromflat(logits) self.pd = pdtype.pdfromflat(logits)
with tf.variable_scope("vf"): with tf.variable_scope("vf"):
x = obscaled x = obscaled
x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID")) x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID")) x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
x = U.flattenallbut0(x) x = U.flattenallbut0(x)
x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0))) x = tf.nn.relu(tf.layers.dense(x, 128, name='lin', kernel_initializer=U.normc_initializer(1.0)))
self.vpred = U.dense(x, 1, "value", U.normc_initializer(1.0)) self.vpred = tf.layers.dense(x, 1, name='value', kernel_initializer=U.normc_initializer(1.0))
self.vpredz = self.vpred self.vpredz = self.vpred
self.state_in = [] self.state_in = []
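The swap above retires the repository's U.dense helper in favour of the standard tf.layers.dense. A minimal sketch of the equivalent call, assuming the TF1 API; the old call is kept only as a comment since the helper is going away.

    # The same affine layer, old helper vs. standard TF1 layer.
    import tensorflow as tf
    import baselines.common.tf_util as U

    x = tf.placeholder(tf.float32, [None, 128])

    # old helper:  U.dense(x, 64, 'lin', U.normc_initializer(1.0))
    y = tf.layers.dense(x, 64, name='lin',
                        kernel_initializer=U.normc_initializer(1.0))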

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from mpi4py import MPI from mpi4py import MPI
from baselines.common import set_global_seeds from baselines.common import set_global_seeds
import os.path as osp import os.path as osp
@@ -6,6 +6,7 @@ import gym, logging
from baselines import logger from baselines import logger
from baselines import bench from baselines import bench
from baselines.common.atari_wrappers import make_atari, wrap_deepmind from baselines.common.atari_wrappers import make_atari, wrap_deepmind
from baselines.common.cmd_util import atari_arg_parser
def train(env_id, num_timesteps, seed): def train(env_id, num_timesteps, seed):
from baselines.trpo_mpi.nosharing_cnn_policy import CnnPolicy from baselines.trpo_mpi.nosharing_cnn_policy import CnnPolicy
@@ -26,7 +27,6 @@ def train(env_id, num_timesteps, seed):
return CnnPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space) return CnnPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space)
env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank))) env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
env.seed(workerseed) env.seed(workerseed)
gym.logger.setLevel(logging.WARN)
env = wrap_deepmind(env) env = wrap_deepmind(env)
env.seed(workerseed) env.seed(workerseed)
@@ -36,14 +36,8 @@ def train(env_id, num_timesteps, seed):
env.close() env.close()
def main(): def main():
import argparse args = atari_arg_parser().parse_args()
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--env', help='environment ID', default='PongNoFrameskip-v4')
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
parser.add_argument('--num-timesteps', type=int, default=int(10e6))
args = parser.parse_args()
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View File

@@ -1,17 +1,10 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# noinspection PyUnresolvedReferences # noinspection PyUnresolvedReferences
import mujoco_py # Mujoco must come before other imports. https://openai.slack.com/archives/C1H6P3R7B/p1492828680631850
from mpi4py import MPI from mpi4py import MPI
from baselines.common import set_global_seeds from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser
import os.path as osp
import gym
import logging
from baselines import logger from baselines import logger
from baselines.ppo1.mlp_policy import MlpPolicy from baselines.ppo1.mlp_policy import MlpPolicy
from baselines.common.mpi_fork import mpi_fork
from baselines import bench
from baselines.trpo_mpi import trpo_mpi from baselines.trpo_mpi import trpo_mpi
import sys
def train(env_id, num_timesteps, seed): def train(env_id, num_timesteps, seed):
import baselines.common.tf_util as U import baselines.common.tf_util as U
@@ -22,27 +15,16 @@ def train(env_id, num_timesteps, seed):
if rank != 0: if rank != 0:
logger.set_level(logger.DISABLED) logger.set_level(logger.DISABLED)
workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
set_global_seeds(workerseed)
env = gym.make(env_id)
def policy_fn(name, ob_space, ac_space): def policy_fn(name, ob_space, ac_space):
return MlpPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space, return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
hid_size=32, num_hid_layers=2) hid_size=32, num_hid_layers=2)
env = bench.Monitor(env, logger.get_dir() and env = make_mujoco_env(env_id, workerseed)
osp.join(logger.get_dir(), str(rank)))
env.seed(workerseed)
gym.logger.setLevel(logging.WARN)
trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1, trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1,
max_timesteps=num_timesteps, gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3) max_timesteps=num_timesteps, gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3)
env.close() env.close()
def main(): def main():
import argparse args = mujoco_arg_parser().parse_args()
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--env', help='environment ID', default='Hopper-v1')
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
parser.add_argument('--num-timesteps', type=int, default=int(1e6))
args = parser.parse_args()
logger.configure() logger.configure()
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
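Each MPI worker above seeds its environment with seed + 10000 * rank so that parallel rollouts decorrelate. The pattern in isolation, with an illustrative helper name:

    # Derive a distinct, reproducible seed for every MPI worker.
    from mpi4py import MPI

    def worker_seed(base_seed, rank=None):
        if rank is None:
            rank = MPI.COMM_WORLD.Get_rank()
        return base_seed + 10000 * rank

    # e.g.  env = make_mujoco_env(env_id, worker_seed(seed))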

View File

@@ -41,7 +41,7 @@ def traj_segment_generator(pi, env, horizon, stochastic):
yield {"ob" : obs, "rew" : rews, "vpred" : vpreds, "new" : news, yield {"ob" : obs, "rew" : rews, "vpred" : vpreds, "new" : news,
"ac" : acs, "prevac" : prevacs, "nextvpred": vpred * (1 - new), "ac" : acs, "prevac" : prevacs, "nextvpred": vpred * (1 - new),
"ep_rets" : ep_rets, "ep_lens" : ep_lens} "ep_rets" : ep_rets, "ep_lens" : ep_lens}
_, vpred = pi.act(stochastic, ob) _, vpred = pi.act(stochastic, ob)
# Be careful!!! if you change the downstream algorithm to aggregate # Be careful!!! if you change the downstream algorithm to aggregate
# several of these batches, then be sure to do a deepcopy # several of these batches, then be sure to do a deepcopy
ep_rets = [] ep_rets = []
@@ -79,7 +79,7 @@ def add_vtarg_and_adv(seg, gamma, lam):
gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
seg["tdlamret"] = seg["adv"] + seg["vpred"] seg["tdlamret"] = seg["adv"] + seg["vpred"]
def learn(env, policy_func, *, def learn(env, policy_fn, *,
timesteps_per_batch, # what to train on timesteps_per_batch, # what to train on
max_kl, cg_iters, max_kl, cg_iters,
gamma, lam, # advantage estimation gamma, lam, # advantage estimation
@@ -92,13 +92,13 @@ def learn(env, policy_func, *,
): ):
nworkers = MPI.COMM_WORLD.Get_size() nworkers = MPI.COMM_WORLD.Get_size()
rank = MPI.COMM_WORLD.Get_rank() rank = MPI.COMM_WORLD.Get_rank()
np.set_printoptions(precision=3) np.set_printoptions(precision=3)
# Setup losses and stuff # Setup losses and stuff
# ---------------------------------------- # ----------------------------------------
ob_space = env.observation_space ob_space = env.observation_space
ac_space = env.action_space ac_space = env.action_space
pi = policy_func("pi", ob_space, ac_space) pi = policy_fn("pi", ob_space, ac_space)
oldpi = policy_func("oldpi", ob_space, ac_space) oldpi = policy_fn("oldpi", ob_space, ac_space)
atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable)
ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return
@@ -107,14 +107,14 @@ def learn(env, policy_func, *,
kloldnew = oldpi.pd.kl(pi.pd) kloldnew = oldpi.pd.kl(pi.pd)
ent = pi.pd.entropy() ent = pi.pd.entropy()
meankl = U.mean(kloldnew) meankl = tf.reduce_mean(kloldnew)
meanent = U.mean(ent) meanent = tf.reduce_mean(ent)
entbonus = entcoeff * meanent entbonus = entcoeff * meanent
vferr = U.mean(tf.square(pi.vpred - ret)) vferr = tf.reduce_mean(tf.square(pi.vpred - ret))
ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold
surrgain = U.mean(ratio * atarg) surrgain = tf.reduce_mean(ratio * atarg)
optimgain = surrgain + entbonus optimgain = surrgain + entbonus
losses = [optimgain, meankl, entbonus, surrgain, meanent] losses = [optimgain, meankl, entbonus, surrgain, meanent]
@@ -138,7 +138,7 @@ def learn(env, policy_func, *,
sz = U.intprod(shape) sz = U.intprod(shape)
tangents.append(tf.reshape(flat_tangent[start:start+sz], shape)) tangents.append(tf.reshape(flat_tangent[start:start+sz], shape))
start += sz start += sz
gvp = tf.add_n([U.sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) #pylint: disable=E1111 gvp = tf.add_n([tf.reduce_sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) #pylint: disable=E1111
fvp = U.flatgrad(gvp, var_list) fvp = U.flatgrad(gvp, var_list)
assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv)
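The gvp and fvp lines above compute a Fisher-vector product by double backprop: differentiate the mean KL once, dot the gradient with a tangent vector, then differentiate that scalar again. A self-contained toy sketch of the trick, with a simple quadratic standing in for the KL:

    # Hessian-vector product by double backprop (TF1 API).
    import tensorflow as tf

    theta = tf.Variable([1.0, 2.0])
    f = tf.reduce_sum(theta * theta)           # stand-in for meankl(theta)
    tangent = tf.placeholder(tf.float32, [2])  # vector to multiply by the Hessian

    grads = tf.gradients(f, [theta])[0]
    gvp = tf.reduce_sum(grads * tangent)       # (grad f) . tangent
    hvp = tf.gradients(gvp, [theta])[0]        # H @ tangent, no explicit Hessian

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print(sess.run(hvp, {tangent: [1.0, 0.0]}))  # -> [2., 0.] since H = 2*I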
@@ -157,7 +157,7 @@ def learn(env, policy_func, *,
print(colorize("done in %.3f seconds"%(time.time() - tstart), color='magenta')) print(colorize("done in %.3f seconds"%(time.time() - tstart), color='magenta'))
else: else:
yield yield
def allmean(x): def allmean(x):
assert isinstance(x, np.ndarray) assert isinstance(x, np.ndarray)
out = np.empty_like(x) out = np.empty_like(x)
@@ -185,7 +185,7 @@ def learn(env, policy_func, *,
assert sum([max_iters>0, max_timesteps>0, max_episodes>0])==1 assert sum([max_iters>0, max_timesteps>0, max_episodes>0])==1
while True: while True:
if callback: callback(locals(), globals()) if callback: callback(locals(), globals())
if max_timesteps and timesteps_so_far >= max_timesteps: if max_timesteps and timesteps_so_far >= max_timesteps:
break break
@@ -260,7 +260,7 @@ def learn(env, policy_func, *,
with timed("vf"): with timed("vf"):
for _ in range(vf_iters): for _ in range(vf_iters):
for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]), for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]),
include_final_partial_batch=False, batch_size=64): include_final_partial_batch=False, batch_size=64):
g = allmean(compute_vflossandgrad(mbob, mbret)) g = allmean(compute_vflossandgrad(mbob, mbret))
vfadam.update(g, vf_stepsize) vfadam.update(g, vf_stepsize)
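The allmean applied to the gradient above keeps every MPI worker in lock-step before the Adam update. A hedged sketch consistent with the partial definition shown earlier in this file: Allreduce the flat array, then divide by the worker count.

    # Average a flat gradient (or any ndarray) across all MPI workers.
    import numpy as np
    from mpi4py import MPI

    def allmean(x, comm=MPI.COMM_WORLD):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        comm.Allreduce(x, out, op=MPI.SUM)
        out /= comm.Get_size()
        return out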