ACKTR + A2C

John Schulman
2017-08-18 09:25:39 -07:00
parent 882251878f
commit 3f676f7d1e
31 changed files with 2920 additions and 144 deletions

README.md

@@ -9,10 +9,14 @@ These algorithms will make it easier for the research community to replicate, re
You can install it by typing:
```bash
git clone https://github.com/openai/baselines.git
cd baselines
pip install -e .
```
- [A2C](baselines/a2c)
- [ACKTR](baselines/acktr)
- [DDPG](baselines/ddpg)
- [DQN](baselines/deepq)
- [PPO](baselines/pposgd)
- [TRPO](baselines/trpo_mpi)
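To try the new A2C implementation after installing (a hypothetical invocation; the A2C run script added by this commit is shown further down, and its exact module path is an assumption):

```bash
python -m baselines.a2c.run_atari
```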


baselines/a2c/a2c.py

@@ -0,0 +1,208 @@
import os.path as osp
import gym
import time
import joblib
import logging
import numpy as np
import tensorflow as tf
from baselines import logger
from baselines.common import set_global_seeds, explained_variance
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
from baselines.common.atari_wrappers import wrap_deepmind
from baselines.a2c.utils import discount_with_dones
from baselines.a2c.utils import Scheduler, make_path, find_trainable_variables
from baselines.a2c.policies import CnnPolicy
from baselines.a2c.utils import cat_entropy, mse
class Model(object):
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs,
ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'):
config = tf.ConfigProto(allow_soft_placement=True,
intra_op_parallelism_threads=num_procs,
inter_op_parallelism_threads=num_procs)
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
nact = ac_space.n
nbatch = nenvs*nsteps
A = tf.placeholder(tf.int32, [nbatch])
ADV = tf.placeholder(tf.float32, [nbatch])
R = tf.placeholder(tf.float32, [nbatch])
LR = tf.placeholder(tf.float32, [])
step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)
neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
pg_loss = tf.reduce_mean(ADV * neglogpac)
vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
entropy = tf.reduce_mean(cat_entropy(train_model.pi))
loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef
params = find_trainable_variables("model")
grads = tf.gradients(loss, params)
if max_grad_norm is not None:
grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
grads = list(zip(grads, params))
trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
_train = trainer.apply_gradients(grads)
lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)
        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            # the schedule counts environment steps, so advance it once per step in the batch
            for step in range(len(obs)):
                cur_lr = lr.value()
td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr}
if states != []:
td_map[train_model.S] = states
td_map[train_model.M] = masks
policy_loss, value_loss, policy_entropy, _ = sess.run(
[pg_loss, vf_loss, entropy, _train],
td_map
)
return policy_loss, value_loss, policy_entropy
        def save(save_path):
            ps = sess.run(params)
            make_path(osp.dirname(save_path))  # ensure the checkpoint's parent directory exists
            joblib.dump(ps, save_path)
def load(load_path):
loaded_params = joblib.load(load_path)
restores = []
for p, loaded_p in zip(params, loaded_params):
restores.append(p.assign(loaded_p))
ps = sess.run(restores)
self.train = train
self.train_model = train_model
self.step_model = step_model
self.step = step_model.step
self.value = step_model.value
self.initial_state = step_model.initial_state
self.save = save
self.load = load
tf.global_variables_initializer().run(session=sess)
class Runner(object):
def __init__(self, env, model, nsteps=5, nstack=4, gamma=0.99):
self.env = env
self.model = model
nh, nw, nc = env.observation_space.shape
nenv = env.num_envs
self.batch_ob_shape = (nenv*nsteps, nh, nw, nc*nstack)
self.obs = np.zeros((nenv, nh, nw, nc*nstack), dtype=np.uint8)
obs = env.reset()
self.update_obs(obs)
self.gamma = gamma
self.nsteps = nsteps
self.states = model.initial_state
self.dones = [False for _ in range(nenv)]
def update_obs(self, obs):
# Do frame-stacking here instead of the FrameStack wrapper to reduce
# IPC overhead
self.obs = np.roll(self.obs, shift=-1, axis=3)
self.obs[:, :, :, -1] = obs[:, :, :, 0]
def run(self):
mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
mb_states = self.states
for n in range(self.nsteps):
actions, values, states = self.model.step(self.obs, self.states, self.dones)
mb_obs.append(np.copy(self.obs))
mb_actions.append(actions)
mb_values.append(values)
mb_dones.append(self.dones)
obs, rewards, dones, _ = self.env.step(actions)
self.states = states
self.dones = dones
for n, done in enumerate(dones):
if done:
self.obs[n] = self.obs[n]*0
self.update_obs(obs)
mb_rewards.append(rewards)
mb_dones.append(self.dones)
#batch of steps to batch of rollouts
mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
mb_masks = mb_dones[:, :-1]
mb_dones = mb_dones[:, 1:]
last_values = self.model.value(self.obs, self.states, self.dones).tolist()
#discount/bootstrap off value fn
for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
rewards = rewards.tolist()
dones = dones.tolist()
if dones[-1] == 0:
rewards = discount_with_dones(rewards+[value], dones+[0], self.gamma)[:-1]
else:
rewards = discount_with_dones(rewards, dones, self.gamma)
mb_rewards[n] = rewards
mb_rewards = mb_rewards.flatten()
mb_actions = mb_actions.flatten()
mb_values = mb_values.flatten()
mb_masks = mb_masks.flatten()
return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
def learn(policy, env, seed, nsteps=5, nstack=4, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100):
tf.reset_default_graph()
set_global_seeds(seed)
nenvs = env.num_envs
ob_space = env.observation_space
ac_space = env.action_space
num_procs = len(env.remotes) # HACK
model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, nstack=nstack, num_procs=num_procs, ent_coef=ent_coef, vf_coef=vf_coef,
max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule)
runner = Runner(env, model, nsteps=nsteps, nstack=nstack, gamma=gamma)
nbatch = nenvs*nsteps
tstart = time.time()
for update in range(1, total_timesteps//nbatch+1):
obs, states, rewards, masks, actions, values = runner.run()
policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
nseconds = time.time()-tstart
fps = int((update*nbatch)/nseconds)
if update % log_interval == 0 or update == 1:
ev = explained_variance(values, rewards)
logger.record_tabular("nupdates", update)
logger.record_tabular("total_timesteps", update*nbatch)
logger.record_tabular("fps", fps)
logger.record_tabular("policy_entropy", float(policy_entropy))
logger.record_tabular("value_loss", float(value_loss))
logger.record_tabular("explained_variance", float(ev))
logger.dump_tabular()
env.close()
def main():
env_id = 'SpaceInvaders'
seed = 42
nenvs = 4
def make_env(rank):
def env_fn():
env = gym.make('{}NoFrameskip-v4'.format(env_id))
env.seed(seed + rank)
if logger.get_dir():
from baselines import bench
env = bench.Monitor(env, osp.join(logger.get_dir(), "{}.monitor.json".format(rank)))
gym.logger.setLevel(logging.WARN)
return wrap_deepmind(env)
return env_fn
set_global_seeds(seed)
env = SubprocVecEnv([make_env(i) for i in range(nenvs)])
policy = CnnPolicy
learn(policy, env, seed)
if __name__ == '__main__':
main()
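For reference, a minimal sketch of what the bootstrap step in `Runner.run` computes, using `discount_with_dones` from `baselines/a2c/utils.py` (the numbers are illustrative):

```python
from baselines.a2c.utils import discount_with_dones

# A 3-step rollout that did not terminate: append the value estimate,
# discount through the (done-free) sequence, then drop the bootstrap entry.
rewards, dones, value, gamma = [1., 0., 1.], [0, 0, 0], 0.5, 0.99
returns = discount_with_dones(rewards + [value], dones + [0], gamma)[:-1]
print(returns)  # [2.4652495, 1.48005, 1.495]
```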

baselines/a2c/policies.py

@@ -0,0 +1,207 @@
import numpy as np
import tensorflow as tf
from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm, sample, check_shape
from baselines.common.distributions import make_pdtype
import baselines.common.tf_util as U
import gym
class LnLstmPolicy(object):
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=256, reuse=False):
nbatch = nenv*nsteps
nh, nw, nc = ob_space.shape
ob_shape = (nbatch, nh, nw, nc*nstack)
nact = ac_space.n
X = tf.placeholder(tf.uint8, ob_shape) #obs
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
with tf.variable_scope("model", reuse=reuse):
h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
h3 = conv_to_fc(h3)
h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
xs = batch_to_seq(h4, nenv, nsteps)
ms = batch_to_seq(M, nenv, nsteps)
h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
h5 = seq_to_batch(h5)
pi = fc(h5, 'pi', nact, act=lambda x:x)
vf = fc(h5, 'v', 1, act=lambda x:x)
v0 = vf[:, 0]
a0 = sample(pi)
self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
def step(ob, state, mask):
a, v, s = sess.run([a0, v0, snew], {X:ob, S:state, M:mask})
return a, v, s
def value(ob, state, mask):
return sess.run(v0, {X:ob, S:state, M:mask})
self.X = X
self.M = M
self.S = S
self.pi = pi
self.vf = vf
self.step = step
self.value = value
class LstmPolicy(object):
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=256, reuse=False):
nbatch = nenv*nsteps
nh, nw, nc = ob_space.shape
ob_shape = (nbatch, nh, nw, nc*nstack)
nact = ac_space.n
X = tf.placeholder(tf.uint8, ob_shape) #obs
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
with tf.variable_scope("model", reuse=reuse):
h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
h3 = conv_to_fc(h3)
h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
xs = batch_to_seq(h4, nenv, nsteps)
ms = batch_to_seq(M, nenv, nsteps)
h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
h5 = seq_to_batch(h5)
pi = fc(h5, 'pi', nact, act=lambda x:x)
vf = fc(h5, 'v', 1, act=lambda x:x)
v0 = vf[:, 0]
a0 = sample(pi)
self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
def step(ob, state, mask):
a, v, s = sess.run([a0, v0, snew], {X:ob, S:state, M:mask})
return a, v, s
def value(ob, state, mask):
return sess.run(v0, {X:ob, S:state, M:mask})
self.X = X
self.M = M
self.S = S
self.pi = pi
self.vf = vf
self.step = step
self.value = value
class CnnPolicy(object):
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
nbatch = nenv*nsteps
nh, nw, nc = ob_space.shape
ob_shape = (nbatch, nh, nw, nc*nstack)
nact = ac_space.n
X = tf.placeholder(tf.uint8, ob_shape) #obs
with tf.variable_scope("model", reuse=reuse):
h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
h3 = conv_to_fc(h3)
h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
pi = fc(h4, 'pi', nact, act=lambda x:x)
vf = fc(h4, 'v', 1, act=lambda x:x)
v0 = vf[:, 0]
a0 = sample(pi)
self.initial_state = [] #not stateful
def step(ob, *_args, **_kwargs):
a, v = sess.run([a0, v0], {X:ob})
return a, v, [] #dummy state
def value(ob, *_args, **_kwargs):
return sess.run(v0, {X:ob})
self.X = X
self.pi = pi
self.vf = vf
self.step = step
self.value = value
class AcerCnnPolicy(object):
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
nbatch = nenv * nsteps
nh, nw, nc = ob_space.shape
ob_shape = (nbatch, nh, nw, nc * nstack)
nact = ac_space.n
X = tf.placeholder(tf.uint8, ob_shape) # obs
with tf.variable_scope("model", reuse=reuse):
h = conv(tf.cast(X, tf.float32) / 255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
h3 = conv_to_fc(h3)
h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
pi_logits = fc(h4, 'pi', nact, act=lambda x: x, init_scale=0.01)
pi = tf.nn.softmax(pi_logits)
q = fc(h4, 'q', nact, act=lambda x: x)
a = sample(pi_logits) # could change this to use self.pi instead
self.initial_state = [] # not stateful
self.X = X
self.pi = pi # actual policy params now
self.q = q
def step(ob, *args, **kwargs):
# returns actions, mus, states
a0, pi0 = sess.run([a, pi], {X: ob})
return a0, pi0, [] # dummy state
def out(ob, *args, **kwargs):
pi0, q0 = sess.run([pi, q], {X: ob})
return pi0, q0
def act(ob, *args, **kwargs):
return sess.run(a, {X: ob})
self.step = step
self.out = out
self.act = act
class AcerLstmPolicy(object):
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256):
nbatch = nenv * nsteps
nh, nw, nc = ob_space.shape
ob_shape = (nbatch, nh, nw, nc * nstack)
nact = ac_space.n
X = tf.placeholder(tf.uint8, ob_shape) # obs
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
with tf.variable_scope("model", reuse=reuse):
h = conv(tf.cast(X, tf.float32) / 255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
h3 = conv_to_fc(h3)
h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
# lstm
xs = batch_to_seq(h4, nenv, nsteps)
ms = batch_to_seq(M, nenv, nsteps)
h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
h5 = seq_to_batch(h5)
pi_logits = fc(h5, 'pi', nact, act=lambda x: x, init_scale=0.01)
pi = tf.nn.softmax(pi_logits)
q = fc(h5, 'q', nact, act=lambda x: x)
a = sample(pi_logits) # could change this to use self.pi instead
self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
self.X = X
self.M = M
self.S = S
self.pi = pi # actual policy params now
self.q = q
def step(ob, state, mask, *args, **kwargs):
# returns actions, mus, states
a0, pi0, s = sess.run([a, pi, snew], {X: ob, S: state, M: mask})
return a0, pi0, s
self.step = step
# For Mujoco. Taken from PPOSGD
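A minimal smoke test of the policy interface above (a sketch under TF1 semantics; the observation/action spaces and sizes are illustrative, not taken from this commit):

```python
import numpy as np
import tensorflow as tf
from gym import spaces
from baselines.a2c.policies import CnnPolicy

ob_space = spaces.Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)
ac_space = spaces.Discrete(4)
sess = tf.Session()
# nenv=2 parallel envs, nsteps=1 (acting), nstack=4 stacked frames
policy = CnnPolicy(sess, ob_space, ac_space, nenv=2, nsteps=1, nstack=4)
sess.run(tf.global_variables_initializer())
obs = np.zeros((2, 84, 84, 4), dtype=np.uint8)  # nc * nstack = 4 channels
actions, values, _ = policy.step(obs)
print(actions.shape, values.shape)  # (2,) (2,)
```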


@@ -0,0 +1,41 @@
#!/usr/bin/env python
import os, logging, gym
from baselines import logger
from baselines.common import set_global_seeds
from baselines import bench
from baselines.a2c.a2c import learn
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
from baselines.common.atari_wrappers import wrap_deepmind
from baselines.a2c.policies import CnnPolicy, LstmPolicy, LnLstmPolicy
def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu):
    num_timesteps //= 4  # wrap_deepmind's frame skip repeats each action for 4 frames
def make_env(rank):
def _thunk():
env = gym.make(env_id)
env.seed(seed + rank)
env = bench.Monitor(env, os.path.join(logger.get_dir(), "{}.monitor.json".format(rank)))
gym.logger.setLevel(logging.WARN)
return wrap_deepmind(env)
return _thunk
set_global_seeds(seed)
env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    else:
        raise ValueError("unknown policy: {}".format(policy))
learn(policy_fn, env, seed, total_timesteps=num_timesteps, lrschedule=lrschedule)
env.close()
def main():
train('BreakoutNoFrameskip-v4', num_timesteps=int(40e6), seed=0, policy='cnn', lrschedule='linear', num_cpu=16)
if __name__ == '__main__':
main()
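For instance, a hypothetical variant run using the layer-normalized LSTM policy (arguments exactly as defined in `train` above):

```python
train('PongNoFrameskip-v4', num_timesteps=int(10e6), seed=0,
      policy='lnlstm', lrschedule='constant', num_cpu=8)
```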

baselines/a2c/utils.py

@@ -0,0 +1,255 @@
import os
import gym
import numpy as np
import tensorflow as tf
from gym import spaces
from collections import deque
def sample(logits):
noise = tf.random_uniform(tf.shape(logits))
return tf.argmax(logits - tf.log(-tf.log(noise)), 1)
def cat_entropy(logits):
a0 = logits - tf.reduce_max(logits, 1, keep_dims=True)
ea0 = tf.exp(a0)
z0 = tf.reduce_sum(ea0, 1, keep_dims=True)
p0 = ea0 / z0
return tf.reduce_sum(p0 * (tf.log(z0) - a0), 1)
def cat_entropy_softmax(p0):
return - tf.reduce_sum(p0 * tf.log(p0 + 1e-6), axis = 1)
def mse(pred, target):
return tf.square(pred-target)/2.
def ortho_init(scale=1.0):
def _ortho_init(shape, dtype, partition_info=None):
#lasagne ortho init for tf
shape = tuple(shape)
if len(shape) == 2:
flat_shape = shape
elif len(shape) == 4: # assumes NHWC
flat_shape = (np.prod(shape[:-1]), shape[-1])
else:
raise NotImplementedError
a = np.random.normal(0.0, 1.0, flat_shape)
u, _, v = np.linalg.svd(a, full_matrices=False)
q = u if u.shape == flat_shape else v # pick the one with the correct shape
q = q.reshape(shape)
return (scale * q[:shape[0], :shape[1]]).astype(np.float32)
return _ortho_init
def conv(x, scope, nf, rf, stride, pad='VALID', act=tf.nn.relu, init_scale=1.0):
with tf.variable_scope(scope):
nin = x.get_shape()[3].value
w = tf.get_variable("w", [rf, rf, nin, nf], initializer=ortho_init(init_scale))
b = tf.get_variable("b", [nf], initializer=tf.constant_initializer(0.0))
z = tf.nn.conv2d(x, w, strides=[1, stride, stride, 1], padding=pad)+b
h = act(z)
return h
def fc(x, scope, nh, act=tf.nn.relu, init_scale=1.0):
with tf.variable_scope(scope):
nin = x.get_shape()[1].value
w = tf.get_variable("w", [nin, nh], initializer=ortho_init(init_scale))
b = tf.get_variable("b", [nh], initializer=tf.constant_initializer(0.0))
z = tf.matmul(x, w)+b
h = act(z)
return h
def batch_to_seq(h, nbatch, nsteps, flat=False):
if flat:
h = tf.reshape(h, [nbatch, nsteps])
else:
h = tf.reshape(h, [nbatch, nsteps, -1])
return [tf.squeeze(v, [1]) for v in tf.split(axis=1, num_or_size_splits=nsteps, value=h)]
def seq_to_batch(h, flat = False):
shape = h[0].get_shape().as_list()
if not flat:
assert(len(shape) > 1)
nh = h[0].get_shape()[-1].value
return tf.reshape(tf.concat(axis=1, values=h), [-1, nh])
else:
return tf.reshape(tf.stack(values=h, axis=1), [-1])
def lstm(xs, ms, s, scope, nh, init_scale=1.0):
nbatch, nin = [v.value for v in xs[0].get_shape()]
nsteps = len(xs)
with tf.variable_scope(scope):
wx = tf.get_variable("wx", [nin, nh*4], initializer=ortho_init(init_scale))
wh = tf.get_variable("wh", [nh, nh*4], initializer=ortho_init(init_scale))
b = tf.get_variable("b", [nh*4], initializer=tf.constant_initializer(0.0))
c, h = tf.split(axis=1, num_or_size_splits=2, value=s)
for idx, (x, m) in enumerate(zip(xs, ms)):
c = c*(1-m)
h = h*(1-m)
z = tf.matmul(x, wx) + tf.matmul(h, wh) + b
i, f, o, u = tf.split(axis=1, num_or_size_splits=4, value=z)
i = tf.nn.sigmoid(i)
f = tf.nn.sigmoid(f)
o = tf.nn.sigmoid(o)
u = tf.tanh(u)
c = f*c + i*u
h = o*tf.tanh(c)
xs[idx] = h
s = tf.concat(axis=1, values=[c, h])
return xs, s
def _ln(x, g, b, e=1e-5, axes=[1]):
u, s = tf.nn.moments(x, axes=axes, keep_dims=True)
x = (x-u)/tf.sqrt(s+e)
x = x*g+b
return x
def lnlstm(xs, ms, s, scope, nh, init_scale=1.0):
nbatch, nin = [v.value for v in xs[0].get_shape()]
nsteps = len(xs)
with tf.variable_scope(scope):
wx = tf.get_variable("wx", [nin, nh*4], initializer=ortho_init(init_scale))
gx = tf.get_variable("gx", [nh*4], initializer=tf.constant_initializer(1.0))
bx = tf.get_variable("bx", [nh*4], initializer=tf.constant_initializer(0.0))
wh = tf.get_variable("wh", [nh, nh*4], initializer=ortho_init(init_scale))
gh = tf.get_variable("gh", [nh*4], initializer=tf.constant_initializer(1.0))
bh = tf.get_variable("bh", [nh*4], initializer=tf.constant_initializer(0.0))
b = tf.get_variable("b", [nh*4], initializer=tf.constant_initializer(0.0))
gc = tf.get_variable("gc", [nh], initializer=tf.constant_initializer(1.0))
bc = tf.get_variable("bc", [nh], initializer=tf.constant_initializer(0.0))
c, h = tf.split(axis=1, num_or_size_splits=2, value=s)
for idx, (x, m) in enumerate(zip(xs, ms)):
c = c*(1-m)
h = h*(1-m)
z = _ln(tf.matmul(x, wx), gx, bx) + _ln(tf.matmul(h, wh), gh, bh) + b
i, f, o, u = tf.split(axis=1, num_or_size_splits=4, value=z)
i = tf.nn.sigmoid(i)
f = tf.nn.sigmoid(f)
o = tf.nn.sigmoid(o)
u = tf.tanh(u)
c = f*c + i*u
h = o*tf.tanh(_ln(c, gc, bc))
xs[idx] = h
s = tf.concat(axis=1, values=[c, h])
return xs, s
def conv_to_fc(x):
nh = np.prod([v.value for v in x.get_shape()[1:]])
x = tf.reshape(x, [-1, nh])
return x
def discount_with_dones(rewards, dones, gamma):
discounted = []
r = 0
for reward, done in zip(rewards[::-1], dones[::-1]):
r = reward + gamma*r*(1.-done) # fixed off by one bug
discounted.append(r)
return discounted[::-1]
def find_trainable_variables(key):
with tf.variable_scope(key):
return tf.trainable_variables()
def make_path(f):
return os.makedirs(f, exist_ok=True)
def constant(p):
return 1
def linear(p):
return 1-p
schedules = {
'linear':linear,
'constant':constant
}
class Scheduler(object):
def __init__(self, v, nvalues, schedule):
self.n = 0.
self.v = v
self.nvalues = nvalues
self.schedule = schedules[schedule]
def value(self):
current_value = self.v*self.schedule(self.n/self.nvalues)
self.n += 1.
return current_value
def value_steps(self, steps):
return self.v*self.schedule(steps/self.nvalues)
class EpisodeStats:
def __init__(self, nsteps, nenvs):
self.episode_rewards = []
for i in range(nenvs):
self.episode_rewards.append([])
self.lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths
self.rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards
self.nsteps = nsteps
self.nenvs = nenvs
def feed(self, rewards, masks):
rewards = np.reshape(rewards, [self.nenvs, self.nsteps])
masks = np.reshape(masks, [self.nenvs, self.nsteps])
for i in range(0, self.nenvs):
for j in range(0, self.nsteps):
self.episode_rewards[i].append(rewards[i][j])
if masks[i][j]:
l = len(self.episode_rewards[i])
s = sum(self.episode_rewards[i])
self.lenbuffer.append(l)
self.rewbuffer.append(s)
self.episode_rewards[i] = []
def mean_length(self):
if self.lenbuffer:
return np.mean(self.lenbuffer)
else:
return 0 # on the first params dump, no episodes are finished
def mean_reward(self):
if self.rewbuffer:
return np.mean(self.rewbuffer)
else:
return 0
# For ACER
def get_by_index(x, idx):
assert(len(x.get_shape()) == 2)
assert(len(idx.get_shape()) == 1)
idx_flattened = tf.range(0, x.shape[0]) * x.shape[1] + idx
y = tf.gather(tf.reshape(x, [-1]), # flatten input
idx_flattened) # use flattened indices
return y
def check_shape(ts,shapes):
i = 0
for (t,shape) in zip(ts,shapes):
assert t.get_shape().as_list()==shape, "id " + str(i) + " shape " + str(t.get_shape()) + str(shape)
i += 1
def avg_norm(t):
return tf.reduce_mean(tf.sqrt(tf.reduce_sum(tf.square(t), axis=-1)))
def myadd(g1, g2, param):
print([g1, g2, param.name])
assert (not (g1 is None and g2 is None)), param.name
if g1 is None:
return g2
elif g2 is None:
return g1
else:
return g1 + g2
def my_explained_variance(qpred, q):
_, vary = tf.nn.moments(q, axes=[0, 1])
_, varpred = tf.nn.moments(q - qpred, axes=[0, 1])
check_shape([vary, varpred], [[]] * 2)
return 1.0 - (varpred / vary)
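A small illustration of the `Scheduler` above: each `value()` call advances the schedule by one timestep (the numbers are illustrative):

```python
from baselines.a2c.utils import Scheduler

s = Scheduler(v=7e-4, nvalues=100, schedule='linear')
print([round(s.value(), 6) for _ in range(3)])  # [0.0007, 0.000693, 0.000686]
```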


@@ -0,0 +1,138 @@
import numpy as np
import tensorflow as tf
from baselines import logger
from baselines import common
from baselines.common import tf_util as U
from baselines.acktr import kfac
from baselines.acktr.filters import ZFilter
def pathlength(path):
    return path["reward"].shape[0]
def rollout(env, policy, max_pathlength, animate=False, obfilter=None):
"""
Simulate the env and policy for max_pathlength steps
"""
ob = env.reset()
prev_ob = np.float32(np.zeros(ob.shape))
if obfilter: ob = obfilter(ob)
terminated = False
obs = []
acs = []
ac_dists = []
logps = []
rewards = []
for _ in range(max_pathlength):
if animate:
env.render()
state = np.concatenate([ob, prev_ob], -1)
obs.append(state)
ac, ac_dist, logp = policy.act(state)
acs.append(ac)
ac_dists.append(ac_dist)
logps.append(logp)
prev_ob = np.copy(ob)
scaled_ac = env.action_space.low + (ac + 1.) * 0.5 * (env.action_space.high - env.action_space.low)
scaled_ac = np.clip(scaled_ac, env.action_space.low, env.action_space.high)
ob, rew, done, _ = env.step(scaled_ac)
if obfilter: ob = obfilter(ob)
rewards.append(rew)
if done:
terminated = True
break
return {"observation" : np.array(obs), "terminated" : terminated,
"reward" : np.array(rewards), "action" : np.array(acs),
"action_dist": np.array(ac_dists), "logp" : np.array(logps)}
def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps,
animate=False, callback=None, optimizer="adam", desired_kl=0.002):
obfilter = ZFilter(env.observation_space.shape)
max_pathlength = env.spec.timestep_limit
stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize')
inputs, loss, loss_sampled = policy.update_info
optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=0.9, kfac_update=2,\
epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1,
weight_decay_dict=policy.wd_dict, max_grad_norm=None)
pi_var_list = []
for var in tf.trainable_variables():
if "pi" in var.name:
pi_var_list.append(var)
update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list)
do_update = U.function(inputs, update_op)
U.initialize()
# start queue runners
enqueue_threads = []
coord = tf.train.Coordinator()
for qr in [q_runner, vf.q_runner]:
        assert qr is not None
enqueue_threads.extend(qr.create_threads(U.get_session(), coord=coord, start=True))
i = 0
timesteps_so_far = 0
while True:
if timesteps_so_far > num_timesteps:
break
logger.log("********** Iteration %i ************"%i)
# Collect paths until we have enough timesteps
timesteps_this_batch = 0
paths = []
while True:
path = rollout(env, policy, max_pathlength, animate=(len(paths)==0 and (i % 10 == 0) and animate), obfilter=obfilter)
paths.append(path)
n = pathlength(path)
timesteps_this_batch += n
timesteps_so_far += n
if timesteps_this_batch > timesteps_per_batch:
break
# Estimate advantage function
vtargs = []
advs = []
for path in paths:
rew_t = path["reward"]
return_t = common.discount(rew_t, gamma)
vtargs.append(return_t)
vpred_t = vf.predict(path)
vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1])
delta_t = rew_t + gamma*vpred_t[1:] - vpred_t[:-1]
adv_t = common.discount(delta_t, gamma * lam)
advs.append(adv_t)
# Update value function
vf.fit(paths, vtargs)
# Build arrays for policy update
ob_no = np.concatenate([path["observation"] for path in paths])
action_na = np.concatenate([path["action"] for path in paths])
oldac_dist = np.concatenate([path["action_dist"] for path in paths])
logp_n = np.concatenate([path["logp"] for path in paths])
adv_n = np.concatenate(advs)
standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)
# Policy update
do_update(ob_no, action_na, standardized_adv_n)
# Adjust stepsize
kl = policy.compute_kl(ob_no, oldac_dist)
if kl > desired_kl * 2:
logger.log("kl too high")
U.eval(tf.assign(stepsize, stepsize / 1.5))
elif kl < desired_kl / 2:
logger.log("kl too low")
U.eval(tf.assign(stepsize, stepsize * 1.5))
else:
logger.log("kl just right!")
logger.record_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths]))
logger.record_tabular("EpRewSEM", np.std([path["reward"].sum()/np.sqrt(len(paths)) for path in paths]))
logger.record_tabular("EpLenMean", np.mean([pathlength(path) for path in paths]))
logger.record_tabular("KL", kl)
if callback:
callback()
logger.dump_tabular()
i += 1
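The advantage estimation in the loop above is GAE: discounted TD residuals with factor `gamma * lam`. A minimal numpy sketch of the same computation, with a local `discount` standing in for `baselines.common.discount` (assumed to be a discounted cumulative sum) and illustrative numbers:

```python
import numpy as np

def discount(x, gamma):
    # discounted cumulative sum: out[t] = sum_k gamma^k * x[t+k]
    out, acc = np.zeros_like(x), 0.0
    for t in reversed(range(len(x))):
        acc = x[t] + gamma * acc
        out[t] = acc
    return out

gamma, lam = 0.99, 0.97
rew_t = np.array([1.0, 0.0, 1.0])
vpred_t = np.array([0.5, 0.4, 0.6, 0.0])  # appended 0.0 bootstrap: path terminated
delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1]
adv_t = discount(delta_t, gamma * lam)
```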


@@ -0,0 +1,214 @@
import os.path as osp
import time
import joblib
import numpy as np
import tensorflow as tf
from baselines import logger
from baselines.common import set_global_seeds, explained_variance
from baselines.acktr.utils import discount_with_dones
from baselines.acktr.utils import Scheduler, find_trainable_variables
from baselines.acktr.utils import cat_entropy, mse
from baselines.acktr import kfac
class Model(object):
    def __init__(self, policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=32, nsteps=20,
                 nstack=4, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
                 kfac_clip=0.001, lrschedule='linear'):
config = tf.ConfigProto(allow_soft_placement=True,
intra_op_parallelism_threads=nprocs,
inter_op_parallelism_threads=nprocs)
config.gpu_options.allow_growth = True
self.sess = sess = tf.Session(config=config)
nact = ac_space.n
nbatch = nenvs * nsteps
A = tf.placeholder(tf.int32, [nbatch])
ADV = tf.placeholder(tf.float32, [nbatch])
R = tf.placeholder(tf.float32, [nbatch])
PG_LR = tf.placeholder(tf.float32, [])
VF_LR = tf.placeholder(tf.float32, [])
self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)
logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
self.logits = logits = train_model.pi
##training loss
pg_loss = tf.reduce_mean(ADV*logpac)
entropy = tf.reduce_mean(cat_entropy(train_model.pi))
pg_loss = pg_loss - ent_coef * entropy
vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
train_loss = pg_loss + vf_coef * vf_loss
##Fisher loss construction
self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf))
self.vf_fisher = vf_fisher_loss = - vf_fisher_coef*tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss
        self.params = params = find_trainable_variables("model")
        self.grads_check = grads = tf.gradients(train_loss, params)
with tf.device('/gpu:0'):
self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\
momentum=0.9, kfac_update=1, epsilon=0.01,\
stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)
update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
train_op, q_runner = optim.apply_gradients(list(zip(grads,params)))
self.q_runner = q_runner
self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)
def train(obs, states, rewards, masks, actions, values):
advs = rewards - values
for step in range(len(obs)):
cur_lr = self.lr.value()
td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, PG_LR:cur_lr}
if states != []:
td_map[train_model.S] = states
td_map[train_model.M] = masks
policy_loss, value_loss, policy_entropy, _ = sess.run(
[pg_loss, vf_loss, entropy, train_op],
td_map
)
return policy_loss, value_loss, policy_entropy
def save(save_path):
ps = sess.run(params)
joblib.dump(ps, save_path)
def load(load_path):
loaded_params = joblib.load(load_path)
restores = []
for p, loaded_p in zip(params, loaded_params):
restores.append(p.assign(loaded_p))
sess.run(restores)
self.train = train
self.save = save
self.load = load
self.train_model = train_model
self.step_model = step_model
self.step = step_model.step
self.value = step_model.value
self.initial_state = step_model.initial_state
tf.global_variables_initializer().run(session=sess)
class Runner(object):
def __init__(self, env, model, nsteps, nstack, gamma):
self.env = env
self.model = model
nh, nw, nc = env.observation_space.shape
nenv = env.num_envs
self.batch_ob_shape = (nenv*nsteps, nh, nw, nc*nstack)
self.obs = np.zeros((nenv, nh, nw, nc*nstack), dtype=np.uint8)
obs = env.reset()
self.update_obs(obs)
self.gamma = gamma
self.nsteps = nsteps
self.states = model.initial_state
self.dones = [False for _ in range(nenv)]
def update_obs(self, obs):
self.obs = np.roll(self.obs, shift=-1, axis=3)
self.obs[:, :, :, -1] = obs[:, :, :, 0]
def run(self):
mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
mb_states = self.states
for n in range(self.nsteps):
actions, values, states = self.model.step(self.obs, self.states, self.dones)
mb_obs.append(np.copy(self.obs))
mb_actions.append(actions)
mb_values.append(values)
mb_dones.append(self.dones)
obs, rewards, dones, _ = self.env.step(actions)
self.states = states
self.dones = dones
for n, done in enumerate(dones):
if done:
self.obs[n] = self.obs[n]*0
self.update_obs(obs)
mb_rewards.append(rewards)
mb_dones.append(self.dones)
#batch of steps to batch of rollouts
mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
mb_masks = mb_dones[:, :-1]
mb_dones = mb_dones[:, 1:]
last_values = self.model.value(self.obs, self.states, self.dones).tolist()
#discount/bootstrap off value fn
for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
rewards = rewards.tolist()
dones = dones.tolist()
if dones[-1] == 0:
rewards = discount_with_dones(rewards+[value], dones+[0], self.gamma)[:-1]
else:
rewards = discount_with_dones(rewards, dones, self.gamma)
mb_rewards[n] = rewards
mb_rewards = mb_rewards.flatten()
mb_actions = mb_actions.flatten()
mb_values = mb_values.flatten()
mb_masks = mb_masks.flatten()
return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20,
nstack=4, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
kfac_clip=0.001, save_interval=None, lrschedule='linear'):
tf.reset_default_graph()
set_global_seeds(seed)
nenvs = env.num_envs
ob_space = env.observation_space
ac_space = env.action_space
    make_model = lambda: Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs,
                               nsteps=nsteps, nstack=nstack, ent_coef=ent_coef, vf_coef=vf_coef,
                               vf_fisher_coef=vf_fisher_coef, lr=lr, max_grad_norm=max_grad_norm,
                               kfac_clip=kfac_clip, lrschedule=lrschedule)
if save_interval and logger.get_dir():
import cloudpickle
with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
fh.write(cloudpickle.dumps(make_model))
model = make_model()
runner = Runner(env, model, nsteps=nsteps, nstack=nstack, gamma=gamma)
nbatch = nenvs*nsteps
tstart = time.time()
enqueue_threads = model.q_runner.create_threads(model.sess, coord=tf.train.Coordinator(), start=True)
for update in range(1, total_timesteps//nbatch+1):
obs, states, rewards, masks, actions, values = runner.run()
policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
model.old_obs = obs
nseconds = time.time()-tstart
fps = int((update*nbatch)/nseconds)
if update % log_interval == 0 or update == 1:
ev = explained_variance(values, rewards)
logger.record_tabular("nupdates", update)
logger.record_tabular("total_timesteps", update*nbatch)
logger.record_tabular("fps", fps)
logger.record_tabular("policy_entropy", float(policy_entropy))
logger.record_tabular("policy_loss", float(policy_loss))
logger.record_tabular("value_loss", float(value_loss))
logger.record_tabular("explained_variance", float(ev))
logger.dump_tabular()
if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
savepath = osp.join(logger.get_dir(), 'checkpoint%.5i'%update)
print('Saving to', savepath)
model.save(savepath)
env.close()
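A minimal invocation sketch for this discrete-control `learn` (hypothetical; it mirrors the Atari setup used for A2C earlier in this commit):

```python
import gym
from baselines.a2c.policies import CnnPolicy
from baselines.common.atari_wrappers import wrap_deepmind
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

def make_env(rank):
    def _thunk():
        env = gym.make('BreakoutNoFrameskip-v4')
        env.seed(rank)
        return wrap_deepmind(env)
    return _thunk

env = SubprocVecEnv([make_env(i) for i in range(16)])
learn(CnnPolicy, env, seed=0, total_timesteps=int(40e6), nprocs=16)
```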

baselines/acktr/filters.py

@@ -0,0 +1,98 @@
from baselines.acktr.running_stat import RunningStat
from collections import deque
import numpy as np
class Filter(object):
def __call__(self, x, update=True):
raise NotImplementedError
def reset(self):
pass
class IdentityFilter(Filter):
def __call__(self, x, update=True):
return x
class CompositionFilter(Filter):
def __init__(self, fs):
self.fs = fs
def __call__(self, x, update=True):
for f in self.fs:
x = f(x)
return x
def output_shape(self, input_space):
out = input_space.shape
for f in self.fs:
out = f.output_shape(out)
return out
class ZFilter(Filter):
"""
    y = (x - mean) / std
    using running estimates of mean and std
"""
def __init__(self, shape, demean=True, destd=True, clip=10.0):
self.demean = demean
self.destd = destd
self.clip = clip
self.rs = RunningStat(shape)
def __call__(self, x, update=True):
if update: self.rs.push(x)
if self.demean:
x = x - self.rs.mean
if self.destd:
x = x / (self.rs.std+1e-8)
if self.clip:
x = np.clip(x, -self.clip, self.clip)
return x
def output_shape(self, input_space):
return input_space.shape
class AddClock(Filter):
def __init__(self):
self.count = 0
def reset(self):
self.count = 0
def __call__(self, x, update=True):
return np.append(x, self.count/100.0)
def output_shape(self, input_space):
return (input_space.shape[0]+1,)
class FlattenFilter(Filter):
def __call__(self, x, update=True):
return x.ravel()
def output_shape(self, input_space):
return (int(np.prod(input_space.shape)),)
class Ind2OneHotFilter(Filter):
def __init__(self, n):
self.n = n
def __call__(self, x, update=True):
out = np.zeros(self.n)
out[x] = 1
return out
def output_shape(self, input_space):
return (input_space.n,)
class DivFilter(Filter):
def __init__(self, divisor):
self.divisor = divisor
def __call__(self, x, update=True):
return x / self.divisor
def output_shape(self, input_space):
return input_space.shape
class StackFilter(Filter):
def __init__(self, length):
self.stack = deque(maxlen=length)
def reset(self):
self.stack.clear()
def __call__(self, x, update=True):
self.stack.append(x)
while len(self.stack) < self.stack.maxlen:
self.stack.append(x)
return np.concatenate(self.stack, axis=-1)
def output_shape(self, input_space):
return input_space.shape[:-1] + (input_space.shape[-1] * self.stack.maxlen,)
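A quick illustration of `ZFilter`'s running normalization (the values are illustrative):

```python
import numpy as np
from baselines.acktr.filters import ZFilter

zf = ZFilter(shape=(2,))
for x in [np.array([1., 10.]), np.array([2., 20.]), np.array([3., 30.])]:
    y = zf(x)  # push x into the running stats, then demean/destd and clip
print(zf.rs.mean)  # ~[ 2. 20.]
```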

baselines/acktr/kfac.py

@@ -0,0 +1,926 @@
import tensorflow as tf
import numpy as np
import re
from baselines.acktr.kfac_utils import *
from functools import reduce
KFAC_OPS = ['MatMul', 'Conv2D', 'BiasAdd']
KFAC_DEBUG = False
class KfacOptimizer():
    def __init__(self, learning_rate=0.01, momentum=0.9, clip_kl=0.01, kfac_update=2,
                 stats_accum_iter=60, full_stats_init=False, cold_iter=100, cold_lr=None,
                 async=False, async_stats=False, epsilon=1e-2, stats_decay=0.95,
                 blockdiag_bias=False, channel_fac=False, factored_damping=False,
                 approxT2=False, use_float64=False, weight_decay_dict={}, max_grad_norm=0.5):
self.max_grad_norm = max_grad_norm
self._lr = learning_rate
self._momentum = momentum
self._clip_kl = clip_kl
self._channel_fac = channel_fac
self._kfac_update = kfac_update
self._async = async
self._async_stats = async_stats
self._epsilon = epsilon
self._stats_decay = stats_decay
self._blockdiag_bias = blockdiag_bias
self._approxT2 = approxT2
self._use_float64 = use_float64
self._factored_damping = factored_damping
self._cold_iter = cold_iter
        if cold_lr is None:
            # good heuristic
            self._cold_lr = self._lr  # * 3.
else:
self._cold_lr = cold_lr
self._stats_accum_iter = stats_accum_iter
self._weight_decay_dict = weight_decay_dict
self._diag_init_coeff = 0.
self._full_stats_init = full_stats_init
if not self._full_stats_init:
self._stats_accum_iter = self._cold_iter
self.sgd_step = tf.Variable(0, name='KFAC/sgd_step', trainable=False)
self.global_step = tf.Variable(
0, name='KFAC/global_step', trainable=False)
self.cold_step = tf.Variable(0, name='KFAC/cold_step', trainable=False)
self.factor_step = tf.Variable(
0, name='KFAC/factor_step', trainable=False)
self.stats_step = tf.Variable(
0, name='KFAC/stats_step', trainable=False)
self.vFv = tf.Variable(0., name='KFAC/vFv', trainable=False)
self.factors = {}
self.param_vars = []
self.stats = {}
self.stats_eigen = {}
def getFactors(self, g, varlist):
graph = tf.get_default_graph()
factorTensors = {}
fpropTensors = []
bpropTensors = []
opTypes = []
fops = []
        def searchFactors(gradient, graph):
            # hard-coded search strategy
            bpropOp = gradient.op
            bpropOp_name = bpropOp.name
            bTensors = []
            fTensors = []
            # combine additive gradients; assume they are the same op type and
            # independent
if 'AddN' in bpropOp_name:
factors = []
for g in gradient.op.inputs:
factors.append(searchFactors(g, graph))
op_names = [item['opName'] for item in factors]
# TO-DO: need to check all the attribute of the ops as well
                print(gradient.name)
                print(op_names)
                print(len(np.unique(op_names)))
assert len(np.unique(op_names)) == 1, gradient.name + \
' is shared among different computation OPs'
bTensors = reduce(lambda x, y: x + y,
[item['bpropFactors'] for item in factors])
if len(factors[0]['fpropFactors']) > 0:
fTensors = reduce(
lambda x, y: x + y, [item['fpropFactors'] for item in factors])
fpropOp_name = op_names[0]
fpropOp = factors[0]['op']
else:
fpropOp_name = re.search(
'gradientsSampled(_[0-9]+|)/(.+?)_grad', bpropOp_name).group(2)
fpropOp = graph.get_operation_by_name(fpropOp_name)
if fpropOp.op_def.name in KFAC_OPS:
# Known OPs
###
bTensor = [
i for i in bpropOp.inputs if 'gradientsSampled' in i.name][-1]
bTensorShape = fpropOp.outputs[0].get_shape()
if bTensor.get_shape()[0].value == None:
bTensor.set_shape(bTensorShape)
bTensors.append(bTensor)
###
if fpropOp.op_def.name == 'BiasAdd':
fTensors = []
else:
fTensors.append(
[i for i in fpropOp.inputs if param.op.name not in i.name][0])
fpropOp_name = fpropOp.op_def.name
else:
# unknown OPs, block approximation used
bInputsList = [i for i in bpropOp.inputs[
0].op.inputs if 'gradientsSampled' in i.name if 'Shape' not in i.name]
if len(bInputsList) > 0:
bTensor = bInputsList[0]
bTensorShape = fpropOp.outputs[0].get_shape()
if len(bTensor.get_shape()) > 0 and bTensor.get_shape()[0].value == None:
bTensor.set_shape(bTensorShape)
bTensors.append(bTensor)
                fpropOp_name = 'UNK-' + fpropOp.op_def.name  # record a placeholder name for unknown ops
                opTypes.append(fpropOp_name)
return {'opName': fpropOp_name, 'op': fpropOp, 'fpropFactors': fTensors, 'bpropFactors': bTensors}
for t, param in zip(g, varlist):
if KFAC_DEBUG:
print(('get factor for '+param.name))
factors = searchFactors(t, graph)
factorTensors[param] = factors
########
        # check associated weights and bias for homogeneous coordinate representation
        # and check redundant factors
        # TO-DO: there may be a bug in detecting associated bias and weights for
        # forking layers, e.g. in inception models.
for param in varlist:
factorTensors[param]['assnWeights'] = None
factorTensors[param]['assnBias'] = None
for param in varlist:
if factorTensors[param]['opName'] == 'BiasAdd':
factorTensors[param]['assnWeights'] = None
for item in varlist:
if len(factorTensors[item]['bpropFactors']) > 0:
if (set(factorTensors[item]['bpropFactors']) == set(factorTensors[param]['bpropFactors'])) and (len(factorTensors[item]['fpropFactors']) > 0):
factorTensors[param]['assnWeights'] = item
factorTensors[item]['assnBias'] = param
factorTensors[param]['bpropFactors'] = factorTensors[
item]['bpropFactors']
########
########
# concatenate the additive gradients along the batch dimension, i.e.
# assuming independence structure
for key in ['fpropFactors', 'bpropFactors']:
for i, param in enumerate(varlist):
if len(factorTensors[param][key]) > 0:
if (key + '_concat') not in factorTensors[param]:
name_scope = factorTensors[param][key][0].name.split(':')[
0]
with tf.name_scope(name_scope):
factorTensors[param][
key + '_concat'] = tf.concat(factorTensors[param][key], 0)
else:
factorTensors[param][key + '_concat'] = None
for j, param2 in enumerate(varlist[(i + 1):]):
if (len(factorTensors[param][key]) > 0) and (set(factorTensors[param2][key]) == set(factorTensors[param][key])):
factorTensors[param2][key] = factorTensors[param][key]
factorTensors[param2][
key + '_concat'] = factorTensors[param][key + '_concat']
########
        if KFAC_DEBUG:
            for items in zip(varlist, fpropTensors, bpropTensors, opTypes):
                print((items[0].name, factorTensors[items[0]]))
self.factors = factorTensors
return factorTensors
def getStats(self, factors, varlist):
if len(self.stats) == 0:
# initialize stats variables on CPU because eigen decomp is
# computed on CPU
with tf.device('/cpu'):
tmpStatsCache = {}
# search for tensor factors and
# use block diag approx for the bias units
for var in varlist:
fpropFactor = factors[var]['fpropFactors_concat']
bpropFactor = factors[var]['bpropFactors_concat']
opType = factors[var]['opName']
if opType == 'Conv2D':
Kh = var.get_shape()[0]
Kw = var.get_shape()[1]
C = fpropFactor.get_shape()[-1]
Oh = bpropFactor.get_shape()[1]
Ow = bpropFactor.get_shape()[2]
                        if Oh == 1 and Ow == 1 and self._channel_fac:
                            # factorization along the channels does not support
                            # homogeneous coordinates
var_assnBias = factors[var]['assnBias']
if var_assnBias:
factors[var]['assnBias'] = None
factors[var_assnBias]['assnWeights'] = None
##
for var in varlist:
fpropFactor = factors[var]['fpropFactors_concat']
bpropFactor = factors[var]['bpropFactors_concat']
opType = factors[var]['opName']
self.stats[var] = {'opName': opType,
'fprop_concat_stats': [],
'bprop_concat_stats': [],
'assnWeights': factors[var]['assnWeights'],
'assnBias': factors[var]['assnBias'],
}
if fpropFactor is not None:
if fpropFactor not in tmpStatsCache:
if opType == 'Conv2D':
Kh = var.get_shape()[0]
Kw = var.get_shape()[1]
C = fpropFactor.get_shape()[-1]
Oh = bpropFactor.get_shape()[1]
Ow = bpropFactor.get_shape()[2]
                                if Oh == 1 and Ow == 1 and self._channel_fac:
                                    # factorization along the channels:
                                    # assume independence between input channels and spatial dims,
                                    # giving a 2K-1 x 2K-1 covariance matrix and a C x C covariance matrix
                                    # factorization along the channels does not
                                    # support homogeneous coordinates; assnBias
                                    # is always None
fpropFactor2_size = Kh * Kw
slot_fpropFactor_stats2 = tf.Variable(tf.diag(tf.ones(
[fpropFactor2_size])) * self._diag_init_coeff, name='KFAC_STATS/' + fpropFactor.op.name, trainable=False)
self.stats[var]['fprop_concat_stats'].append(
slot_fpropFactor_stats2)
fpropFactor_size = C
else:
# 2K-1 x 2K-1 x C x C covariance matrix
# assume BHWC
fpropFactor_size = Kh * Kw * C
else:
# D x D covariance matrix
fpropFactor_size = fpropFactor.get_shape()[-1]
# use homogeneous coordinate
if not self._blockdiag_bias and self.stats[var]['assnBias']:
fpropFactor_size += 1
slot_fpropFactor_stats = tf.Variable(tf.diag(tf.ones(
[fpropFactor_size])) * self._diag_init_coeff, name='KFAC_STATS/' + fpropFactor.op.name, trainable=False)
self.stats[var]['fprop_concat_stats'].append(
slot_fpropFactor_stats)
if opType != 'Conv2D':
tmpStatsCache[fpropFactor] = self.stats[
var]['fprop_concat_stats']
else:
self.stats[var][
'fprop_concat_stats'] = tmpStatsCache[fpropFactor]
if bpropFactor is not None:
# no need to collect backward stats for bias vectors if
# using homogeneous coordinates
if not((not self._blockdiag_bias) and self.stats[var]['assnWeights']):
if bpropFactor not in tmpStatsCache:
slot_bpropFactor_stats = tf.Variable(tf.diag(tf.ones([bpropFactor.get_shape(
)[-1]])) * self._diag_init_coeff, name='KFAC_STATS/' + bpropFactor.op.name, trainable=False)
self.stats[var]['bprop_concat_stats'].append(
slot_bpropFactor_stats)
tmpStatsCache[bpropFactor] = self.stats[
var]['bprop_concat_stats']
else:
self.stats[var][
'bprop_concat_stats'] = tmpStatsCache[bpropFactor]
return self.stats
def compute_and_apply_stats(self, loss_sampled, var_list=None):
varlist = var_list
if varlist is None:
varlist = tf.trainable_variables()
stats = self.compute_stats(loss_sampled, var_list=varlist)
return self.apply_stats(stats)
def compute_stats(self, loss_sampled, var_list=None):
varlist = var_list
if varlist is None:
varlist = tf.trainable_variables()
gs = tf.gradients(loss_sampled, varlist, name='gradientsSampled')
self.gs = gs
factors = self.getFactors(gs, varlist)
stats = self.getStats(factors, varlist)
updateOps = []
statsUpdates = {}
statsUpdates_cache = {}
for var in varlist:
opType = factors[var]['opName']
fops = factors[var]['op']
fpropFactor = factors[var]['fpropFactors_concat']
fpropStats_vars = stats[var]['fprop_concat_stats']
bpropFactor = factors[var]['bpropFactors_concat']
bpropStats_vars = stats[var]['bprop_concat_stats']
SVD_factors = {}
for stats_var in fpropStats_vars:
stats_var_dim = int(stats_var.get_shape()[0])
if stats_var not in statsUpdates_cache:
old_fpropFactor = fpropFactor
B = (tf.shape(fpropFactor)[0]) # batch size
if opType == 'Conv2D':
strides = fops.get_attr("strides")
padding = fops.get_attr("padding")
convkernel_size = var.get_shape()[0:3]
KH = int(convkernel_size[0])
KW = int(convkernel_size[1])
C = int(convkernel_size[2])
flatten_size = int(KH * KW * C)
Oh = int(bpropFactor.get_shape()[1])
Ow = int(bpropFactor.get_shape()[2])
if Oh == 1 and Ow == 1 and self._channel_fac:
# factorization along the channels
# assume independence among input channels
# factor = B x 1 x 1 x (KH xKW x C)
# patches = B x Oh x Ow x (KH xKW x C)
if len(SVD_factors) == 0:
if KFAC_DEBUG:
print(('approx %s act factor with rank-1 SVD factors' % (var.name)))
# find closest rank-1 approx to the feature map
S, U, V = tf.batch_svd(tf.reshape(
fpropFactor, [-1, KH * KW, C]))
# get rank-1 approx slides
sqrtS1 = tf.expand_dims(tf.sqrt(S[:, 0, 0]), 1)
patches_k = U[:, :, 0] * sqrtS1 # B x KH*KW
full_factor_shape = fpropFactor.get_shape()
patches_k.set_shape(
[full_factor_shape[0], KH * KW])
patches_c = V[:, :, 0] * sqrtS1 # B x C
patches_c.set_shape([full_factor_shape[0], C])
SVD_factors[C] = patches_c
SVD_factors[KH * KW] = patches_k
fpropFactor = SVD_factors[stats_var_dim]
else:
# poor mem usage implementation
patches = tf.extract_image_patches(fpropFactor, ksizes=[1, convkernel_size[
0], convkernel_size[1], 1], strides=strides, rates=[1, 1, 1, 1], padding=padding)
if self._approxT2:
if KFAC_DEBUG:
print(('approxT2 act fisher for %s' % (var.name)))
# T^2 terms * 1/T^2, size: B x C
fpropFactor = tf.reduce_mean(patches, [1, 2])
else:
# size: (B x Oh x Ow) x C
fpropFactor = tf.reshape(
patches, [-1, flatten_size]) / Oh / Ow
fpropFactor_size = int(fpropFactor.get_shape()[-1])
if stats_var_dim == (fpropFactor_size + 1) and not self._blockdiag_bias:
if opType == 'Conv2D' and not self._approxT2:
# correct padding for numerical stability (we
# divided out OhxOw from activations for T1 approx)
fpropFactor = tf.concat([fpropFactor, tf.ones(
[tf.shape(fpropFactor)[0], 1]) / Oh / Ow], 1)
else:
# use homogeneous coordinates
fpropFactor = tf.concat(
[fpropFactor, tf.ones([tf.shape(fpropFactor)[0], 1])], 1)
# average over the number of data points in a batch
# divided by B
cov = tf.matmul(fpropFactor, fpropFactor,
transpose_a=True) / tf.cast(B, tf.float32)
updateOps.append(cov)
statsUpdates[stats_var] = cov
if opType != 'Conv2D':
# HACK: for convolution we recompute fprop stats for
# every layer including forking layers
statsUpdates_cache[stats_var] = cov
for stats_var in bpropStats_vars:
stats_var_dim = int(stats_var.get_shape()[0])
if stats_var not in statsUpdates_cache:
old_bpropFactor = bpropFactor
bpropFactor_shape = bpropFactor.get_shape()
B = tf.shape(bpropFactor)[0] # batch size
C = int(bpropFactor_shape[-1]) # num channels
if opType == 'Conv2D' or len(bpropFactor_shape) == 4:
if fpropFactor is not None:
if self._approxT2:
if KFAC_DEBUG:
print(('approxT2 grad fisher for %s' % (var.name)))
bpropFactor = tf.reduce_sum(
bpropFactor, [1, 2]) # T^2 terms * 1/T^2
else:
bpropFactor = tf.reshape(
bpropFactor, [-1, C]) * Oh * Ow # T * 1/T terms
else:
# just doing block diag approx. spatial independent
# structure does not apply here. summing over
# spatial locations
if KFAC_DEBUG:
print(('block diag approx fisher for %s' % (var.name)))
bpropFactor = tf.reduce_sum(bpropFactor, [1, 2])
                    # assume the sampled loss is averaged. TO-DO: figure out a better
                    # way to handle this
                    bpropFactor *= tf.to_float(B)
##
cov_b = tf.matmul(
bpropFactor, bpropFactor, transpose_a=True) / tf.to_float(tf.shape(bpropFactor)[0])
updateOps.append(cov_b)
statsUpdates[stats_var] = cov_b
statsUpdates_cache[stats_var] = cov_b
if KFAC_DEBUG:
aKey = list(statsUpdates.keys())[0]
statsUpdates[aKey] = tf.Print(statsUpdates[aKey],
[tf.convert_to_tensor('step:'),
self.global_step,
tf.convert_to_tensor(
'computing stats'),
])
self.statsUpdates = statsUpdates
return statsUpdates
def apply_stats(self, statsUpdates):
""" compute stats and update/apply the new stats to the running average
"""
def updateAccumStats():
if self._full_stats_init:
return tf.cond(tf.greater(self.sgd_step, self._cold_iter), lambda: tf.group(*self._apply_stats(statsUpdates, accumulate=True, accumulateCoeff=1. / self._stats_accum_iter)), tf.no_op)
else:
return tf.group(*self._apply_stats(statsUpdates, accumulate=True, accumulateCoeff=1. / self._stats_accum_iter))
def updateRunningAvgStats(statsUpdates, fac_iter=1):
# return tf.cond(tf.greater_equal(self.factor_step,
# tf.convert_to_tensor(fac_iter)), lambda:
# tf.group(*self._apply_stats(stats_list, varlist)), tf.no_op)
return tf.group(*self._apply_stats(statsUpdates))
if self._async_stats:
# asynchronous stats update
update_stats = self._apply_stats(statsUpdates)
queue = tf.FIFOQueue(1, [item.dtype for item in update_stats], shapes=[
item.get_shape() for item in update_stats])
enqueue_op = queue.enqueue(update_stats)
def dequeue_stats_op():
return queue.dequeue()
self.qr_stats = tf.train.QueueRunner(queue, [enqueue_op])
update_stats_op = tf.cond(tf.equal(queue.size(), tf.convert_to_tensor(
0)), tf.no_op, lambda: tf.group(*[dequeue_stats_op(), ]))
else:
# synchronous stats update
update_stats_op = tf.cond(tf.greater_equal(
self.stats_step, self._stats_accum_iter), lambda: updateRunningAvgStats(statsUpdates), updateAccumStats)
self._update_stats_op = update_stats_op
return update_stats_op
def _apply_stats(self, statsUpdates, accumulate=False, accumulateCoeff=0.):
updateOps = []
# obtain the stats var list
for stats_var in statsUpdates:
stats_new = statsUpdates[stats_var]
if accumulate:
# simple superbatch averaging
update_op = tf.assign_add(
stats_var, accumulateCoeff * stats_new, use_locking=True)
else:
# exponential running averaging
update_op = tf.assign(
stats_var, stats_var * self._stats_decay, use_locking=True)
update_op = tf.assign_add(
update_op, (1. - self._stats_decay) * stats_new, use_locking=True)
updateOps.append(update_op)
with tf.control_dependencies(updateOps):
stats_step_op = tf.assign_add(self.stats_step, 1)
if KFAC_DEBUG:
stats_step_op = (tf.Print(stats_step_op,
[tf.convert_to_tensor('step:'),
self.global_step,
tf.convert_to_tensor('fac step:'),
self.factor_step,
tf.convert_to_tensor('sgd step:'),
self.sgd_step,
tf.convert_to_tensor('Accum:'),
tf.convert_to_tensor(accumulate),
tf.convert_to_tensor('Accum coeff:'),
tf.convert_to_tensor(accumulateCoeff),
tf.convert_to_tensor('stat step:'),
self.stats_step, updateOps[0], updateOps[1]]))
return [stats_step_op, ]
def getStatsEigen(self, stats=None):
if len(self.stats_eigen) == 0:
stats_eigen = {}
if stats is None:
stats = self.stats
tmpEigenCache = {}
with tf.device('/cpu:0'):
for var in stats:
for key in ['fprop_concat_stats', 'bprop_concat_stats']:
for stats_var in stats[var][key]:
if stats_var not in tmpEigenCache:
stats_dim = stats_var.get_shape()[1].value
e = tf.Variable(tf.ones(
[stats_dim]), name='KFAC_FAC/' + stats_var.name.split(':')[0] + '/e', trainable=False)
Q = tf.Variable(tf.diag(tf.ones(
[stats_dim])), name='KFAC_FAC/' + stats_var.name.split(':')[0] + '/Q', trainable=False)
stats_eigen[stats_var] = {'e': e, 'Q': Q}
tmpEigenCache[
stats_var] = stats_eigen[stats_var]
else:
stats_eigen[stats_var] = tmpEigenCache[
stats_var]
self.stats_eigen = stats_eigen
return self.stats_eigen
def computeStatsEigen(self):
""" compute the eigen decomp using copied var stats to avoid concurrent read/write from other queue """
# TO-DO: figure out why this op has delays (possibly moving
# eigenvectors around?)
with tf.device('/cpu:0'):
def removeNone(tensor_list):
local_list = []
for item in tensor_list:
if item is not None:
local_list.append(item)
return local_list
def copyStats(var_list):
print("copying stats to buffer tensors before eigen decomp")
redundant_stats = {}
copied_list = []
for item in var_list:
if item is not None:
if item not in redundant_stats:
if self._use_float64:
redundant_stats[item] = tf.cast(
tf.identity(item), tf.float64)
else:
redundant_stats[item] = tf.identity(item)
copied_list.append(redundant_stats[item])
else:
copied_list.append(None)
return copied_list
#stats = [copyStats(self.fStats), copyStats(self.bStats)]
#stats = [self.fStats, self.bStats]
stats_eigen = self.stats_eigen
computedEigen = {}
eigen_reverse_lookup = {}
updateOps = []
# sync copied stats
# with tf.control_dependencies(removeNone(stats[0]) +
# removeNone(stats[1])):
with tf.control_dependencies([]):
for stats_var in stats_eigen:
if stats_var not in computedEigen:
eigens = tf.self_adjoint_eig(stats_var)
e = eigens[0]
Q = eigens[1]
if self._use_float64:
e = tf.cast(e, tf.float32)
Q = tf.cast(Q, tf.float32)
updateOps.append(e)
updateOps.append(Q)
computedEigen[stats_var] = {'e': e, 'Q': Q}
eigen_reverse_lookup[e] = stats_eigen[stats_var]['e']
eigen_reverse_lookup[Q] = stats_eigen[stats_var]['Q']
self.eigen_reverse_lookup = eigen_reverse_lookup
self.eigen_update_list = updateOps
if KFAC_DEBUG:
self.eigen_update_list = [item for item in updateOps]
with tf.control_dependencies(updateOps):
updateOps.append(tf.Print(tf.constant(
0.), [tf.convert_to_tensor('computed factor eigen')]))
return updateOps
def applyStatsEigen(self, eigen_list):
updateOps = []
print('updating %d eigenvalue/vectors' % len(eigen_list))
for i, (tensor, mark) in enumerate(zip(eigen_list, self.eigen_update_list)):
stats_eigen_var = self.eigen_reverse_lookup[mark]
updateOps.append(
tf.assign(stats_eigen_var, tensor, use_locking=True))
with tf.control_dependencies(updateOps):
factor_step_op = tf.assign_add(self.factor_step, 1)
updateOps.append(factor_step_op)
if KFAC_DEBUG:
updateOps.append(tf.Print(tf.constant(
0.), [tf.convert_to_tensor('updated kfac factors')]))
return updateOps
def getKfacPrecondUpdates(self, gradlist, varlist):
updatelist = []
vg = 0.
assert len(self.stats) > 0
assert len(self.stats_eigen) > 0
assert len(self.factors) > 0
counter = 0
grad_dict = {var: grad for grad, var in zip(gradlist, varlist)}
for grad, var in zip(gradlist, varlist):
GRAD_RESHAPE = False
GRAD_TRANSPOSE = False
fpropFactoredFishers = self.stats[var]['fprop_concat_stats']
bpropFactoredFishers = self.stats[var]['bprop_concat_stats']
if (len(fpropFactoredFishers) + len(bpropFactoredFishers)) > 0:
counter += 1
GRAD_SHAPE = grad.get_shape()
if len(grad.get_shape()) > 2:
# reshape conv kernel parameters
KW = int(grad.get_shape()[0])
KH = int(grad.get_shape()[1])
C = int(grad.get_shape()[2])
D = int(grad.get_shape()[3])
if len(fpropFactoredFishers) > 1 and self._channel_fac:
# reshape conv kernel parameters into tensor
grad = tf.reshape(grad, [KW * KH, C, D])
else:
# reshape conv kernel parameters into 2D grad
grad = tf.reshape(grad, [-1, D])
GRAD_RESHAPE = True
elif len(grad.get_shape()) == 1:
# reshape bias or 1D parameters
D = int(grad.get_shape()[0])
grad = tf.expand_dims(grad, 0)
GRAD_RESHAPE = True
else:
# 2D parameters
C = int(grad.get_shape()[0])
D = int(grad.get_shape()[1])
if (self.stats[var]['assnBias'] is not None) and not self._blockdiag_bias:
# use homogeneous coordinates only works for 2D grad.
# TO-DO: figure out how to factorize bias grad
# stack bias grad
var_assnBias = self.stats[var]['assnBias']
grad = tf.concat(
[grad, tf.expand_dims(grad_dict[var_assnBias], 0)], 0)
# project gradient to eigen space and reshape the eigenvalues
# for broadcasting
eigVals = []
for idx, stats in enumerate(self.stats[var]['fprop_concat_stats']):
Q = self.stats_eigen[stats]['Q']
e = detectMinVal(self.stats_eigen[stats][
'e'], var, name='act', debug=KFAC_DEBUG)
Q, e = factorReshape(Q, e, grad, facIndx=idx, ftype='act')
eigVals.append(e)
grad = gmatmul(Q, grad, transpose_a=True, reduce_dim=idx)
for idx, stats in enumerate(self.stats[var]['bprop_concat_stats']):
Q = self.stats_eigen[stats]['Q']
e = detectMinVal(self.stats_eigen[stats][
'e'], var, name='grad', debug=KFAC_DEBUG)
Q, e = factorReshape(Q, e, grad, facIndx=idx, ftype='grad')
eigVals.append(e)
grad = gmatmul(grad, Q, transpose_b=False, reduce_dim=idx)
##
#####
# whiten using eigenvalues
weightDecayCoeff = 0.
if var in self._weight_decay_dict:
weightDecayCoeff = self._weight_decay_dict[var]
if KFAC_DEBUG:
print('weight decay coeff for %s is %f' % (var.name, weightDecayCoeff))
if self._factored_damping:
if KFAC_DEBUG:
print('use factored damping for %s' % (var.name))
coeffs = 1.
num_factors = len(eigVals)
# split the damping across the factors in proportion to the trace norms
# of the left and right K-FAC factor matrices (and the generalization of
# that ratio when there are more than two factors)
if len(eigVals) == 1:
damping = self._epsilon + weightDecayCoeff
else:
damping = tf.pow(
self._epsilon + weightDecayCoeff, 1. / num_factors)
eigVals_tnorm_avg = [tf.reduce_mean(
tf.abs(e)) for e in eigVals]
for e, e_tnorm in zip(eigVals, eigVals_tnorm_avg):
eig_tnorm_negList = [
item for item in eigVals_tnorm_avg if item != e_tnorm]
if len(eigVals) == 1:
adjustment = 1.
elif len(eigVals) == 2:
adjustment = tf.sqrt(
e_tnorm / eig_tnorm_negList[0])
else:
# Python 3: assumes `from functools import reduce` at the top of this file
eig_tnorm_negList_prod = reduce(
lambda x, y: x * y, eig_tnorm_negList)
adjustment = tf.pow(
tf.pow(e_tnorm, num_factors - 1.) / eig_tnorm_negList_prod, 1. / num_factors)
coeffs *= (e + adjustment * damping)
else:
coeffs = 1.
damping = (self._epsilon + weightDecayCoeff)
for e in eigVals:
coeffs *= e
coeffs += damping
#grad = tf.Print(grad, [tf.convert_to_tensor('1'), tf.convert_to_tensor(var.name), grad.get_shape()])
grad /= coeffs
#grad = tf.Print(grad, [tf.convert_to_tensor('2'), tf.convert_to_tensor(var.name), grad.get_shape()])
#####
# project gradient back to euclidean space
for idx, stats in enumerate(self.stats[var]['fprop_concat_stats']):
Q = self.stats_eigen[stats]['Q']
grad = gmatmul(Q, grad, transpose_a=False, reduce_dim=idx)
for idx, stats in enumerate(self.stats[var]['bprop_concat_stats']):
Q = self.stats_eigen[stats]['Q']
grad = gmatmul(grad, Q, transpose_b=True, reduce_dim=idx)
##
#grad = tf.Print(grad, [tf.convert_to_tensor('3'), tf.convert_to_tensor(var.name), grad.get_shape()])
if (self.stats[var]['assnBias'] is not None) and not self._blockdiag_bias:
# use homogeneous coordinates only works for 2D grad.
# TO-DO: figure out how to factorize bias grad
# un-stack bias grad
var_assnBias = self.stats[var]['assnBias']
C_plus_one = int(grad.get_shape()[0])
grad_assnBias = tf.reshape(tf.slice(grad,
begin=[
C_plus_one - 1, 0],
size=[1, -1]), var_assnBias.get_shape())
grad_assnWeights = tf.slice(grad,
begin=[0, 0],
size=[C_plus_one - 1, -1])
grad_dict[var_assnBias] = grad_assnBias
grad = grad_assnWeights
#grad = tf.Print(grad, [tf.convert_to_tensor('4'), tf.convert_to_tensor(var.name), grad.get_shape()])
if GRAD_RESHAPE:
grad = tf.reshape(grad, GRAD_SHAPE)
grad_dict[var] = grad
print('projecting %d gradient matrices' % counter)
for g, var in zip(gradlist, varlist):
grad = grad_dict[var]
### clipping ###
if KFAC_DEBUG:
print('apply clipping to %s' % (var.name))
# assign the result back so the debug print is actually executed in the graph
grad = tf.Print(grad, [tf.sqrt(tf.reduce_sum(tf.pow(grad, 2)))], "Euclidean norm of new grad")
local_vg = tf.reduce_sum(grad * g * (self._lr * self._lr))
vg += local_vg
# rescale everything
if KFAC_DEBUG:
print('apply vFv clipping')
scaling = tf.minimum(1., tf.sqrt(self._clip_kl / vg))
if KFAC_DEBUG:
scaling = tf.Print(scaling, [tf.convert_to_tensor(
'clip: '), scaling, tf.convert_to_tensor(' vFv: '), vg])
with tf.control_dependencies([tf.assign(self.vFv, vg)]):
updatelist = [grad_dict[var] for var in varlist]
for i, item in enumerate(updatelist):
updatelist[i] = scaling * item
return updatelist
def compute_gradients(self, loss, var_list=None):
varlist = var_list
if varlist is None:
varlist = tf.trainable_variables()
g = tf.gradients(loss, varlist)
return [(a, b) for a, b in zip(g, varlist)]
def apply_gradients_kfac(self, grads):
g, varlist = list(zip(*grads))
if len(self.stats_eigen) == 0:
self.getStatsEigen()
qr = None
# launch eigen-decomp on a queue thread
if self._async:
print('Use async eigen decomp')
# get a list of factor loading tensors
factorOps_dummy = self.computeStatsEigen()
# define a queue for the list of factor loading tensors
queue = tf.FIFOQueue(1, [item.dtype for item in factorOps_dummy], shapes=[
item.get_shape() for item in factorOps_dummy])
enqueue_op = tf.cond(tf.logical_and(tf.equal(tf.mod(self.stats_step, self._kfac_update), tf.convert_to_tensor(
0)), tf.greater_equal(self.stats_step, self._stats_accum_iter)), lambda: queue.enqueue(self.computeStatsEigen()), tf.no_op)
def dequeue_op():
return queue.dequeue()
qr = tf.train.QueueRunner(queue, [enqueue_op])
updateOps = []
global_step_op = tf.assign_add(self.global_step, 1)
updateOps.append(global_step_op)
with tf.control_dependencies([global_step_op]):
# compute updates
assert self._update_stats_op is not None
updateOps.append(self._update_stats_op)
dependency_list = []
if not self._async:
dependency_list.append(self._update_stats_op)
with tf.control_dependencies(dependency_list):
def no_op_wrapper():
return tf.group(*[tf.assign_add(self.cold_step, 1)])
if not self._async:
# synchronous eigen-decomp updates
updateFactorOps = tf.cond(tf.logical_and(tf.equal(tf.mod(self.stats_step, self._kfac_update),
tf.convert_to_tensor(0)),
tf.greater_equal(self.stats_step, self._stats_accum_iter)), lambda: tf.group(*self.applyStatsEigen(self.computeStatsEigen())), no_op_wrapper)
else:
# asynchronous eigen-decomp updates using queue
updateFactorOps = tf.cond(tf.greater_equal(self.stats_step, self._stats_accum_iter),
lambda: tf.cond(tf.equal(queue.size(), tf.convert_to_tensor(0)),
tf.no_op,
lambda: tf.group(
*self.applyStatsEigen(dequeue_op())),
),
no_op_wrapper)
updateOps.append(updateFactorOps)
with tf.control_dependencies([updateFactorOps]):
def gradOp():
return list(g)
def getKfacGradOp():
return self.getKfacPrecondUpdates(g, varlist)
u = tf.cond(tf.greater(self.factor_step,
tf.convert_to_tensor(0)), getKfacGradOp, gradOp)
optim = tf.train.MomentumOptimizer(
self._lr * (1. - self._momentum), self._momentum)
#optim = tf.train.AdamOptimizer(self._lr, epsilon=0.01)
def optimOp():
def updateOptimOp():
if self._full_stats_init:
return tf.cond(tf.greater(self.factor_step, tf.convert_to_tensor(0)), lambda: optim.apply_gradients(list(zip(u, varlist))), tf.no_op)
else:
return optim.apply_gradients(list(zip(u, varlist)))
if self._full_stats_init:
return tf.cond(tf.greater_equal(self.stats_step, self._stats_accum_iter), updateOptimOp, tf.no_op)
else:
return tf.cond(tf.greater_equal(self.sgd_step, self._cold_iter), updateOptimOp, tf.no_op)
updateOps.append(optimOp())
return tf.group(*updateOps), qr
def apply_gradients(self, grads):
coldOptim = tf.train.MomentumOptimizer(
self._cold_lr, self._momentum)
def coldSGDstart():
sgd_grads, sgd_var = zip(*grads)
if self.max_grad_norm is not None:
sgd_grads, sgd_grad_norm = tf.clip_by_global_norm(sgd_grads, self.max_grad_norm)
sgd_grads = list(zip(sgd_grads, sgd_var))
sgd_step_op = tf.assign_add(self.sgd_step, 1)
coldOptim_op = coldOptim.apply_gradients(sgd_grads)
if KFAC_DEBUG:
with tf.control_dependencies([sgd_step_op, coldOptim_op]):
sgd_step_op = tf.Print(
sgd_step_op, [self.sgd_step, tf.convert_to_tensor('doing cold sgd step')])
return tf.group(*[sgd_step_op, coldOptim_op])
kfacOptim_op, qr = self.apply_gradients_kfac(grads)
def warmKFACstart():
return kfacOptim_op
return tf.cond(tf.greater(self.sgd_step, self._cold_iter), warmKFACstart, coldSGDstart), qr
def minimize(self, loss, loss_sampled, var_list=None):
grads = self.compute_gradients(loss, var_list=var_list)
# compute_and_apply_stats caches its op in self._update_stats_op, which
# apply_gradients_kfac reads, so the returned op is not used directly here
update_stats_op = self.compute_and_apply_stats(
loss_sampled, var_list=var_list)
return self.apply_gradients(grads)
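For orientation, here is a minimal sketch of how this optimizer is driven; it mirrors the call made in `value_functions.py` later in this commit, and the variable names (`loss`, `loss_sampled`, `wd_dict`, `var_list`) are illustrative, not part of the file above:

```python
# Sketch only -- hyperparameters copied from value_functions.py below.
# Note: `async` as a keyword argument predates Python 3.7, where it became reserved.
optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9,
                           clip_kl=0.3, epsilon=0.1, stats_decay=0.95,
                           async=1, kfac_update=2, cold_iter=50,
                           weight_decay_dict=wd_dict, max_grad_norm=None)
# loss drives the gradient; loss_sampled drives the Fisher statistics
update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=var_list)
# with async=1, start the returned QueueRunner on the session, e.g.
# q_runner.create_threads(sess, start=True), so eigendecompositions run off-thread
```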

View File

@@ -0,0 +1,124 @@
import tensorflow as tf
import numpy as np
def gmatmul(a, b, transpose_a=False, transpose_b=False, reduce_dim=None):
if reduce_dim is None:
# general batch matmul
if len(a.get_shape()) == 3 and len(b.get_shape()) == 3:
# tf.batch_matmul was removed in TF 1.0; tf.matmul handles batched 3-D inputs
return tf.matmul(a, b, adjoint_a=transpose_a, adjoint_b=transpose_b)
elif len(a.get_shape()) == 3 and len(b.get_shape()) == 2:
if transpose_b:
N = b.get_shape()[0].value
else:
N = b.get_shape()[1].value
B = a.get_shape()[0].value
if transpose_a:
K = a.get_shape()[1].value
a = tf.reshape(tf.transpose(a, [0, 2, 1]), [-1, K])
else:
K = a.get_shape()[-1].value
a = tf.reshape(a, [-1, K])
result = tf.matmul(a, b, transpose_b=transpose_b)
result = tf.reshape(result, [B, -1, N])
return result
elif len(a.get_shape()) == 2 and len(b.get_shape()) == 3:
if transpose_a:
M = a.get_shape()[1].value
else:
M = a.get_shape()[0].value
B = b.get_shape()[0].value
if transpose_b:
K = b.get_shape()[-1].value
b = tf.transpose(tf.reshape(b, [-1, K]), [1, 0])
else:
K = b.get_shape()[1].value
b = tf.transpose(tf.reshape(
tf.transpose(b, [0, 2, 1]), [-1, K]), [1, 0])
result = tf.matmul(a, b, transpose_a=transpose_a)
result = tf.transpose(tf.reshape(result, [M, B, -1]), [1, 0, 2])
return result
else:
return tf.matmul(a, b, transpose_a=transpose_a, transpose_b=transpose_b)
else:
# weird batch matmul
if len(a.get_shape()) == 2 and len(b.get_shape()) > 2:
# reshape reduce_dim to the left most dim in b
b_shape = b.get_shape()
if reduce_dim != 0:
b_dims = list(range(len(b_shape)))
b_dims.remove(reduce_dim)
b_dims.insert(0, reduce_dim)
b = tf.transpose(b, b_dims)
b_t_shape = b.get_shape()
b = tf.reshape(b, [int(b_shape[reduce_dim]), -1])
result = tf.matmul(a, b, transpose_a=transpose_a,
transpose_b=transpose_b)
result = tf.reshape(result, b_t_shape)
if reduce_dim != 0:
b_dims = list(range(len(b_shape)))
b_dims.remove(0)
b_dims.insert(reduce_dim, 0)
result = tf.transpose(result, b_dims)
return result
elif len(a.get_shape()) > 2 and len(b.get_shape()) == 2:
# reshape reduce_dim to the right most dim in a
a_shape = a.get_shape()
outer_dim = len(a_shape) - 1
reduce_dim = len(a_shape) - reduce_dim - 1
if reduce_dim != outer_dim:
a_dims = list(range(len(a_shape)))
a_dims.remove(reduce_dim)
a_dims.insert(outer_dim, reduce_dim)
a = tf.transpose(a, a_dims)
a_t_shape = a.get_shape()
a = tf.reshape(a, [-1, int(a_shape[reduce_dim])])
result = tf.matmul(a, b, transpose_a=transpose_a,
transpose_b=transpose_b)
result = tf.reshape(result, a_t_shape)
if reduce_dim != outer_dim:
a_dims = list(range(len(a_shape)))
a_dims.remove(outer_dim)
a_dims.insert(reduce_dim, outer_dim)
result = tf.transpose(result, a_dims)
return result
elif len(a.get_shape()) == 2 and len(b.get_shape()) == 2:
return tf.matmul(a, b, transpose_a=transpose_a, transpose_b=transpose_b)
assert False, 'something went wrong'
def clipoutNeg(vec, threshold=1e-6):
mask = tf.cast(vec > threshold, tf.float32)
return mask * vec
def detectMinVal(input_mat, var, threshold=1e-6, name='', debug=False):
eigen_min = tf.reduce_min(input_mat)
eigen_max = tf.reduce_max(input_mat)
eigen_ratio = eigen_max / eigen_min
input_mat_clipped = clipoutNeg(input_mat, threshold)
if debug:
input_mat_clipped = tf.cond(tf.logical_or(tf.greater(eigen_ratio, 0.), tf.less(eigen_ratio, -500)), lambda: input_mat_clipped, lambda: tf.Print(
input_mat_clipped, [tf.convert_to_tensor('screwed ratio ' + name + ' eigen values!!!'), tf.convert_to_tensor(var.name), eigen_min, eigen_max, eigen_ratio]))
return input_mat_clipped
def factorReshape(Q, e, grad, facIndx=0, ftype='act'):
grad_shape = grad.get_shape()
if ftype == 'act':
assert e.get_shape()[0] == grad_shape[facIndx]
expanded_shape = [1, ] * len(grad_shape)
expanded_shape[facIndx] = -1
e = tf.reshape(e, expanded_shape)
if ftype == 'grad':
assert e.get_shape()[0] == grad_shape[len(grad_shape) - facIndx - 1]
expanded_shape = [1, ] * len(grad_shape)
expanded_shape[len(grad_shape) - facIndx - 1] = -1
e = tf.reshape(e, expanded_shape)
return Q, e
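To make the `reduce_dim` contraction concrete, here is a shape-level sketch; it assumes TF 1.x and that this module is importable as `baselines.acktr.kfac_utils`, and the tensor values are placeholders for real K-FAC factors:

```python
# Illustrative: gmatmul with reduce_dim contracts a 2-D factor against one
# axis of a higher-rank tensor, as done in getKfacPrecondUpdates above.
import tensorflow as tf
from baselines.acktr.kfac_utils import gmatmul

Q = tf.ones([5, 5])      # a [C, C] eigenbasis
g = tf.ones([9, 5, 7])   # e.g. a [KW*KH, C, D] conv-kernel gradient
out = gmatmul(Q, g, transpose_a=True, reduce_dim=1)
print(out.get_shape())   # (9, 5, 7): axis 1 is contracted, shape preserved
```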

View File

@@ -0,0 +1,80 @@
import numpy as np
import tensorflow as tf
from baselines.acktr.utils import conv, fc, dense, conv_to_fc, sample, kl_div
from baselines.common.distributions import make_pdtype
import baselines.common.tf_util as U
import gym
class CnnPolicy(object):
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
nbatch = nenv*nsteps
nh, nw, nc = ob_space.shape
ob_shape = (nbatch, nh, nw, nc*nstack)
nact = ac_space.n
X = tf.placeholder(tf.uint8, ob_shape) #obs
with tf.variable_scope("model", reuse=reuse):
h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
h3 = conv(h2, 'c3', nf=32, rf=3, stride=1, init_scale=np.sqrt(2))
h3 = conv_to_fc(h3)
h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
pi = fc(h4, 'pi', nact, act=lambda x:x)
vf = fc(h4, 'v', 1, act=lambda x:x)
v0 = vf[:, 0]
a0 = sample(pi)
self.initial_state = [] #not stateful
def step(ob, *_args, **_kwargs):
a, v = sess.run([a0, v0], {X:ob})
return a, v, [] #dummy state
def value(ob, *_args, **_kwargs):
return sess.run(v0, {X:ob})
self.X = X
self.pi = pi
self.vf = vf
self.step = step
self.value = value
class GaussianMlpPolicy(object):
def __init__(self, ob_dim, ac_dim):
# Here we'll construct a bunch of expressions, which will be used in two places:
# (1) When sampling actions
# (2) When computing loss functions, for the policy update
# Variables specific to (1) have the word "sampled" in them,
# whereas variables specific to (2) have the word "old" in them
ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim*2], name="ob") # batch of observations
oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac") # batch of previous actions
oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim*2], name="oldac_dist") # batch of previous action distributions (mean and std concatenated)
adv_n = tf.placeholder(tf.float32, shape=[None], name="adv") # advantage function estimate
oldlogprob_n = tf.placeholder(tf.float32, shape=[None], name='oldlogprob') # log probability of previous actions
wd_dict = {}
h1 = tf.nn.tanh(dense(ob_no, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
h2 = tf.nn.tanh(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
mean_na = dense(h2, ac_dim, "mean", weight_init=U.normc_initializer(0.1), bias_init=0.0, weight_loss_dict=wd_dict) # Mean control output
self.wd_dict = wd_dict
self.logstd_1a = logstd_1a = tf.get_variable("logstd", [ac_dim], tf.float32, tf.zeros_initializer()) # log standard deviation of the action distribution
logstd_1a = tf.expand_dims(logstd_1a, 0)
std_1a = tf.exp(logstd_1a)
std_na = tf.tile(std_1a, [tf.shape(mean_na)[0], 1])
ac_dist = tf.concat([tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(std_na, [-1, ac_dim])], 1)
sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:,ac_dim:])) * ac_dist[:,ac_dim:] + ac_dist[:,:ac_dim] # This is the sampled action we'll perform.
logprobsampled_n = - U.sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * U.sum(tf.square(ac_dist[:,:ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of sampled action
logprob_n = - U.sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * U.sum(tf.square(ac_dist[:,:ac_dim] - oldac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy)
kl = U.mean(kl_div(oldac_dist, ac_dist, ac_dim))
#kl = .5 * U.mean(tf.square(logprob_n - oldlogprob_n)) # Approximation of KL divergence between old policy used to generate actions, and new policy used to compute logprob_n
surr = - U.mean(adv_n * logprob_n) # Loss function that we'll differentiate to get the policy gradient
surr_sampled = - U.mean(logprob_n) # Sampled loss of the policy
self._act = U.function([ob_no], [sampled_ac_na, ac_dist, logprobsampled_n]) # Generate a new action and its logprob
#self.compute_kl = U.function([ob_no, oldac_na, oldlogprob_n], kl) # Compute (approximate) KL divergence between old policy and new policy
self.compute_kl = U.function([ob_no, oldac_dist], kl)
self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled) # Input and output variables needed for computing loss
U.initialize() # Initialize uninitialized TF variables
def act(self, ob):
ac, ac_dist, logp = self._act(ob[None])
return ac[0], ac_dist[0], logp[0]
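As a quick sketch of how these policy objects are consumed by a rollout loop (the names `policy` and `obs` are illustrative):

```python
# `policy` is a CnnPolicy instance; `obs` is a uint8 batch of stacked frames
# with shape (nenv, nh, nw, nc*nstack), matching ob_shape above.
actions, values, _ = policy.step(obs)   # sampled actions plus value estimates
last_values = policy.value(obs)         # bootstrap values only, no sampling
```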

View File

@@ -0,0 +1,36 @@
#!/usr/bin/env python
import os, logging, gym
from baselines import logger
from baselines.common import set_global_seeds
from baselines import bench
from baselines.acktr.acktr_disc import learn
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
from baselines.common.atari_wrappers import wrap_deepmind
from baselines.acktr.policies import CnnPolicy
def train(env_id, num_timesteps, seed, num_cpu):
num_timesteps //= 4
def make_env(rank):
def _thunk():
env = gym.make(env_id)
env.seed(seed + rank)
if logger.get_dir():
env = bench.Monitor(env, os.path.join(logger.get_dir(), "{}.monitor.json".format(rank)))
gym.logger.setLevel(logging.WARN)
return wrap_deepmind(env)
return _thunk
set_global_seeds(seed)
env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
policy_fn = CnnPolicy
learn(policy_fn, env, seed, total_timesteps=num_timesteps, nprocs=num_cpu)
env.close()
def main():
train('BreakoutNoFrameskip-v4', num_timesteps=int(40e6), seed=0, num_cpu=32)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,42 @@
#!/usr/bin/env python
import argparse
import logging
import os
import tensorflow as tf
import gym
from baselines import logger
from baselines.common import set_global_seeds
from baselines import bench
from baselines.acktr.acktr_cont import learn
from baselines.acktr.policies import GaussianMlpPolicy
from baselines.acktr.value_functions import NeuralNetValueFunction
def train(env_id, num_timesteps, seed):
env = gym.make(env_id)
if logger.get_dir():
env = bench.Monitor(env, os.path.join(logger.get_dir(), "monitor.json"))
set_global_seeds(seed)
env.seed(seed)
gym.logger.setLevel(logging.WARN)
with tf.Session(config=tf.ConfigProto()) as session:
ob_dim = env.observation_space.shape[0]
ac_dim = env.action_space.shape[0]
with tf.variable_scope("vf"):
vf = NeuralNetValueFunction(ob_dim, ac_dim)
with tf.variable_scope("pi"):
policy = GaussianMlpPolicy(ob_dim, ac_dim)
learn(env, policy=policy, vf=vf,
gamma=0.99, lam=0.97, timesteps_per_batch=2500,
desired_kl=0.002,
num_timesteps=num_timesteps, animate=False)
env.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Run Mujoco benchmark.')
parser.add_argument('--env_id', type=str, default="Reacher-v1")
args = parser.parse_args()
train(args.env_id, num_timesteps=1e6, seed=1)

View File

@@ -0,0 +1,46 @@
import numpy as np
# http://www.johndcook.com/blog/standard_deviation/
class RunningStat(object):
def __init__(self, shape):
self._n = 0
self._M = np.zeros(shape)
self._S = np.zeros(shape)
def push(self, x):
x = np.asarray(x)
assert x.shape == self._M.shape
self._n += 1
if self._n == 1:
self._M[...] = x
else:
oldM = self._M.copy()
self._M[...] = oldM + (x - oldM)/self._n
self._S[...] = self._S + (x - oldM)*(x - self._M)
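# push() implements Welford's online algorithm: M_k = M_{k-1} + (x_k - M_{k-1})/k
# and S_k = S_{k-1} + (x_k - M_{k-1})(x_k - M_k), so var = S_n/(n-1) comes out
# in a single pass without storing samples (see the blog post linked above).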
@property
def n(self):
return self._n
@property
def mean(self):
return self._M
@property
def var(self):
return self._S/(self._n - 1) if self._n > 1 else np.square(self._M)
@property
def std(self):
return np.sqrt(self.var)
@property
def shape(self):
return self._M.shape
def test_running_stat():
for shp in ((), (3,), (3,4)):
li = []
rs = RunningStat(shp)
for _ in range(5):
val = np.random.randn(*shp)
rs.push(val)
li.append(val)
m = np.mean(li, axis=0)
assert np.allclose(rs.mean, m)
v = np.square(m) if (len(li) == 1) else np.var(li, ddof=1, axis=0)
assert np.allclose(rs.var, v)

200
baselines/acktr/utils.py Normal file
View File

@@ -0,0 +1,200 @@
import os
import numpy as np
import tensorflow as tf
import baselines.common.tf_util as U
from collections import deque
def sample(logits):
noise = tf.random_uniform(tf.shape(logits))
return tf.argmax(logits - tf.log(-tf.log(noise)), 1)
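# This is the Gumbel-max trick: adding Gumbel(0, 1) noise -log(-log(U)) to the
# logits and taking the argmax draws an exact sample from the categorical
# distribution defined by softmax(logits).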
def std(x):
mean = tf.reduce_mean(x)
var = tf.reduce_mean(tf.square(x-mean))
return tf.sqrt(var)
def cat_entropy(logits):
a0 = logits - tf.reduce_max(logits, 1, keep_dims=True)
ea0 = tf.exp(a0)
z0 = tf.reduce_sum(ea0, 1, keep_dims=True)
p0 = ea0 / z0
return tf.reduce_sum(p0 * (tf.log(z0) - a0), 1)
def cat_entropy_softmax(p0):
return - tf.reduce_sum(p0 * tf.log(p0 + 1e-6), axis = 1)
def mse(pred, target):
return tf.square(pred-target)/2.
def ortho_init(scale=1.0):
def _ortho_init(shape, dtype, partition_info=None):
#lasagne ortho init for tf
shape = tuple(shape)
if len(shape) == 2:
flat_shape = shape
elif len(shape) == 4: # assumes NHWC
flat_shape = (np.prod(shape[:-1]), shape[-1])
else:
raise NotImplementedError
a = np.random.normal(0.0, 1.0, flat_shape)
u, _, v = np.linalg.svd(a, full_matrices=False)
q = u if u.shape == flat_shape else v # pick the one with the correct shape
q = q.reshape(shape)
return (scale * q[:shape[0], :shape[1]]).astype(np.float32)
return _ortho_init
def conv(x, scope, nf, rf, stride, pad='VALID', act=tf.nn.relu, init_scale=1.0):
with tf.variable_scope(scope):
nin = x.get_shape()[3].value
w = tf.get_variable("w", [rf, rf, nin, nf], initializer=ortho_init(init_scale))
b = tf.get_variable("b", [nf], initializer=tf.constant_initializer(0.0))
z = tf.nn.conv2d(x, w, strides=[1, stride, stride, 1], padding=pad)+b
h = act(z)
return h
def fc(x, scope, nh, act=tf.nn.relu, init_scale=1.0):
with tf.variable_scope(scope):
nin = x.get_shape()[1].value
w = tf.get_variable("w", [nin, nh], initializer=ortho_init(init_scale))
b = tf.get_variable("b", [nh], initializer=tf.constant_initializer(0.0))
z = tf.matmul(x, w)+b
h = act(z)
return h
def dense(x, size, name, weight_init=None, bias_init=0, weight_loss_dict=None, reuse=None):
with tf.variable_scope(name, reuse=reuse):
assert (len(U.scope_name().split('/')) == 2)
w = tf.get_variable("w", [x.get_shape()[1], size], initializer=weight_init)
b = tf.get_variable("b", [size], initializer=tf.constant_initializer(bias_init))
weight_decay_fc = 3e-4
if weight_loss_dict is not None:
weight_decay = tf.multiply(tf.nn.l2_loss(w), weight_decay_fc, name='weight_decay_loss')
weight_loss_dict[w] = weight_decay_fc
weight_loss_dict[b] = 0.0
tf.add_to_collection(U.scope_name().split('/')[0] + '_' + 'losses', weight_decay)
return tf.nn.bias_add(tf.matmul(x, w), b)
def conv_to_fc(x):
nh = np.prod([v.value for v in x.get_shape()[1:]])
x = tf.reshape(x, [-1, nh])
return x
def kl_div(action_dist1, action_dist2, action_size):
mean1, std1 = action_dist1[:, :action_size], action_dist1[:, action_size:]
mean2, std2 = action_dist2[:, :action_size], action_dist2[:, action_size:]
numerator = tf.square(mean1 - mean2) + tf.square(std1) - tf.square(std2)
denominator = 2 * tf.square(std2) + 1e-8
return tf.reduce_sum(
numerator/denominator + tf.log(std2) - tf.log(std1),reduction_indices=-1)
def discount_with_dones(rewards, dones, gamma):
discounted = []
r = 0
for reward, done in zip(rewards[::-1], dones[::-1]):
r = reward + gamma*r*(1.-done) # fixed off by one bug
discounted.append(r)
return discounted[::-1]
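# Example: discount_with_dones([1., 1., 1.], [False, True, False], 0.99)
# returns [1.99, 1.0, 1.0] -- the done flag at step 1 zeroes the bootstrap
# term, so rewards never leak across an episode boundary.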
def find_trainable_variables(key):
with tf.variable_scope(key):
return tf.trainable_variables()
def make_path(f):
return os.makedirs(f, exist_ok=True)
def constant(p):
return 1
def linear(p):
return 1-p
def middle_drop(p):
eps = 0.75
if 1-p<eps:
return eps*0.1
return 1-p
def double_linear_con(p):
p *= 2
eps = 0.125
if 1-p<eps:
return eps
return 1-p
def double_middle_drop(p):
eps1 = 0.75
eps2 = 0.25
if 1-p<eps1:
if 1-p<eps2:
return eps2*0.5
return eps1*0.1
return 1-p
schedules = {
'linear':linear,
'constant':constant,
'double_linear_con':double_linear_con,
'middle_drop':middle_drop,
'double_middle_drop':double_middle_drop
}
class Scheduler(object):
def __init__(self, v, nvalues, schedule):
self.n = 0.
self.v = v
self.nvalues = nvalues
self.schedule = schedules[schedule]
def value(self):
current_value = self.v*self.schedule(self.n/self.nvalues)
self.n += 1.
return current_value
def value_steps(self, steps):
return self.v*self.schedule(steps/self.nvalues)
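# Example: Scheduler(v=7e-4, nvalues=80e6, schedule='linear') returns 7e-4 on
# the first value() call and decays linearly toward 0 as the internal counter
# approaches nvalues; 'constant' keeps the value fixed throughout.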
class EpisodeStats:
def __init__(self, nsteps, nenvs):
self.episode_rewards = []
for i in range(nenvs):
self.episode_rewards.append([])
self.lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths
self.rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards
self.nsteps = nsteps
self.nenvs = nenvs
def feed(self, rewards, masks):
rewards = np.reshape(rewards, [self.nenvs, self.nsteps])
masks = np.reshape(masks, [self.nenvs, self.nsteps])
for i in range(0, self.nenvs):
for j in range(0, self.nsteps):
self.episode_rewards[i].append(rewards[i][j])
if masks[i][j]:
l = len(self.episode_rewards[i])
s = sum(self.episode_rewards[i])
self.lenbuffer.append(l)
self.rewbuffer.append(s)
self.episode_rewards[i] = []
def mean_length(self):
if self.lenbuffer:
return np.mean(self.lenbuffer)
else:
return 0 # on the first params dump, no episodes are finished
def mean_reward(self):
if self.rewbuffer:
return np.mean(self.rewbuffer)
else:
return 0

View File

@@ -0,0 +1,50 @@
from baselines import logger
import numpy as np
from baselines import common
from baselines.common import tf_util as U
import tensorflow as tf
from baselines.acktr import kfac
from baselines.acktr.utils import dense
class NeuralNetValueFunction(object):
def __init__(self, ob_dim, ac_dim): #pylint: disable=W0613
X = tf.placeholder(tf.float32, shape=[None, ob_dim*2+ac_dim*2+2]) # batch of observations
vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg')
wd_dict = {}
h1 = tf.nn.elu(dense(X, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
h2 = tf.nn.elu(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
vpred_n = dense(h2, 1, "hfinal", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)[:,0]
sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
wd_loss = tf.get_collection("vf_losses", None)
loss = U.mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
loss_sampled = U.mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))
self._predict = U.function([X], vpred_n)
# note: `async` became a reserved word in Python 3.7; this call targets the
# Python 3.5/3.6 of the time
optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9, \
clip_kl=0.3, epsilon=0.1, stats_decay=0.95, \
async=1, kfac_update=2, cold_iter=50, \
weight_decay_dict=wd_dict, max_grad_norm=None)
vf_var_list = []
for var in tf.trainable_variables():
if "vf" in var.name:
vf_var_list.append(var)
update_op, self.q_runner = optim.minimize(loss, loss_sampled, var_list=vf_var_list)
self.do_update = U.function([X, vtarg_n], update_op) #pylint: disable=E1101
U.initialize() # Initialize uninitialized TF variables
def _preproc(self, path):
l = pathlength(path)
al = np.arange(l).reshape(-1,1)/10.0
act = path["action_dist"].astype('float32')
X = np.concatenate([path['observation'], act, al, np.ones((l, 1))], axis=1)
return X
def predict(self, path):
return self._predict(self._preproc(path))
def fit(self, paths, targvals):
X = np.concatenate([self._preproc(p) for p in paths])
y = np.concatenate(targvals)
logger.record_tabular("EVBefore", common.explained_variance(self._predict(X), y))
for _ in range(25): self.do_update(X, y)
logger.record_tabular("EVAfter", common.explained_variance(self._predict(X), y))
def pathlength(path):
return path["reward"].shape[0]
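A rough sketch of the fit/predict cycle this class expects; the `path` dict keys follow `_preproc` above, and everything else here is illustrative:

```python
# Illustrative only: each `path` is a rollout dict with 'observation',
# 'action_dist' and 'reward' arrays. The "vf" scope matters: the K-FAC
# update only touches variables whose name contains "vf" (see above).
with tf.variable_scope("vf"):
    vf = NeuralNetValueFunction(ob_dim, ac_dim)
vpred = vf.predict(path)     # per-timestep value estimates for one path
vf.fit(paths, targvals)      # 25 K-FAC updates toward the regression targets
```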

View File

@@ -22,6 +22,13 @@ def get_task(benchmark, env_id):
     """Get a task by env_id. Return None if the benchmark doesn't have the env"""
     return next(filter(lambda task: task['env_id'] == env_id, benchmark['tasks']), None)

+def find_task_for_env_id_in_any_benchmark(env_id):
+    for bm in _BENCHMARKS:
+        for task in bm["tasks"]:
+            if task["env_id"]==env_id:
+                return bm, task
+    return None, None
+
 _ATARI_SUFFIX = 'NoFrameskip-v4'

 register_benchmark({
@@ -49,30 +56,61 @@ register_benchmark({
 })

+# MuJoCo
+
 _mujocosmall = [
     'InvertedDoublePendulum-v1', 'InvertedPendulum-v1',
     'HalfCheetah-v1', 'Hopper-v1', 'Walker2d-v1',
     'Reacher-v1', 'Swimmer-v1']

 register_benchmark({
     'name' : 'Mujoco1M',
     'description' : 'Some small 2D MuJoCo tasks, run for 1M timesteps',
     'tasks' : [{'env_id' : _envid, 'trials' : 3, 'num_timesteps' : int(1e6)} for _envid in _mujocosmall]
 })

-_roboschool_mujoco = [
-    'RoboschoolInvertedDoublePendulum-v0', 'RoboschoolInvertedPendulum-v0', # cartpole
-    'RoboschoolHalfCheetah-v0', 'RoboschoolHopper-v0', 'RoboschoolWalker2d-v0', # forward walkers
-    'RoboschoolReacher-v0'
-]
+register_benchmark({
+    'name' : 'MujocoWalkers',
+    'description' : 'MuJoCo forward walkers, run for 8M, humanoid 100M',
+    'tasks' : [
+        {'env_id' : "Hopper-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
+        {'env_id' : "Walker2d-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
+        {'env_id' : "Humanoid-v1", 'trials' : 4, 'num_timesteps' : 100*1000000 },
+    ]
+})
+# To reproduce:
+# python3 baselines/baselines/ppo2/ppo2_run_benchmark.py gce MujocoWalkers myrun_ppo2_whiteobs1_cpu8
+# (observation input filters necessary)
+
+# Roboschool

 register_benchmark({
-    'name' : 'RoboschoolMujoco2M',
-    'description' : 'Same small 2D tasks, still improving up to 2M',
-    'tasks' : [{'env_id' : _envid, 'trials' : 3, 'num_timesteps' : int(2e6)} for _envid in _roboschool_mujoco]
+    'name' : 'Roboschool8M',
+    'description' : 'Small 2D tasks, up to 30 minutes to complete on 8 cores',
+    'tasks' : [
+        {'env_id' : "RoboschoolReacher-v1", 'trials' : 4, 'num_timesteps' : 2*1000000 },
+        {'env_id' : "RoboschoolAnt-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
+        {'env_id' : "RoboschoolHalfCheetah-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
+        {'env_id' : "RoboschoolHopper-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
+        {'env_id' : "RoboschoolWalker2d-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
+    ]
 })
+register_benchmark({
+    'name' : 'RoboschoolHarder',
+    'description' : 'Test your might!!! Up to 12 hours on 32 cores',
+    'tasks' : [
+        {'env_id' : "RoboschoolHumanoid-v1", 'trials' : 4, 'num_timesteps' : 100*1000000 },
+        {'env_id' : "RoboschoolHumanoidFlagrun-v1", 'trials' : 4, 'num_timesteps' : 200*1000000 },
+        {'env_id' : "RoboschoolHumanoidFlagrunHarder-v1", 'trials' : 4, 'num_timesteps' : 400*1000000 },
+    ]
+})
+# To reproduce:
+# python3 baselines/baselines/ppo2/ppo2_run_benchmark.py gce Roboschool8M myrun_ppo2_cpu8
+# python3 baselines/baselines/ppo2/ppo2_run_benchmark.py gce RoboschoolHarder myrun_ppo2_cpu32_large_samples65536
+# (Large network, train on 65536 samples each iteration. Also, _large is really necessary only for Harder)
+
+# Other

 _atari50 = [ # actually 49
     'Alien', 'Amidar', 'Assault', 'Asterix', 'Asteroids',
     'Atlantis', 'BankHeist', 'BattleZone', 'BeamRider', 'Bowling',
@@ -91,3 +129,12 @@ register_benchmark({
     'description' :'7 Atari games from Mnih et al. (2013), with pixel observations, 40M frames',
     'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 3, 'num_timesteps' : int(40e6)} for _game in _atari50]
 })
+
+def env_shortname(s):
+    "Make typical names above shorter, while keeping recognizable"
+    s = s.replace("NoFrameskip", "")
+    if s[:10]=="Roboschool": s = s[10:]
+    i = s.rfind("-v")
+    if i!=-1: s = s[:i]
+    return s.lower()

View File

@@ -117,7 +117,7 @@ class LoadMonitorResultsError(Exception):
 def get_monitor_files(dir):
     return glob(path.join(dir, "*" + Monitor.EXT))

-def load_results(dir):
+def load_results(dir, raw_episodes=False):
     fnames = get_monitor_files(dir)
     if not fnames:
         raise LoadMonitorResultsError("no monitor files of the form *%s found in %s" % (Monitor.EXT, dir))
@@ -137,10 +137,13 @@ def load_results(dir):
     for header in headers[1:]:
         assert header['env_id'] == header0['env_id'], "mixing data from two envs"
     episodes = sorted(episodes, key=lambda e: e['abstime'])
-    return {
-        'env_info': {'env_id': header0['env_id'], 'gym_version': header0['gym_version']},
-        'episode_end_times': [e['abstime'] for e in episodes],
-        'episode_lengths': [e['l'] for e in episodes],
-        'episode_rewards': [e['r'] for e in episodes],
-        'initial_reset_time': min([min(header['t_start'] for header in headers)])
-    }
+    if raw_episodes:
+        return episodes
+    else:
+        return {
+            'env_info': {'env_id': header0['env_id'], 'gym_version': header0['gym_version']},
+            'episode_end_times': [e['abstime'] for e in episodes],
+            'episode_lengths': [e['l'] for e in episodes],
+            'episode_rewards': [e['r'] for e in episodes],
+            'initial_reset_time': min([min(header['t_start'] for header in headers)])
+        }

View File

@@ -108,7 +108,7 @@ class BernoulliPdType(PdType):
 #     def flatparam(self):
 #         return self.logits
 #     def mode(self):
-#         return U.argmax(self.logits, axis=1)
+#         return U.argmax(self.logits, axis=-1)
 #     def logp(self, x):
 #         return -tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, x)
 #     def kl(self, other):
@@ -118,7 +118,7 @@ class BernoulliPdType(PdType):
 #         return tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps)
 #     def sample(self):
 #         u = tf.random_uniform(tf.shape(self.logits))
-#         return U.argmax(self.logits - tf.log(-tf.log(u)), axis=1)
+#         return U.argmax(self.logits - tf.log(-tf.log(u)), axis=-1)

 class CategoricalPd(Pd):
     def __init__(self, logits):
@@ -126,27 +126,33 @@ class CategoricalPd(Pd):
     def flatparam(self):
         return self.logits
     def mode(self):
-        return U.argmax(self.logits, axis=1)
+        return U.argmax(self.logits, axis=-1)
     def neglogp(self, x):
-        return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x)
+        # return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x)
+        # Note: we can't use sparse_softmax_cross_entropy_with_logits because
+        # the implementation does not allow second-order derivatives...
+        one_hot_actions = tf.one_hot(x, self.logits.get_shape().as_list()[-1])
+        return tf.nn.softmax_cross_entropy_with_logits(
+            logits=self.logits,
+            labels=one_hot_actions)
     def kl(self, other):
-        a0 = self.logits - U.max(self.logits, axis=1, keepdims=True)
-        a1 = other.logits - U.max(other.logits, axis=1, keepdims=True)
+        a0 = self.logits - U.max(self.logits, axis=-1, keepdims=True)
+        a1 = other.logits - U.max(other.logits, axis=-1, keepdims=True)
         ea0 = tf.exp(a0)
         ea1 = tf.exp(a1)
-        z0 = U.sum(ea0, axis=1, keepdims=True)
-        z1 = U.sum(ea1, axis=1, keepdims=True)
+        z0 = U.sum(ea0, axis=-1, keepdims=True)
+        z1 = U.sum(ea1, axis=-1, keepdims=True)
         p0 = ea0 / z0
-        return U.sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=1)
+        return U.sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=-1)
     def entropy(self):
-        a0 = self.logits - U.max(self.logits, axis=1, keepdims=True)
+        a0 = self.logits - U.max(self.logits, axis=-1, keepdims=True)
         ea0 = tf.exp(a0)
-        z0 = U.sum(ea0, axis=1, keepdims=True)
+        z0 = U.sum(ea0, axis=-1, keepdims=True)
         p0 = ea0 / z0
-        return U.sum(p0 * (tf.log(z0) - a0), axis=1)
+        return U.sum(p0 * (tf.log(z0) - a0), axis=-1)
     def sample(self):
         u = tf.random_uniform(tf.shape(self.logits))
-        return tf.argmax(self.logits - tf.log(-tf.log(u)), axis=1)
+        return tf.argmax(self.logits - tf.log(-tf.log(u)), axis=-1)
     @classmethod
     def fromflat(cls, flat):
         return cls(flat)
@@ -177,7 +183,7 @@ class MultiCategoricalPd(Pd):
 class DiagGaussianPd(Pd):
     def __init__(self, flat):
         self.flat = flat
-        mean, logstd = tf.split(axis=len(flat.get_shape()) - 1, num_or_size_splits=2, value=flat)
+        mean, logstd = tf.split(axis=len(flat.shape)-1, num_or_size_splits=2, value=flat)
         self.mean = mean
         self.logstd = logstd
         self.std = tf.exp(logstd)
@@ -186,14 +192,14 @@ class DiagGaussianPd(Pd):
     def mode(self):
         return self.mean
     def neglogp(self, x):
-        return 0.5 * U.sum(tf.square((x - self.mean) / self.std), axis=len(x.get_shape()) - 1) \
+        return 0.5 * U.sum(tf.square((x - self.mean) / self.std), axis=-1) \
                + 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[-1]) \
-               + U.sum(self.logstd, axis=len(x.get_shape()) - 1)
+               + U.sum(self.logstd, axis=-1)
     def kl(self, other):
         assert isinstance(other, DiagGaussianPd)
         return U.sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / (2.0 * tf.square(other.std)) - 0.5, axis=-1)
     def entropy(self):
-        return U.sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), -1)
+        return U.sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), axis=-1)
     def sample(self):
         return self.mean + self.std * tf.random_normal(tf.shape(self.mean))
     @classmethod
@@ -209,11 +215,11 @@ class BernoulliPd(Pd):
     def mode(self):
         return tf.round(self.ps)
     def neglogp(self, x):
-        return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=1)
+        return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=-1)
     def kl(self, other):
-        return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=1) - U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=1)
+        return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=-1) - U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1)
     def entropy(self):
-        return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=1)
+        return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1)
     def sample(self):
         u = tf.random_uniform(tf.shape(self.ps))
         return tf.to_float(math_ops.less(u, self.ps))
@@ -286,4 +292,3 @@ def validate_probtype(probtype, pdparam):
     klval_ll = - entval - logliks.mean() #pylint: disable=E1101
     klval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101
     assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr # within 3 sigmas
-

View File

@@ -0,0 +1,19 @@
class VecEnv(object):
"""
Vectorized environment base class
"""
def step(self, vac):
"""
Apply sequence of actions to sequence of environments
actions -> (observations, rewards, news)
where 'news' is a boolean vector indicating whether each element is new.
"""
raise NotImplementedError
def reset(self):
"""
Reset all environments
"""
raise NotImplementedError
def close(self):
pass

View File

@@ -0,0 +1,74 @@
import numpy as np
from multiprocessing import Process, Pipe
from baselines.common.vec_env import VecEnv
def worker(remote, env_fn_wrapper):
env = env_fn_wrapper.x()
while True:
cmd, data = remote.recv()
if cmd == 'step':
ob, reward, done, info = env.step(data)
if done:
ob = env.reset()
remote.send((ob, reward, done, info))
elif cmd == 'reset':
ob = env.reset()
remote.send(ob)
elif cmd == 'close':
remote.close()
break
elif cmd == 'get_spaces':
remote.send((env.action_space, env.observation_space))
else:
raise NotImplementedError
class CloudpickleWrapper(object):
"""
Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle)
"""
def __init__(self, x):
self.x = x
def __getstate__(self):
import cloudpickle
return cloudpickle.dumps(self.x)
def __setstate__(self, ob):
import pickle
self.x = pickle.loads(ob)
class SubprocVecEnv(VecEnv):
def __init__(self, env_fns):
"""
envs: list of gym environments to run in subprocesses
"""
nenvs = len(env_fns)
self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)])
self.ps = [Process(target=worker, args=(work_remote, CloudpickleWrapper(env_fn)))
for (work_remote, env_fn) in zip(self.work_remotes, env_fns)]
for p in self.ps:
p.start()
self.remotes[0].send(('get_spaces', None))
self.action_space, self.observation_space = self.remotes[0].recv()
def step(self, actions):
for remote, action in zip(self.remotes, actions):
remote.send(('step', action))
results = [remote.recv() for remote in self.remotes]
obs, rews, dones, infos = zip(*results)
return np.stack(obs), np.stack(rews), np.stack(dones), infos
def reset(self):
for remote in self.remotes:
remote.send(('reset', None))
return np.stack([remote.recv() for remote in self.remotes])
def close(self):
for remote in self.remotes:
remote.send(('close', None))
for p in self.ps:
p.join()
@property
def num_envs(self):
return len(self.remotes)
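Usage in brief (a sketch; `make_env` returns environment thunks as in `run_atari.py` above):

```python
# Step 4 environments in lockstep; results are stacked along the env axis.
env = SubprocVecEnv([make_env(i) for i in range(4)])
obs = env.reset()                              # shape: (4,) + single-env obs shape
actions = [env.action_space.sample() for _ in range(env.num_envs)]
obs, rews, dones, infos = env.step(actions)    # one action per environment
env.close()
```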

View File

@@ -15,7 +15,7 @@ python -m baselines.deepq.experiments.enjoy_cartpole
 ```

-Be sure to check out the source code of [both](experiments/train_cartpole.py) [files](experiments/enjoy_cartpole.py)!
+Be sure to check out the source code of [both](baselines/deepq/experiments/train_cartpole.py) [files](baselines/deepq/experiments/enjoy_cartpole.py)!

 ## If you wish to apply DQN to solve a problem.
@@ -49,4 +49,4 @@ Once you pick a model, you can download it and visualize the learned policy. Be
 python -m baselines.deepq.experiments.atari.download_model --blob model-atari-duel-pong-1 --model-dir /tmp/models
 python -m baselines.deepq.experiments.atari.enjoy --model-dir /tmp/models/model-atari-duel-pong-1 --env Pong --dueling
 ```

View File

@@ -43,7 +43,6 @@ def parse_args():
     parser.add_argument("--target-update-freq", type=int, default=40000, help="number of iterations between every target network update")
     parser.add_argument("--param-noise-update-freq", type=int, default=50, help="number of iterations between every re-scaling of the parameter noise")
     parser.add_argument("--param-noise-reset-freq", type=int, default=10000, help="maximum number of steps to take per episode before re-perturbing the exploration policy")
-    parser.add_argument("--param-noise-threshold", type=float, default=0.05, help="the desired KL divergence between perturbed and non-perturbed policy. set to < 0 to use a KL divergence relative to the eps-greedy exploration")
     # Bells and whistles
     boolean_flag(parser, "double-q", default=True, help="whether or not to use double q learning")
     boolean_flag(parser, "dueling", default=False, help="whether or not to use dueling model")
@@ -202,14 +201,11 @@ if __name__ == '__main__':
                 reset = True
             update_eps = 0.01  # ensures that we cannot get stuck completely
-            if args.param_noise_threshold >= 0.:
-                update_param_noise_threshold = args.param_noise_threshold
-            else:
-                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
-                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
-                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
-                # for detailed explanation.
-                update_param_noise_threshold = -np.log(1. - exploration.value(num_iters) + exploration.value(num_iters) / float(env.action_space.n))
+            # Compute the threshold such that the KL divergence between perturbed and non-perturbed
+            # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
+            # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
+            # for detailed explanation.
+            update_param_noise_threshold = -np.log(1. - exploration.value(num_iters) + exploration.value(num_iters) / float(env.action_space.n))
             kwargs['reset'] = reset
             kwargs['update_param_noise_threshold'] = update_param_noise_threshold
             kwargs['update_param_noise_scale'] = (num_iters % args.param_noise_update_freq == 0)

View File

@@ -95,7 +95,6 @@ def learn(env,
           prioritized_replay_eps=1e-6,
           num_cpu=16,
           param_noise=False,
-          param_noise_threshold=0.05,
           callback=None):
     """Train a deepq model.
@@ -225,14 +224,11 @@ def learn(env,
                 update_param_noise_threshold = 0.
             else:
                 update_eps = 0.
-                if param_noise_threshold >= 0.:
-                    update_param_noise_threshold = param_noise_threshold
-                else:
-                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
-                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
-                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
-                    # for detailed explanation.
-                    update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n))
+                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
+                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
+                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
+                # for detailed explanation.
+                update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n))
             kwargs['reset'] = reset
             kwargs['update_param_noise_threshold'] = update_param_noise_threshold
             kwargs['update_param_noise_scale'] = True

View File

@@ -1,13 +1,3 @@
-"""
-
-See README.md for a description of the logging API.
-
-OFF state corresponds to having Logger.CURRENT == Logger.DEFAULT
-ON state is otherwise
-
-"""
-from collections import OrderedDict
-
 import os
 import sys
 import shutil
@@ -17,7 +7,7 @@ import time
 import datetime
 import tempfile

-LOG_OUTPUT_FORMATS = ['stdout', 'log', 'json', 'tensorboard']
+LOG_OUTPUT_FORMATS = ['stdout', 'log', 'json']

 DEBUG = 10
 INFO = 20
@@ -49,9 +39,12 @@ class HumanOutputFormat(OutputFormat):
     def writekvs(self, kvs):
         # Create strings for printing
-        key2str = OrderedDict()
+        key2str = {}
         for (key, val) in kvs.items():
-            valstr = '%-8.3g' % (val,) if hasattr(val, '__float__') else val
+            if isinstance(val, float):
+                valstr = '%-8.3g' % (val,)
+            else:
+                valstr = str(val)
             key2str[self._truncate(key)] = self._truncate(valstr)

         # Find max widths
@@ -61,7 +54,7 @@ class HumanOutputFormat(OutputFormat):
         # Write out the data
         dashes = '-' * (keywidth + valwidth + 7)
         lines = [dashes]
-        for (key, val) in key2str.items():
+        for (key, val) in sorted(key2str.items()):
             lines.append('| %s%s | %s%s |' % (
                 key,
                 ' ' * (keywidth - len(key)),
@@ -150,7 +143,6 @@ def make_output_format(format, ev_dir):
 # API
 # ================================================================

-
 def logkv(key, val):
     """
     Log a value of some diagnostic
@@ -158,6 +150,12 @@ def logkv(key, val):
     """
     Logger.CURRENT.logkv(key, val)

+def logkvs(d):
+    """
+    Log a dictionary of key-value pairs
+    """
+    for (k, v) in d.items():
+        logkv(k, v)

 def dumpkvs():
     """
@@ -168,10 +166,8 @@ def dumpkvs():
     """
     Logger.CURRENT.dumpkvs()

-# for backwards compatibility
-record_tabular = logkv
-dump_tabular = dumpkvs
+def getkvs():
+    return Logger.CURRENT.name2val

 def log(*args, level=INFO):
@@ -203,7 +199,6 @@ def set_level(level):
     """
     Logger.CURRENT.set_level(level)

-
 def get_dir():
     """
     Get directory that log files are being written to.
@@ -211,18 +206,20 @@ def get_dir():
     """
     return Logger.CURRENT.get_dir()

+record_tabular = logkv
+dump_tabular = dumpkvs
+
 # ================================================================
 # Backend
 # ================================================================

 class Logger(object):
     DEFAULT = None  # A logger with no output files. (See right below class definition)
                     # So that you can still log to the terminal without setting up any output files
     CURRENT = None  # Current logger being used by the free functions above

     def __init__(self, dir, output_formats):
-        self.name2val = OrderedDict()  # values this iteration
+        self.name2val = {}  # values this iteration
         self.level = INFO
         self.dir = dir
         self.output_formats = output_formats
@@ -233,6 +230,7 @@ class Logger(object):
         self.name2val[key] = val

     def dumpkvs(self):
+        if self.level == DISABLED: return
         for fmt in self.output_formats:
             fmt.writekvs(self.name2val)
         self.name2val.clear()
@@ -259,57 +257,30 @@ class Logger(object):
         for fmt in self.output_formats:
             fmt.writeseq(args)

+Logger.DEFAULT = Logger.CURRENT = Logger(dir=None, output_formats=[HumanOutputFormat(sys.stdout)])
+
+def configure(dir=None, format_strs=None):
+    assert Logger.CURRENT is Logger.DEFAULT,\
+        "Only call logger.configure() when it's in the default state. Try calling logger.reset() first."
+    prevlogger = Logger.CURRENT
+    if dir is None:
+        dir = os.getenv('OPENAI_LOGDIR')
+    if dir is None:
+        dir = osp.join(tempfile.gettempdir(),
+            datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"))
+    if format_strs is None:
+        format_strs = LOG_OUTPUT_FORMATS
+    output_formats = [make_output_format(f, dir) for f in format_strs]
+    Logger.CURRENT = Logger(dir=dir, output_formats=output_formats)
+    log('Logging to %s'%dir)
+
+def reset():
+    Logger.CURRENT = Logger.DEFAULT
+    log('Reset logger')
+
 # ================================================================

-Logger.DEFAULT = Logger(output_formats=[HumanOutputFormat(sys.stdout)], dir=None)
-Logger.CURRENT = Logger.DEFAULT
-
-class session(object):
-    """
-    Context manager that sets up the loggers for an experiment.
-    """
-    CURRENT = None  # Set to a LoggerContext object using enter/exit or context manager
-
-    def __init__(self, dir=None, format_strs=None):
-        if dir is None:
-            dir = os.getenv('OPENAI_LOGDIR')
-        if dir is None:
-            dir = osp.join(tempfile.gettempdir(),
-                datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"))
-        self.dir = dir
-        if format_strs is None:
-            format_strs = LOG_OUTPUT_FORMATS
-        output_formats = [make_output_format(f, dir) for f in format_strs]
-        Logger.CURRENT = Logger(dir=dir, output_formats=output_formats)
-        print('Logging to', dir)
-
-    def __enter__(self):
-        os.makedirs(self.evaluation_dir(), exist_ok=True)
-        output_formats = [make_output_format(f, self.evaluation_dir())
-                          for f in LOG_OUTPUT_FORMATS]
-        Logger.CURRENT = Logger(dir=self.dir, output_formats=output_formats)
-        os.environ['OPENAI_LOGDIR'] = self.evaluation_dir()
-
-    def __exit__(self, *args):
-        Logger.CURRENT.close()
-        Logger.CURRENT = Logger.DEFAULT
-
-    def evaluation_dir(self):
-        return self.dir
-
-def _setup():
-    logdir = os.getenv('OPENAI_LOGDIR')
-    if logdir:
-        session(logdir).__enter__()
-
-_setup()
-# ================================================================

 def _demo():
     info("hi")
     debug("shouldn't appear")
@@ -319,19 +290,19 @@ def _demo():
     if os.path.exists(dir):
         shutil.rmtree(dir)
     with session(dir=dir):
-        record_tabular("a", 3)
-        record_tabular("b", 2.5)
-        dump_tabular()
-        record_tabular("b", -2.5)
-        record_tabular("a", 5.5)
-        dump_tabular()
+        logkv("a", 3)
+        logkv("b", 2.5)
+        dumpkvs()
+        logkv("b", -2.5)
+        logkv("a", 5.5)
+        dumpkvs()
         info("^^^ should see a = 5.5")
-        record_tabular("b", -2.5)
-        dump_tabular()
-        record_tabular("a", "longasslongasslongasslongasslongasslongassvalue")
-        dump_tabular()
+        logkv("b", -2.5)
+        dumpkvs()
+        logkv("a", "longasslongasslongasslongasslongasslongassvalue")
+        dumpkvs()

 if __name__ == "__main__":
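Putting the reworked logger API together (a minimal sketch; the directory path is illustrative):

```python
from baselines import logger

logger.configure(dir='/tmp/experiment')    # or rely on OPENAI_LOGDIR
logger.logkv('epoch', 1)
logger.logkvs({'loss': 0.25, 'mean_reward': 10.0})
logger.dumpkvs()                           # write the current row and clear it
logger.reset()                             # back to the default stdout logger
```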

View File

@@ -22,7 +22,6 @@ def train(env_id, num_timesteps, seed, num_cpu):
     rank = MPI.COMM_WORLD.Get_rank()
     sess = U.single_threaded_session()
     sess.__enter__()
-    logger.session().__enter__()
     if rank != 0: logger.set_level(logger.DISABLED)
     workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
     set_global_seeds(workerseed)

View File

@@ -9,7 +9,6 @@ import sys
 def train(env_id, num_timesteps, seed):
     from baselines.pposgd import mlp_policy, pposgd_simple
     U.make_session(num_cpu=1).__enter__()
-    logger.session().__enter__()
     set_global_seeds(seed)
     env = gym.make(env_id)
     def policy_fn(name, ob_space, ac_space):

View File

@@ -24,7 +24,6 @@ def train(env_id, num_timesteps, seed, num_cpu):
     rank = MPI.COMM_WORLD.Get_rank()
     sess = U.single_threaded_session()
     sess.__enter__()
-    logger.session().__enter__()
     if rank != 0:
         logger.set_level(logger.DISABLED)

View File

@@ -19,7 +19,6 @@ def train(env_id, num_timesteps, seed):
     if whoami == "parent":
         return
     import baselines.common.tf_util as U
-    logger.session().__enter__()
     sess = U.single_threaded_session()
     sess.__enter__()