ACKTR + A2C
@@ -9,10 +9,14 @@ These algorithms will make it easier for the research community to replicate, re

You can install it by typing:

```bash
pip install baselines
git clone https://github.com/openai/baselines.git
cd baselines
pip install -e .
```

- [A2C](baselines/a2c)
- [ACKTR](baselines/acktr)
- [DDPG](baselines/ddpg)
- [DQN](baselines/deepq)
- [PPO](baselines/pposgd)
- [TRPO](baselines/trpo_mpi)
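For reference, the new A2C trainer added in this commit ships with `baselines/a2c/run_atari.py`; the snippet below is a minimal usage sketch (not part of the commit) that simply calls its `train` helper with the same arguments as its `main()`.

```python
# Minimal usage sketch: drive the new A2C Atari trainer via the train()
# helper defined in baselines/a2c/run_atari.py (added below).
from baselines.a2c.run_atari import train

train('BreakoutNoFrameskip-v4', num_timesteps=int(40e6), seed=0,
      policy='cnn', lrschedule='linear', num_cpu=16)
```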
baselines/a2c/__init__.py (new file, 0 lines)

baselines/a2c/a2c.py (new file, 208 lines)

@@ -0,0 +1,208 @@
```python
import os.path as osp
import gym
import time
import joblib
import logging
import numpy as np
import tensorflow as tf
from baselines import logger

from baselines.common import set_global_seeds, explained_variance
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
from baselines.common.atari_wrappers import wrap_deepmind

from baselines.a2c.utils import discount_with_dones
from baselines.a2c.utils import Scheduler, make_path, find_trainable_variables
from baselines.a2c.policies import CnnPolicy
from baselines.a2c.utils import cat_entropy, mse

class Model(object):

    def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs,
            ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
            alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=num_procs,
                                inter_op_parallelism_threads=num_procs)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        nact = ac_space.n
        nbatch = nenvs*nsteps

        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
        train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr}
            if states != []:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train],
                td_map
            )
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            make_path(save_path)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)

class Runner(object):

    def __init__(self, env, model, nsteps=5, nstack=4, gamma=0.99):
        self.env = env
        self.model = model
        nh, nw, nc = env.observation_space.shape
        nenv = env.num_envs
        self.batch_ob_shape = (nenv*nsteps, nh, nw, nc*nstack)
        self.obs = np.zeros((nenv, nh, nw, nc*nstack), dtype=np.uint8)
        obs = env.reset()
        self.update_obs(obs)
        self.gamma = gamma
        self.nsteps = nsteps
        self.states = model.initial_state
        self.dones = [False for _ in range(nenv)]

    def update_obs(self, obs):
        # Do frame-stacking here instead of the FrameStack wrapper to reduce
        # IPC overhead
        self.obs = np.roll(self.obs, shift=-1, axis=3)
        self.obs[:, :, :, -1] = obs[:, :, :, 0]

    def run(self):
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
        mb_states = self.states
        for n in range(self.nsteps):
            actions, values, states = self.model.step(self.obs, self.states, self.dones)
            mb_obs.append(np.copy(self.obs))
            mb_actions.append(actions)
            mb_values.append(values)
            mb_dones.append(self.dones)
            obs, rewards, dones, _ = self.env.step(actions)
            self.states = states
            self.dones = dones
            for n, done in enumerate(dones):
                if done:
                    self.obs[n] = self.obs[n]*0
            self.update_obs(obs)
            mb_rewards.append(rewards)
        mb_dones.append(self.dones)
        #batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
        mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
        mb_masks = mb_dones[:, :-1]
        mb_dones = mb_dones[:, 1:]
        last_values = self.model.value(self.obs, self.states, self.dones).tolist()
        #discount/bootstrap off value fn
        for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
            rewards = rewards.tolist()
            dones = dones.tolist()
            if dones[-1] == 0:
                rewards = discount_with_dones(rewards+[value], dones+[0], self.gamma)[:-1]
            else:
                rewards = discount_with_dones(rewards, dones, self.gamma)
            mb_rewards[n] = rewards
        mb_rewards = mb_rewards.flatten()
        mb_actions = mb_actions.flatten()
        mb_values = mb_values.flatten()
        mb_masks = mb_masks.flatten()
        return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values

def learn(policy, env, seed, nsteps=5, nstack=4, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100):
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    num_procs = len(env.remotes) # HACK
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, nstack=nstack, num_procs=num_procs, ent_coef=ent_coef, vf_coef=vf_coef,
        max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule)
    runner = Runner(env, model, nsteps=nsteps, nstack=nstack, gamma=gamma)

    nbatch = nenvs*nsteps
    tstart = time.time()
    for update in range(1, total_timesteps//nbatch+1):
        obs, states, rewards, masks, actions, values = runner.run()
        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
        nseconds = time.time()-tstart
        fps = int((update*nbatch)/nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update*nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.dump_tabular()
    env.close()

def main():
    env_id = 'SpaceInvaders'
    seed = 42
    nenvs = 4

    def make_env(rank):
        def env_fn():
            env = gym.make('{}NoFrameskip-v4'.format(env_id))
            env.seed(seed + rank)
            if logger.get_dir():
                from baselines import bench
                env = bench.Monitor(env, osp.join(logger.get_dir(), "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return env_fn

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(nenvs)])
    policy = CnnPolicy
    learn(policy, env, seed)

if __name__ == '__main__':
    main()
```
baselines/a2c/policies.py (new file, 207 lines)

@@ -0,0 +1,207 @@
```python
import numpy as np
import tensorflow as tf
from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm, sample, check_shape
from baselines.common.distributions import make_pdtype
import baselines.common.tf_util as U
import gym

class LnLstmPolicy(object):
    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=256, reuse=False):
        nbatch = nenv*nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc*nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape) #obs
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=reuse):
            h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
            h3 = conv_to_fc(h3)
            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
            xs = batch_to_seq(h4, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            pi = fc(h5, 'pi', nact, act=lambda x:x)
            vf = fc(h5, 'v', 1, act=lambda x:x)

        v0 = vf[:, 0]
        a0 = sample(pi)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            a, v, s = sess.run([a0, v0, snew], {X:ob, S:state, M:mask})
            return a, v, s

        def value(ob, state, mask):
            return sess.run(v0, {X:ob, S:state, M:mask})

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value

class LstmPolicy(object):

    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=256, reuse=False):
        nbatch = nenv*nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc*nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape) #obs
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=reuse):
            h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
            h3 = conv_to_fc(h3)
            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
            xs = batch_to_seq(h4, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            pi = fc(h5, 'pi', nact, act=lambda x:x)
            vf = fc(h5, 'v', 1, act=lambda x:x)

        v0 = vf[:, 0]
        a0 = sample(pi)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            a, v, s = sess.run([a0, v0, snew], {X:ob, S:state, M:mask})
            return a, v, s

        def value(ob, state, mask):
            return sess.run(v0, {X:ob, S:state, M:mask})

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value

class CnnPolicy(object):

    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
        nbatch = nenv*nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc*nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape) #obs
        with tf.variable_scope("model", reuse=reuse):
            h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
            h3 = conv_to_fc(h3)
            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
            pi = fc(h4, 'pi', nact, act=lambda x:x)
            vf = fc(h4, 'v', 1, act=lambda x:x)

        v0 = vf[:, 0]
        a0 = sample(pi)
        self.initial_state = [] #not stateful

        def step(ob, *_args, **_kwargs):
            a, v = sess.run([a0, v0], {X:ob})
            return a, v, [] #dummy state

        def value(ob, *_args, **_kwargs):
            return sess.run(v0, {X:ob})

        self.X = X
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value

class AcerCnnPolicy(object):

    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
        nbatch = nenv * nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc * nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape)  # obs
        with tf.variable_scope("model", reuse=reuse):
            h = conv(tf.cast(X, tf.float32) / 255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
            h3 = conv_to_fc(h3)
            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
            pi_logits = fc(h4, 'pi', nact, act=lambda x: x, init_scale=0.01)
            pi = tf.nn.softmax(pi_logits)
            q = fc(h4, 'q', nact, act=lambda x: x)

        a = sample(pi_logits)  # could change this to use self.pi instead
        self.initial_state = []  # not stateful
        self.X = X
        self.pi = pi  # actual policy params now
        self.q = q

        def step(ob, *args, **kwargs):
            # returns actions, mus, states
            a0, pi0 = sess.run([a, pi], {X: ob})
            return a0, pi0, []  # dummy state

        def out(ob, *args, **kwargs):
            pi0, q0 = sess.run([pi, q], {X: ob})
            return pi0, q0

        def act(ob, *args, **kwargs):
            return sess.run(a, {X: ob})

        self.step = step
        self.out = out
        self.act = act

class AcerLstmPolicy(object):

    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256):
        nbatch = nenv * nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc * nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape)  # obs
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=reuse):
            h = conv(tf.cast(X, tf.float32) / 255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
            h3 = conv_to_fc(h3)
            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))

            # lstm
            xs = batch_to_seq(h4, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)

            pi_logits = fc(h5, 'pi', nact, act=lambda x: x, init_scale=0.01)
            pi = tf.nn.softmax(pi_logits)
            q = fc(h5, 'q', nact, act=lambda x: x)

        a = sample(pi_logits)  # could change this to use self.pi instead
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
        self.X = X
        self.M = M
        self.S = S
        self.pi = pi  # actual policy params now
        self.q = q

        def step(ob, state, mask, *args, **kwargs):
            # returns actions, mus, states
            a0, pi0, s = sess.run([a, pi, snew], {X: ob, S: state, M: mask})
            return a0, pi0, s

        self.step = step

# For Mujoco. Taken from PPOSGD
```
baselines/a2c/run_atari.py (new file, 41 lines)

@@ -0,0 +1,41 @@
```python
#!/usr/bin/env python
import os, logging, gym
from baselines import logger
from baselines.common import set_global_seeds
from baselines import bench
from baselines.a2c.a2c import learn
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
from baselines.common.atari_wrappers import wrap_deepmind
from baselines.a2c.policies import CnnPolicy, LstmPolicy, LnLstmPolicy

def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu):
    num_timesteps //= 4

    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(env, os.path.join(logger.get_dir(), "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])

    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    learn(policy_fn, env, seed, total_timesteps=num_timesteps, lrschedule=lrschedule)
    env.close()


def main():
    train('BreakoutNoFrameskip-v4', num_timesteps=int(40e6), seed=0, policy='cnn', lrschedule='linear', num_cpu=16)


if __name__ == '__main__':
    main()
```
baselines/a2c/utils.py (new file, 255 lines)

@@ -0,0 +1,255 @@
```python
import os
import gym
import numpy as np
import tensorflow as tf
from gym import spaces
from collections import deque

def sample(logits):
    noise = tf.random_uniform(tf.shape(logits))
    return tf.argmax(logits - tf.log(-tf.log(noise)), 1)

def cat_entropy(logits):
    a0 = logits - tf.reduce_max(logits, 1, keep_dims=True)
    ea0 = tf.exp(a0)
    z0 = tf.reduce_sum(ea0, 1, keep_dims=True)
    p0 = ea0 / z0
    return tf.reduce_sum(p0 * (tf.log(z0) - a0), 1)

def cat_entropy_softmax(p0):
    return - tf.reduce_sum(p0 * tf.log(p0 + 1e-6), axis = 1)

def mse(pred, target):
    return tf.square(pred-target)/2.

def ortho_init(scale=1.0):
    def _ortho_init(shape, dtype, partition_info=None):
        #lasagne ortho init for tf
        shape = tuple(shape)
        if len(shape) == 2:
            flat_shape = shape
        elif len(shape) == 4: # assumes NHWC
            flat_shape = (np.prod(shape[:-1]), shape[-1])
        else:
            raise NotImplementedError
        a = np.random.normal(0.0, 1.0, flat_shape)
        u, _, v = np.linalg.svd(a, full_matrices=False)
        q = u if u.shape == flat_shape else v # pick the one with the correct shape
        q = q.reshape(shape)
        return (scale * q[:shape[0], :shape[1]]).astype(np.float32)
    return _ortho_init

def conv(x, scope, nf, rf, stride, pad='VALID', act=tf.nn.relu, init_scale=1.0):
    with tf.variable_scope(scope):
        nin = x.get_shape()[3].value
        w = tf.get_variable("w", [rf, rf, nin, nf], initializer=ortho_init(init_scale))
        b = tf.get_variable("b", [nf], initializer=tf.constant_initializer(0.0))
        z = tf.nn.conv2d(x, w, strides=[1, stride, stride, 1], padding=pad)+b
        h = act(z)
        return h

def fc(x, scope, nh, act=tf.nn.relu, init_scale=1.0):
    with tf.variable_scope(scope):
        nin = x.get_shape()[1].value
        w = tf.get_variable("w", [nin, nh], initializer=ortho_init(init_scale))
        b = tf.get_variable("b", [nh], initializer=tf.constant_initializer(0.0))
        z = tf.matmul(x, w)+b
        h = act(z)
        return h

def batch_to_seq(h, nbatch, nsteps, flat=False):
    if flat:
        h = tf.reshape(h, [nbatch, nsteps])
    else:
        h = tf.reshape(h, [nbatch, nsteps, -1])
    return [tf.squeeze(v, [1]) for v in tf.split(axis=1, num_or_size_splits=nsteps, value=h)]

def seq_to_batch(h, flat = False):
    shape = h[0].get_shape().as_list()
    if not flat:
        assert(len(shape) > 1)
        nh = h[0].get_shape()[-1].value
        return tf.reshape(tf.concat(axis=1, values=h), [-1, nh])
    else:
        return tf.reshape(tf.stack(values=h, axis=1), [-1])

def lstm(xs, ms, s, scope, nh, init_scale=1.0):
    nbatch, nin = [v.value for v in xs[0].get_shape()]
    nsteps = len(xs)
    with tf.variable_scope(scope):
        wx = tf.get_variable("wx", [nin, nh*4], initializer=ortho_init(init_scale))
        wh = tf.get_variable("wh", [nh, nh*4], initializer=ortho_init(init_scale))
        b = tf.get_variable("b", [nh*4], initializer=tf.constant_initializer(0.0))

    c, h = tf.split(axis=1, num_or_size_splits=2, value=s)
    for idx, (x, m) in enumerate(zip(xs, ms)):
        c = c*(1-m)
        h = h*(1-m)
        z = tf.matmul(x, wx) + tf.matmul(h, wh) + b
        i, f, o, u = tf.split(axis=1, num_or_size_splits=4, value=z)
        i = tf.nn.sigmoid(i)
        f = tf.nn.sigmoid(f)
        o = tf.nn.sigmoid(o)
        u = tf.tanh(u)
        c = f*c + i*u
        h = o*tf.tanh(c)
        xs[idx] = h
    s = tf.concat(axis=1, values=[c, h])
    return xs, s

def _ln(x, g, b, e=1e-5, axes=[1]):
    u, s = tf.nn.moments(x, axes=axes, keep_dims=True)
    x = (x-u)/tf.sqrt(s+e)
    x = x*g+b
    return x

def lnlstm(xs, ms, s, scope, nh, init_scale=1.0):
    nbatch, nin = [v.value for v in xs[0].get_shape()]
    nsteps = len(xs)
    with tf.variable_scope(scope):
        wx = tf.get_variable("wx", [nin, nh*4], initializer=ortho_init(init_scale))
        gx = tf.get_variable("gx", [nh*4], initializer=tf.constant_initializer(1.0))
        bx = tf.get_variable("bx", [nh*4], initializer=tf.constant_initializer(0.0))

        wh = tf.get_variable("wh", [nh, nh*4], initializer=ortho_init(init_scale))
        gh = tf.get_variable("gh", [nh*4], initializer=tf.constant_initializer(1.0))
        bh = tf.get_variable("bh", [nh*4], initializer=tf.constant_initializer(0.0))

        b = tf.get_variable("b", [nh*4], initializer=tf.constant_initializer(0.0))

        gc = tf.get_variable("gc", [nh], initializer=tf.constant_initializer(1.0))
        bc = tf.get_variable("bc", [nh], initializer=tf.constant_initializer(0.0))

    c, h = tf.split(axis=1, num_or_size_splits=2, value=s)
    for idx, (x, m) in enumerate(zip(xs, ms)):
        c = c*(1-m)
        h = h*(1-m)
        z = _ln(tf.matmul(x, wx), gx, bx) + _ln(tf.matmul(h, wh), gh, bh) + b
        i, f, o, u = tf.split(axis=1, num_or_size_splits=4, value=z)
        i = tf.nn.sigmoid(i)
        f = tf.nn.sigmoid(f)
        o = tf.nn.sigmoid(o)
        u = tf.tanh(u)
        c = f*c + i*u
        h = o*tf.tanh(_ln(c, gc, bc))
        xs[idx] = h
    s = tf.concat(axis=1, values=[c, h])
    return xs, s

def conv_to_fc(x):
    nh = np.prod([v.value for v in x.get_shape()[1:]])
    x = tf.reshape(x, [-1, nh])
    return x

def discount_with_dones(rewards, dones, gamma):
    discounted = []
    r = 0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        r = reward + gamma*r*(1.-done) # fixed off by one bug
        discounted.append(r)
    return discounted[::-1]

def find_trainable_variables(key):
    with tf.variable_scope(key):
        return tf.trainable_variables()

def make_path(f):
    return os.makedirs(f, exist_ok=True)

def constant(p):
    return 1

def linear(p):
    return 1-p

schedules = {
    'linear':linear,
    'constant':constant
}

class Scheduler(object):

    def __init__(self, v, nvalues, schedule):
        self.n = 0.
        self.v = v
        self.nvalues = nvalues
        self.schedule = schedules[schedule]

    def value(self):
        current_value = self.v*self.schedule(self.n/self.nvalues)
        self.n += 1.
        return current_value

    def value_steps(self, steps):
        return self.v*self.schedule(steps/self.nvalues)


class EpisodeStats:
    def __init__(self, nsteps, nenvs):
        self.episode_rewards = []
        for i in range(nenvs):
            self.episode_rewards.append([])
        self.lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
        self.rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
        self.nsteps = nsteps
        self.nenvs = nenvs

    def feed(self, rewards, masks):
        rewards = np.reshape(rewards, [self.nenvs, self.nsteps])
        masks = np.reshape(masks, [self.nenvs, self.nsteps])
        for i in range(0, self.nenvs):
            for j in range(0, self.nsteps):
                self.episode_rewards[i].append(rewards[i][j])
                if masks[i][j]:
                    l = len(self.episode_rewards[i])
                    s = sum(self.episode_rewards[i])
                    self.lenbuffer.append(l)
                    self.rewbuffer.append(s)
                    self.episode_rewards[i] = []

    def mean_length(self):
        if self.lenbuffer:
            return np.mean(self.lenbuffer)
        else:
            return 0  # on the first params dump, no episodes are finished

    def mean_reward(self):
        if self.rewbuffer:
            return np.mean(self.rewbuffer)
        else:
            return 0


# For ACER
def get_by_index(x, idx):
    assert(len(x.get_shape()) == 2)
    assert(len(idx.get_shape()) == 1)
    idx_flattened = tf.range(0, x.shape[0]) * x.shape[1] + idx
    y = tf.gather(tf.reshape(x, [-1]),  # flatten input
                  idx_flattened)  # use flattened indices
    return y

def check_shape(ts, shapes):
    i = 0
    for (t, shape) in zip(ts, shapes):
        assert t.get_shape().as_list()==shape, "id " + str(i) + " shape " + str(t.get_shape()) + str(shape)
        i += 1

def avg_norm(t):
    return tf.reduce_mean(tf.sqrt(tf.reduce_sum(tf.square(t), axis=-1)))

def myadd(g1, g2, param):
    print([g1, g2, param.name])
    assert (not (g1 is None and g2 is None)), param.name
    if g1 is None:
        return g2
    elif g2 is None:
        return g1
    else:
        return g1 + g2

def my_explained_variance(qpred, q):
    _, vary = tf.nn.moments(q, axes=[0, 1])
    _, varpred = tf.nn.moments(q - qpred, axes=[0, 1])
    check_shape([vary, varpred], [[]] * 2)
    return 1.0 - (varpred / vary)
```
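As a quick illustration of how `discount_with_dones` above handles episode boundaries, the sketch below (not part of the commit) discounts a short reward sequence; the running return is reset wherever `dones` is 1, so rewards are never bootstrapped across an episode end.

```python
# Illustration only: returns from discount_with_dones with gamma = 0.99.
from baselines.a2c.utils import discount_with_dones

rewards = [1., 1., 1., 1.]
dones = [0, 0, 1, 0]  # the third step ends an episode, so discounting restarts there
print(discount_with_dones(rewards, dones, 0.99))
# -> [2.9701, 1.99, 1.0, 1.0]
```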
baselines/acktr/__init__.py (new file, 0 lines)

baselines/acktr/acktr_cont.py (new file, 138 lines)

@@ -0,0 +1,138 @@
```python
import numpy as np
import tensorflow as tf
from baselines import logger
from baselines import common
from baselines.common import tf_util as U
from baselines.acktr import kfac
from baselines.acktr.filters import ZFilter

def pathlength(path):
    return path["reward"].shape[0]# Loss function that we'll differentiate to get the policy gradient

def rollout(env, policy, max_pathlength, animate=False, obfilter=None):
    """
    Simulate the env and policy for max_pathlength steps
    """
    ob = env.reset()
    prev_ob = np.float32(np.zeros(ob.shape))
    if obfilter: ob = obfilter(ob)
    terminated = False

    obs = []
    acs = []
    ac_dists = []
    logps = []
    rewards = []
    for _ in range(max_pathlength):
        if animate:
            env.render()
        state = np.concatenate([ob, prev_ob], -1)
        obs.append(state)
        ac, ac_dist, logp = policy.act(state)
        acs.append(ac)
        ac_dists.append(ac_dist)
        logps.append(logp)
        prev_ob = np.copy(ob)
        scaled_ac = env.action_space.low + (ac + 1.) * 0.5 * (env.action_space.high - env.action_space.low)
        scaled_ac = np.clip(scaled_ac, env.action_space.low, env.action_space.high)
        ob, rew, done, _ = env.step(scaled_ac)
        if obfilter: ob = obfilter(ob)
        rewards.append(rew)
        if done:
            terminated = True
            break
    return {"observation" : np.array(obs), "terminated" : terminated,
            "reward" : np.array(rewards), "action" : np.array(acs),
            "action_dist": np.array(ac_dists), "logp" : np.array(logps)}

def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps,
    animate=False, callback=None, optimizer="adam", desired_kl=0.002):

    obfilter = ZFilter(env.observation_space.shape)

    max_pathlength = env.spec.timestep_limit
    stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize')
    inputs, loss, loss_sampled = policy.update_info
    optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=0.9, kfac_update=2,\
                                epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1,
                                weight_decay_dict=policy.wd_dict, max_grad_norm=None)
    pi_var_list = []
    for var in tf.trainable_variables():
        if "pi" in var.name:
            pi_var_list.append(var)

    update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list)
    do_update = U.function(inputs, update_op)
    U.initialize()

    # start queue runners
    enqueue_threads = []
    coord = tf.train.Coordinator()
    for qr in [q_runner, vf.q_runner]:
        assert (qr != None)
        enqueue_threads.extend(qr.create_threads(U.get_session(), coord=coord, start=True))

    i = 0
    timesteps_so_far = 0
    while True:
        if timesteps_so_far > num_timesteps:
            break
        logger.log("********** Iteration %i ************"%i)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            path = rollout(env, policy, max_pathlength, animate=(len(paths)==0 and (i % 10 == 0) and animate), obfilter=obfilter)
            paths.append(path)
            n = pathlength(path)
            timesteps_this_batch += n
            timesteps_so_far += n
            if timesteps_this_batch > timesteps_per_batch:
                break

        # Estimate advantage function
        vtargs = []
        advs = []
        for path in paths:
            rew_t = path["reward"]
            return_t = common.discount(rew_t, gamma)
            vtargs.append(return_t)
            vpred_t = vf.predict(path)
            vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1])
            delta_t = rew_t + gamma*vpred_t[1:] - vpred_t[:-1]
            adv_t = common.discount(delta_t, gamma * lam)
            advs.append(adv_t)
        # Update value function
        vf.fit(paths, vtargs)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        action_na = np.concatenate([path["action"] for path in paths])
        oldac_dist = np.concatenate([path["action_dist"] for path in paths])
        logp_n = np.concatenate([path["logp"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)

        # Policy update
        do_update(ob_no, action_na, standardized_adv_n)

        # Adjust stepsize
        kl = policy.compute_kl(ob_no, oldac_dist)
        if kl > desired_kl * 2:
            logger.log("kl too high")
            U.eval(tf.assign(stepsize, stepsize / 1.5))
        elif kl < desired_kl / 2:
            logger.log("kl too low")
            U.eval(tf.assign(stepsize, stepsize * 1.5))
        else:
            logger.log("kl just right!")

        logger.record_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths]))
        logger.record_tabular("EpRewSEM", np.std([path["reward"].sum()/np.sqrt(len(paths)) for path in paths]))
        logger.record_tabular("EpLenMean", np.mean([pathlength(path) for path in paths]))
        logger.record_tabular("KL", kl)
        if callback:
            callback()
        logger.dump_tabular()
        i += 1
```
baselines/acktr/acktr_disc.py (new file, 214 lines)

@@ -0,0 +1,214 @@
```python
import os.path as osp
import time
import joblib
import numpy as np
import tensorflow as tf
from baselines import logger

from baselines.common import set_global_seeds, explained_variance

from baselines.acktr.utils import discount_with_dones
from baselines.acktr.utils import Scheduler, find_trainable_variables
from baselines.acktr.utils import cat_entropy, mse
from baselines.acktr import kfac


class Model(object):

    def __init__(self, policy, ob_space, ac_space, nenvs,total_timesteps, nprocs=32, nsteps=20,
                 nstack=4, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
                 kfac_clip=0.001, lrschedule='linear'):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=nprocs,
                                inter_op_parallelism_threads=nprocs)
        config.gpu_options.allow_growth = True
        self.sess = sess = tf.Session(config=config)
        nact = ac_space.n
        nbatch = nenvs * nsteps
        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        PG_LR = tf.placeholder(tf.float32, [])
        VF_LR = tf.placeholder(tf.float32, [])

        self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
        self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)

        logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
        self.logits = logits = train_model.pi

        ##training loss
        pg_loss = tf.reduce_mean(ADV*logpac)
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        pg_loss = pg_loss - ent_coef * entropy
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        train_loss = pg_loss + vf_coef * vf_loss


        ##Fisher loss construction
        self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
        sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf))
        self.vf_fisher = vf_fisher_loss = - vf_fisher_coef*tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
        self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss

        self.params=params = find_trainable_variables("model")

        self.grads_check = grads = tf.gradients(train_loss,params)

        with tf.device('/gpu:0'):
            self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\
                momentum=0.9, kfac_update=1, epsilon=0.01,\
                stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)

            update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
            train_op, q_runner = optim.apply_gradients(list(zip(grads,params)))
        self.q_runner = q_runner
        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = self.lr.value()

            td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, PG_LR:cur_lr}
            if states != []:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, train_op],
                td_map
            )
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)



        self.train = train
        self.save = save
        self.load = load
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)

class Runner(object):

    def __init__(self, env, model, nsteps, nstack, gamma):
        self.env = env
        self.model = model
        nh, nw, nc = env.observation_space.shape
        nenv = env.num_envs
        self.batch_ob_shape = (nenv*nsteps, nh, nw, nc*nstack)
        self.obs = np.zeros((nenv, nh, nw, nc*nstack), dtype=np.uint8)
        obs = env.reset()
        self.update_obs(obs)
        self.gamma = gamma
        self.nsteps = nsteps
        self.states = model.initial_state
        self.dones = [False for _ in range(nenv)]

    def update_obs(self, obs):
        self.obs = np.roll(self.obs, shift=-1, axis=3)
        self.obs[:, :, :, -1] = obs[:, :, :, 0]

    def run(self):
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
        mb_states = self.states
        for n in range(self.nsteps):
            actions, values, states = self.model.step(self.obs, self.states, self.dones)
            mb_obs.append(np.copy(self.obs))
            mb_actions.append(actions)
            mb_values.append(values)
            mb_dones.append(self.dones)
            obs, rewards, dones, _ = self.env.step(actions)
            self.states = states
            self.dones = dones
            for n, done in enumerate(dones):
                if done:
                    self.obs[n] = self.obs[n]*0
            self.update_obs(obs)
            mb_rewards.append(rewards)
        mb_dones.append(self.dones)
        #batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
        mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
        mb_masks = mb_dones[:, :-1]
        mb_dones = mb_dones[:, 1:]
        last_values = self.model.value(self.obs, self.states, self.dones).tolist()
        #discount/bootstrap off value fn
        for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
            rewards = rewards.tolist()
            dones = dones.tolist()
            if dones[-1] == 0:
                rewards = discount_with_dones(rewards+[value], dones+[0], self.gamma)[:-1]
            else:
                rewards = discount_with_dones(rewards, dones, self.gamma)
            mb_rewards[n] = rewards
        mb_rewards = mb_rewards.flatten()
        mb_actions = mb_actions.flatten()
        mb_values = mb_values.flatten()
        mb_masks = mb_masks.flatten()
        return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values

def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20,
                 nstack=4, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
                 kfac_clip=0.001, save_interval=None, lrschedule='linear'):
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    make_model = lambda : Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs, nsteps
                                =nsteps, nstack=nstack, ent_coef=ent_coef, vf_coef=vf_coef, vf_fisher_coef=
                                vf_fisher_coef, lr=lr, max_grad_norm=max_grad_norm, kfac_clip=kfac_clip,
                                lrschedule=lrschedule)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()

    runner = Runner(env, model, nsteps=nsteps, nstack=nstack, gamma=gamma)
    nbatch = nenvs*nsteps
    tstart = time.time()
    enqueue_threads = model.q_runner.create_threads(model.sess, coord=tf.train.Coordinator(), start=True)
    for update in range(1, total_timesteps//nbatch+1):
        obs, states, rewards, masks, actions, values = runner.run()
        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
        model.old_obs = obs
        nseconds = time.time()-tstart
        fps = int((update*nbatch)/nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update*nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("policy_loss", float(policy_loss))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.dump_tabular()

        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            savepath = osp.join(logger.get_dir(), 'checkpoint%.5i'%update)
            print('Saving to', savepath)
            model.save(savepath)

    env.close()
```
baselines/acktr/filters.py (new file, 98 lines)

@@ -0,0 +1,98 @@
```python
from baselines.acktr.running_stat import RunningStat
from collections import deque
import numpy as np

class Filter(object):
    def __call__(self, x, update=True):
        raise NotImplementedError
    def reset(self):
        pass

class IdentityFilter(Filter):
    def __call__(self, x, update=True):
        return x

class CompositionFilter(Filter):
    def __init__(self, fs):
        self.fs = fs
    def __call__(self, x, update=True):
        for f in self.fs:
            x = f(x)
        return x
    def output_shape(self, input_space):
        out = input_space.shape
        for f in self.fs:
            out = f.output_shape(out)
        return out

class ZFilter(Filter):
    """
    y = (x-mean)/std
    using running estimates of mean,std
    """

    def __init__(self, shape, demean=True, destd=True, clip=10.0):
        self.demean = demean
        self.destd = destd
        self.clip = clip

        self.rs = RunningStat(shape)

    def __call__(self, x, update=True):
        if update: self.rs.push(x)
        if self.demean:
            x = x - self.rs.mean
        if self.destd:
            x = x / (self.rs.std+1e-8)
        if self.clip:
            x = np.clip(x, -self.clip, self.clip)
        return x
    def output_shape(self, input_space):
        return input_space.shape

class AddClock(Filter):
    def __init__(self):
        self.count = 0
    def reset(self):
        self.count = 0
    def __call__(self, x, update=True):
        return np.append(x, self.count/100.0)
    def output_shape(self, input_space):
        return (input_space.shape[0]+1,)

class FlattenFilter(Filter):
    def __call__(self, x, update=True):
        return x.ravel()
    def output_shape(self, input_space):
        return (int(np.prod(input_space.shape)),)

class Ind2OneHotFilter(Filter):
    def __init__(self, n):
        self.n = n
    def __call__(self, x, update=True):
        out = np.zeros(self.n)
        out[x] = 1
        return out
    def output_shape(self, input_space):
        return (input_space.n,)

class DivFilter(Filter):
    def __init__(self, divisor):
        self.divisor = divisor
    def __call__(self, x, update=True):
        return x / self.divisor
    def output_shape(self, input_space):
        return input_space.shape

class StackFilter(Filter):
    def __init__(self, length):
        self.stack = deque(maxlen=length)
    def reset(self):
        self.stack.clear()
    def __call__(self, x, update=True):
        self.stack.append(x)
        while len(self.stack) < self.stack.maxlen:
            self.stack.append(x)
        return np.concatenate(self.stack, axis=-1)
    def output_shape(self, input_space):
        return input_space.shape[:-1] + (input_space.shape[-1] * self.stack.maxlen,)
```
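A small usage sketch for `ZFilter` (illustrative, not part of the commit): it keeps running mean/std estimates via `RunningStat` and standardizes, then clips, each observation it is called on.

```python
# Illustration only: standardize observations with running statistics.
import numpy as np
from baselines.acktr.filters import ZFilter

obfilter = ZFilter((3,), clip=10.0)
for _ in range(1000):
    obfilter(np.random.randn(3) * 5.0 + 2.0)  # update running mean/std
print(obfilter(np.array([2.0, 2.0, 2.0]), update=False))  # roughly zero-centered
```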
926
baselines/acktr/kfac.py
Normal file
926
baselines/acktr/kfac.py
Normal file
@@ -0,0 +1,926 @@
|
||||
import tensorflow as tf
|
||||
import numpy as np
|
||||
import re
|
||||
from baselines.acktr.kfac_utils import *
|
||||
from functools import reduce
|
||||
|
||||
KFAC_OPS = ['MatMul', 'Conv2D', 'BiasAdd']
|
||||
KFAC_DEBUG = False
|
||||
|
||||
|
||||
class KfacOptimizer():
|
||||
|
||||
def __init__(self, learning_rate=0.01, momentum=0.9, clip_kl=0.01, kfac_update=2, stats_accum_iter=60, full_stats_init=False, cold_iter=100, cold_lr=None, async=False, async_stats=False, epsilon=1e-2, stats_decay=0.95, blockdiag_bias=False, channel_fac=False, factored_damping=False, approxT2=False, use_float64=False, weight_decay_dict={},max_grad_norm=0.5):
|
||||
self.max_grad_norm = max_grad_norm
|
||||
self._lr = learning_rate
|
||||
self._momentum = momentum
|
||||
self._clip_kl = clip_kl
|
||||
self._channel_fac = channel_fac
|
||||
self._kfac_update = kfac_update
|
||||
self._async = async
|
||||
self._async_stats = async_stats
|
||||
self._epsilon = epsilon
|
||||
self._stats_decay = stats_decay
|
||||
self._blockdiag_bias = blockdiag_bias
|
||||
self._approxT2 = approxT2
|
||||
self._use_float64 = use_float64
|
||||
self._factored_damping = factored_damping
|
||||
self._cold_iter = cold_iter
|
||||
if cold_lr == None:
|
||||
# good heuristics
|
||||
self._cold_lr = self._lr# * 3.
|
||||
else:
|
||||
self._cold_lr = cold_lr
|
||||
self._stats_accum_iter = stats_accum_iter
|
||||
self._weight_decay_dict = weight_decay_dict
|
||||
self._diag_init_coeff = 0.
|
||||
self._full_stats_init = full_stats_init
|
||||
if not self._full_stats_init:
|
||||
self._stats_accum_iter = self._cold_iter
|
||||
|
||||
self.sgd_step = tf.Variable(0, name='KFAC/sgd_step', trainable=False)
|
||||
self.global_step = tf.Variable(
|
||||
0, name='KFAC/global_step', trainable=False)
|
||||
self.cold_step = tf.Variable(0, name='KFAC/cold_step', trainable=False)
|
||||
self.factor_step = tf.Variable(
|
||||
0, name='KFAC/factor_step', trainable=False)
|
||||
self.stats_step = tf.Variable(
|
||||
0, name='KFAC/stats_step', trainable=False)
|
||||
self.vFv = tf.Variable(0., name='KFAC/vFv', trainable=False)
|
||||
|
||||
self.factors = {}
|
||||
self.param_vars = []
|
||||
self.stats = {}
|
||||
self.stats_eigen = {}
|
||||
|
||||
def getFactors(self, g, varlist):
|
||||
graph = tf.get_default_graph()
|
||||
factorTensors = {}
|
||||
fpropTensors = []
|
||||
bpropTensors = []
|
||||
opTypes = []
|
||||
fops = []
|
||||
|
||||
def searchFactors(gradient, graph):
|
||||
# hard coded search stratergy
|
||||
bpropOp = gradient.op
|
||||
bpropOp_name = bpropOp.name
|
||||
|
||||
bTensors = []
|
||||
fTensors = []
|
||||
|
||||
# combining additive gradient, assume they are the same op type and
|
||||
# indepedent
|
||||
if 'AddN' in bpropOp_name:
|
||||
factors = []
|
||||
for g in gradient.op.inputs:
|
||||
factors.append(searchFactors(g, graph))
|
||||
op_names = [item['opName'] for item in factors]
|
||||
# TO-DO: need to check all the attribute of the ops as well
|
||||
print (gradient.name)
|
||||
print (op_names)
|
||||
print (len(np.unique(op_names)))
|
||||
assert len(np.unique(op_names)) == 1, gradient.name + \
|
||||
' is shared among different computation OPs'
|
||||
|
||||
bTensors = reduce(lambda x, y: x + y,
|
||||
[item['bpropFactors'] for item in factors])
|
||||
if len(factors[0]['fpropFactors']) > 0:
|
||||
fTensors = reduce(
|
||||
lambda x, y: x + y, [item['fpropFactors'] for item in factors])
|
||||
fpropOp_name = op_names[0]
|
||||
fpropOp = factors[0]['op']
|
||||
else:
|
||||
fpropOp_name = re.search(
|
||||
'gradientsSampled(_[0-9]+|)/(.+?)_grad', bpropOp_name).group(2)
|
||||
fpropOp = graph.get_operation_by_name(fpropOp_name)
|
||||
if fpropOp.op_def.name in KFAC_OPS:
|
||||
# Known OPs
|
||||
###
|
||||
bTensor = [
|
||||
i for i in bpropOp.inputs if 'gradientsSampled' in i.name][-1]
|
||||
bTensorShape = fpropOp.outputs[0].get_shape()
|
||||
if bTensor.get_shape()[0].value == None:
|
||||
bTensor.set_shape(bTensorShape)
|
||||
bTensors.append(bTensor)
|
||||
###
|
||||
if fpropOp.op_def.name == 'BiasAdd':
|
||||
fTensors = []
|
||||
else:
|
||||
fTensors.append(
|
||||
[i for i in fpropOp.inputs if param.op.name not in i.name][0])
|
||||
fpropOp_name = fpropOp.op_def.name
|
||||
else:
|
||||
# unknown OPs, block approximation used
|
||||
bInputsList = [i for i in bpropOp.inputs[
|
||||
0].op.inputs if 'gradientsSampled' in i.name if 'Shape' not in i.name]
|
||||
if len(bInputsList) > 0:
|
||||
bTensor = bInputsList[0]
|
||||
bTensorShape = fpropOp.outputs[0].get_shape()
|
||||
if len(bTensor.get_shape()) > 0 and bTensor.get_shape()[0].value == None:
|
||||
bTensor.set_shape(bTensorShape)
|
||||
bTensors.append(bTensor)
|
||||
fpropOp_name = opTypes.append('UNK-' + fpropOp.op_def.name)
|
||||
|
||||
return {'opName': fpropOp_name, 'op': fpropOp, 'fpropFactors': fTensors, 'bpropFactors': bTensors}
|
||||
|
||||
for t, param in zip(g, varlist):
|
||||
if KFAC_DEBUG:
|
||||
print(('get factor for '+param.name))
|
||||
factors = searchFactors(t, graph)
|
||||
factorTensors[param] = factors
|
||||
|
||||
########
|
||||
# check associated weights and bias for homogeneous coordinate representation
|
||||
# and check redundent factors
|
||||
# TO-DO: there may be a bug to detect associate bias and weights for
|
||||
# forking layer, e.g. in inception models.
|
||||
for param in varlist:
|
||||
factorTensors[param]['assnWeights'] = None
|
||||
factorTensors[param]['assnBias'] = None
|
||||
for param in varlist:
|
||||
if factorTensors[param]['opName'] == 'BiasAdd':
|
||||
factorTensors[param]['assnWeights'] = None
|
||||
for item in varlist:
|
||||
if len(factorTensors[item]['bpropFactors']) > 0:
|
||||
if (set(factorTensors[item]['bpropFactors']) == set(factorTensors[param]['bpropFactors'])) and (len(factorTensors[item]['fpropFactors']) > 0):
|
||||
factorTensors[param]['assnWeights'] = item
|
||||
factorTensors[item]['assnBias'] = param
|
||||
factorTensors[param]['bpropFactors'] = factorTensors[
|
||||
item]['bpropFactors']
|
||||
|
||||
########
|
||||
|
||||
########
|
||||
# concatenate the additive gradients along the batch dimension, i.e.
|
||||
# assuming independence structure
|
||||
for key in ['fpropFactors', 'bpropFactors']:
|
||||
for i, param in enumerate(varlist):
|
||||
if len(factorTensors[param][key]) > 0:
|
||||
if (key + '_concat') not in factorTensors[param]:
|
||||
name_scope = factorTensors[param][key][0].name.split(':')[
|
||||
0]
|
||||
with tf.name_scope(name_scope):
|
||||
factorTensors[param][
|
||||
key + '_concat'] = tf.concat(factorTensors[param][key], 0)
|
||||
else:
|
||||
factorTensors[param][key + '_concat'] = None
|
||||
for j, param2 in enumerate(varlist[(i + 1):]):
|
||||
if (len(factorTensors[param][key]) > 0) and (set(factorTensors[param2][key]) == set(factorTensors[param][key])):
|
||||
factorTensors[param2][key] = factorTensors[param][key]
|
||||
factorTensors[param2][
|
||||
key + '_concat'] = factorTensors[param][key + '_concat']
|
||||
########
|
||||
|
||||
if KFAC_DEBUG:
|
||||
for items in zip(varlist, fpropTensors, bpropTensors, opTypes):
|
||||
print((items[0].name, factorTensors[item]))
|
||||
self.factors = factorTensors
|
||||
return factorTensors
|
||||
|
||||
def getStats(self, factors, varlist):
|
||||
if len(self.stats) == 0:
|
||||
# initialize stats variables on CPU because eigen decomp is
|
||||
# computed on CPU
|
||||
with tf.device('/cpu'):
|
||||
tmpStatsCache = {}
|
||||
|
||||
# search for tensor factors and
|
||||
# use block diag approx for the bias units
|
||||
for var in varlist:
|
||||
fpropFactor = factors[var]['fpropFactors_concat']
|
||||
bpropFactor = factors[var]['bpropFactors_concat']
|
||||
opType = factors[var]['opName']
|
||||
if opType == 'Conv2D':
|
||||
Kh = var.get_shape()[0]
|
||||
Kw = var.get_shape()[1]
|
||||
C = fpropFactor.get_shape()[-1]
|
||||
|
||||
Oh = bpropFactor.get_shape()[1]
|
||||
Ow = bpropFactor.get_shape()[2]
|
||||
if Oh == 1 and Ow == 1 and self._channel_fac:
|
||||
# factorization along the channels do not support
|
||||
# homogeneous coordinate
|
||||
var_assnBias = factors[var]['assnBias']
|
||||
if var_assnBias:
|
||||
factors[var]['assnBias'] = None
|
||||
factors[var_assnBias]['assnWeights'] = None
|
||||
##
|
||||
|
||||
for var in varlist:
|
||||
fpropFactor = factors[var]['fpropFactors_concat']
|
||||
bpropFactor = factors[var]['bpropFactors_concat']
|
||||
opType = factors[var]['opName']
|
||||
self.stats[var] = {'opName': opType,
|
||||
'fprop_concat_stats': [],
|
||||
'bprop_concat_stats': [],
|
||||
'assnWeights': factors[var]['assnWeights'],
|
||||
'assnBias': factors[var]['assnBias'],
|
||||
}
|
||||
if fpropFactor is not None:
|
||||
if fpropFactor not in tmpStatsCache:
|
||||
if opType == 'Conv2D':
|
||||
Kh = var.get_shape()[0]
|
||||
Kw = var.get_shape()[1]
|
||||
C = fpropFactor.get_shape()[-1]
|
||||
|
||||
Oh = bpropFactor.get_shape()[1]
|
||||
Ow = bpropFactor.get_shape()[2]
|
||||
if Oh == 1 and Ow == 1 and self._channel_fac:
|
||||
# factorization along the channels
|
||||
# assume independence bewteen input channels and spatial
|
||||
# 2K-1 x 2K-1 covariance matrix and C x C covariance matrix
|
||||
# factorization along the channels do not
|
||||
# support homogeneous coordinate, assnBias
|
||||
# is always None
|
||||
fpropFactor2_size = Kh * Kw
|
||||
slot_fpropFactor_stats2 = tf.Variable(tf.diag(tf.ones(
|
||||
[fpropFactor2_size])) * self._diag_init_coeff, name='KFAC_STATS/' + fpropFactor.op.name, trainable=False)
|
||||
self.stats[var]['fprop_concat_stats'].append(
|
||||
slot_fpropFactor_stats2)
|
||||
|
||||
fpropFactor_size = C
|
||||
else:
|
||||
# 2K-1 x 2K-1 x C x C covariance matrix
|
||||
# assume BHWC
|
||||
fpropFactor_size = Kh * Kw * C
|
||||
else:
|
||||
# D x D covariance matrix
|
||||
fpropFactor_size = fpropFactor.get_shape()[-1]
|
||||
|
||||
# use homogeneous coordinate
|
||||
if not self._blockdiag_bias and self.stats[var]['assnBias']:
|
||||
fpropFactor_size += 1
|
||||
|
||||
slot_fpropFactor_stats = tf.Variable(tf.diag(tf.ones(
|
||||
[fpropFactor_size])) * self._diag_init_coeff, name='KFAC_STATS/' + fpropFactor.op.name, trainable=False)
|
||||
self.stats[var]['fprop_concat_stats'].append(
|
||||
slot_fpropFactor_stats)
|
||||
if opType != 'Conv2D':
|
||||
tmpStatsCache[fpropFactor] = self.stats[
|
||||
var]['fprop_concat_stats']
|
||||
else:
|
||||
self.stats[var][
|
||||
'fprop_concat_stats'] = tmpStatsCache[fpropFactor]
|
||||
|
||||
if bpropFactor is not None:
|
||||
# no need to collect backward stats for bias vectors if
|
||||
# using homogeneous coordinates
|
||||
if not((not self._blockdiag_bias) and self.stats[var]['assnWeights']):
|
||||
if bpropFactor not in tmpStatsCache:
|
||||
slot_bpropFactor_stats = tf.Variable(tf.diag(tf.ones([bpropFactor.get_shape(
|
||||
)[-1]])) * self._diag_init_coeff, name='KFAC_STATS/' + bpropFactor.op.name, trainable=False)
|
||||
self.stats[var]['bprop_concat_stats'].append(
|
||||
slot_bpropFactor_stats)
|
||||
tmpStatsCache[bpropFactor] = self.stats[
|
||||
var]['bprop_concat_stats']
|
||||
else:
|
||||
self.stats[var][
|
||||
'bprop_concat_stats'] = tmpStatsCache[bpropFactor]
|
||||
|
||||
return self.stats
|
||||
|
||||
def compute_and_apply_stats(self, loss_sampled, var_list=None):
|
||||
varlist = var_list
|
||||
if varlist is None:
|
||||
varlist = tf.trainable_variables()
|
||||
|
||||
stats = self.compute_stats(loss_sampled, var_list=varlist)
|
||||
return self.apply_stats(stats)
|
||||
|
||||
def compute_stats(self, loss_sampled, var_list=None):
|
||||
varlist = var_list
|
||||
if varlist is None:
|
||||
varlist = tf.trainable_variables()
|
||||
|
||||
gs = tf.gradients(loss_sampled, varlist, name='gradientsSampled')
|
||||
self.gs = gs
|
||||
factors = self.getFactors(gs, varlist)
|
||||
stats = self.getStats(factors, varlist)
|
||||
|
||||
updateOps = []
|
||||
statsUpdates = {}
|
||||
statsUpdates_cache = {}
|
||||
for var in varlist:
|
||||
opType = factors[var]['opName']
|
||||
fops = factors[var]['op']
|
||||
fpropFactor = factors[var]['fpropFactors_concat']
|
||||
fpropStats_vars = stats[var]['fprop_concat_stats']
|
||||
bpropFactor = factors[var]['bpropFactors_concat']
|
||||
bpropStats_vars = stats[var]['bprop_concat_stats']
|
||||
SVD_factors = {}
|
||||
for stats_var in fpropStats_vars:
|
||||
stats_var_dim = int(stats_var.get_shape()[0])
|
||||
if stats_var not in statsUpdates_cache:
|
||||
old_fpropFactor = fpropFactor
|
||||
B = (tf.shape(fpropFactor)[0]) # batch size
|
||||
if opType == 'Conv2D':
|
||||
strides = fops.get_attr("strides")
|
||||
padding = fops.get_attr("padding")
|
||||
convkernel_size = var.get_shape()[0:3]
|
||||
|
||||
KH = int(convkernel_size[0])
|
||||
KW = int(convkernel_size[1])
|
||||
C = int(convkernel_size[2])
|
||||
flatten_size = int(KH * KW * C)
|
||||
|
||||
Oh = int(bpropFactor.get_shape()[1])
|
||||
Ow = int(bpropFactor.get_shape()[2])
|
||||
|
||||
if Oh == 1 and Ow == 1 and self._channel_fac:
|
||||
# factorization along the channels
|
||||
# assume independence among input channels
|
||||
# factor = B x 1 x 1 x (KH x KW x C)
|
||||
# patches = B x Oh x Ow x (KH x KW x C)
|
||||
if len(SVD_factors) == 0:
|
||||
if KFAC_DEBUG:
|
||||
print(('approx %s act factor with rank-1 SVD factors' % (var.name)))
|
||||
# find closest rank-1 approx to the feature map
|
||||
S, U, V = tf.batch_svd(tf.reshape(
|
||||
fpropFactor, [-1, KH * KW, C]))
|
||||
# get rank-1 approx slices
|
||||
sqrtS1 = tf.expand_dims(tf.sqrt(S[:, 0, 0]), 1)
|
||||
patches_k = U[:, :, 0] * sqrtS1 # B x KH*KW
|
||||
full_factor_shape = fpropFactor.get_shape()
|
||||
patches_k.set_shape(
|
||||
[full_factor_shape[0], KH * KW])
|
||||
patches_c = V[:, :, 0] * sqrtS1 # B x C
|
||||
patches_c.set_shape([full_factor_shape[0], C])
|
||||
SVD_factors[C] = patches_c
|
||||
SVD_factors[KH * KW] = patches_k
|
||||
fpropFactor = SVD_factors[stats_var_dim]
|
||||
|
||||
else:
|
||||
# poor mem usage implementation
|
||||
patches = tf.extract_image_patches(fpropFactor, ksizes=[1, convkernel_size[
|
||||
0], convkernel_size[1], 1], strides=strides, rates=[1, 1, 1, 1], padding=padding)
|
||||
|
||||
if self._approxT2:
|
||||
if KFAC_DEBUG:
|
||||
print(('approxT2 act fisher for %s' % (var.name)))
|
||||
# T^2 terms * 1/T^2, size: B x C
|
||||
fpropFactor = tf.reduce_mean(patches, [1, 2])
|
||||
else:
|
||||
# size: (B x Oh x Ow) x C
|
||||
fpropFactor = tf.reshape(
|
||||
patches, [-1, flatten_size]) / Oh / Ow
|
||||
fpropFactor_size = int(fpropFactor.get_shape()[-1])
|
||||
if stats_var_dim == (fpropFactor_size + 1) and not self._blockdiag_bias:
|
||||
if opType == 'Conv2D' and not self._approxT2:
|
||||
# correct padding for numerical stability (we
|
||||
# divided out OhxOw from activations for T1 approx)
|
||||
fpropFactor = tf.concat([fpropFactor, tf.ones(
|
||||
[tf.shape(fpropFactor)[0], 1]) / Oh / Ow], 1)
|
||||
else:
|
||||
# use homogeneous coordinates
|
||||
fpropFactor = tf.concat(
|
||||
[fpropFactor, tf.ones([tf.shape(fpropFactor)[0], 1])], 1)
|
||||
|
||||
# average over the number of data points in a batch
|
||||
# divided by B
|
||||
cov = tf.matmul(fpropFactor, fpropFactor,
|
||||
transpose_a=True) / tf.cast(B, tf.float32)
|
||||
updateOps.append(cov)
|
||||
statsUpdates[stats_var] = cov
|
||||
if opType != 'Conv2D':
|
||||
# HACK: for convolution we recompute fprop stats for
|
||||
# every layer including forking layers
|
||||
statsUpdates_cache[stats_var] = cov
|
||||
|
||||
for stats_var in bpropStats_vars:
|
||||
stats_var_dim = int(stats_var.get_shape()[0])
|
||||
if stats_var not in statsUpdates_cache:
|
||||
old_bpropFactor = bpropFactor
|
||||
bpropFactor_shape = bpropFactor.get_shape()
|
||||
B = tf.shape(bpropFactor)[0] # batch size
|
||||
C = int(bpropFactor_shape[-1]) # num channels
|
||||
if opType == 'Conv2D' or len(bpropFactor_shape) == 4:
|
||||
if fpropFactor is not None:
|
||||
if self._approxT2:
|
||||
if KFAC_DEBUG:
|
||||
print(('approxT2 grad fisher for %s' % (var.name)))
|
||||
bpropFactor = tf.reduce_sum(
|
||||
bpropFactor, [1, 2]) # T^2 terms * 1/T^2
|
||||
else:
|
||||
bpropFactor = tf.reshape(
|
||||
bpropFactor, [-1, C]) * Oh * Ow # T * 1/T terms
|
||||
else:
|
||||
# just doing block diag approx; the spatially independent
|
||||
# structure does not apply here. summing over
|
||||
# spatial locations
|
||||
if KFAC_DEBUG:
|
||||
print(('block diag approx fisher for %s' % (var.name)))
|
||||
bpropFactor = tf.reduce_sum(bpropFactor, [1, 2])
|
||||
|
||||
# assume sampled loss is averaged. TO-DO: figure out a better
|
||||
# way to handle this
|
||||
bpropFactor *= tf.to_float(B)
|
||||
##
|
||||
|
||||
cov_b = tf.matmul(
|
||||
bpropFactor, bpropFactor, transpose_a=True) / tf.to_float(tf.shape(bpropFactor)[0])
|
||||
|
||||
updateOps.append(cov_b)
|
||||
statsUpdates[stats_var] = cov_b
|
||||
statsUpdates_cache[stats_var] = cov_b
|
||||
|
||||
if KFAC_DEBUG:
|
||||
aKey = list(statsUpdates.keys())[0]
|
||||
statsUpdates[aKey] = tf.Print(statsUpdates[aKey],
|
||||
[tf.convert_to_tensor('step:'),
|
||||
self.global_step,
|
||||
tf.convert_to_tensor(
|
||||
'computing stats'),
|
||||
])
|
||||
self.statsUpdates = statsUpdates
|
||||
return statsUpdates
|
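For orientation: the cov ops assembled above are batch-averaged outer products, one from the layer inputs (the fprop factor) and one from the sampled-loss gradients with respect to the layer outputs (the bprop factor). A minimal NumPy sketch for a single dense layer; the shapes and names are illustrative, not taken from the diff:

```python
import numpy as np

# toy sizes: batch of 32 activations of width 64 feeding a dense layer with 10 outputs
B, D_in, D_out = 32, 64, 10
a = np.random.randn(B, D_in)    # layer input, analogue of fpropFactors_concat
g = np.random.randn(B, D_out)   # sampled-loss gradient w.r.t. layer output, analogue of bpropFactors_concat

# Kronecker factors of the Fisher block, matching the cov / cov_b ops above
A = a.T @ a / B                 # D_in x D_in, goes into fprop_concat_stats
G = g.T @ g / B                 # D_out x D_out, goes into bprop_concat_stats

# with the homogeneous coordinate (bias folded into the weight matrix), a column of
# ones is appended to the activations first, as the tf.concat above does
a_h = np.concatenate([a, np.ones((B, 1))], axis=1)
A_h = a_h.T @ a_h / B           # (D_in+1) x (D_in+1)
```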
||||
|
||||
def apply_stats(self, statsUpdates):
|
||||
""" compute stats and update/apply the new stats to the running average
|
||||
"""
|
||||
|
||||
def updateAccumStats():
|
||||
if self._full_stats_init:
|
||||
return tf.cond(tf.greater(self.sgd_step, self._cold_iter), lambda: tf.group(*self._apply_stats(statsUpdates, accumulate=True, accumulateCoeff=1. / self._stats_accum_iter)), tf.no_op)
|
||||
else:
|
||||
return tf.group(*self._apply_stats(statsUpdates, accumulate=True, accumulateCoeff=1. / self._stats_accum_iter))
|
||||
|
||||
def updateRunningAvgStats(statsUpdates, fac_iter=1):
|
||||
# return tf.cond(tf.greater_equal(self.factor_step,
|
||||
# tf.convert_to_tensor(fac_iter)), lambda:
|
||||
# tf.group(*self._apply_stats(stats_list, varlist)), tf.no_op)
|
||||
return tf.group(*self._apply_stats(statsUpdates))
|
||||
|
||||
if self._async_stats:
|
||||
# asynchronous stats update
|
||||
update_stats = self._apply_stats(statsUpdates)
|
||||
|
||||
queue = tf.FIFOQueue(1, [item.dtype for item in update_stats], shapes=[
|
||||
item.get_shape() for item in update_stats])
|
||||
enqueue_op = queue.enqueue(update_stats)
|
||||
|
||||
def dequeue_stats_op():
|
||||
return queue.dequeue()
|
||||
self.qr_stats = tf.train.QueueRunner(queue, [enqueue_op])
|
||||
update_stats_op = tf.cond(tf.equal(queue.size(), tf.convert_to_tensor(
|
||||
0)), tf.no_op, lambda: tf.group(*[dequeue_stats_op(), ]))
|
||||
else:
|
||||
# synchronous stats update
|
||||
update_stats_op = tf.cond(tf.greater_equal(
|
||||
self.stats_step, self._stats_accum_iter), lambda: updateRunningAvgStats(statsUpdates), updateAccumStats)
|
||||
self._update_stats_op = update_stats_op
|
||||
return update_stats_op
|
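The cond above selects between two running estimates of the same statistics. A rough NumPy sketch of the arithmetic, with made-up sizes and hyperparameters:

```python
import numpy as np

stat = np.zeros((4, 4))        # stands in for one KFAC_STATS variable
new = np.eye(4)                # freshly computed covariance for the current batch
stats_decay, stats_accum_iter = 0.95, 10

# cold phase (updateAccumStats): plain superbatch averaging over stats_accum_iter batches
stat += (1.0 / stats_accum_iter) * new

# warm phase (updateRunningAvgStats -> _apply_stats): exponential running average,
# i.e. the assign followed by assign_add above
stat = stat * stats_decay + (1.0 - stats_decay) * new
```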
||||
|
||||
def _apply_stats(self, statsUpdates, accumulate=False, accumulateCoeff=0.):
|
||||
updateOps = []
|
||||
# obtain the stats var list
|
||||
for stats_var in statsUpdates:
|
||||
stats_new = statsUpdates[stats_var]
|
||||
if accumulate:
|
||||
# simple superbatch averaging
|
||||
update_op = tf.assign_add(
|
||||
stats_var, accumulateCoeff * stats_new, use_locking=True)
|
||||
else:
|
||||
# exponential running averaging
|
||||
update_op = tf.assign(
|
||||
stats_var, stats_var * self._stats_decay, use_locking=True)
|
||||
update_op = tf.assign_add(
|
||||
update_op, (1. - self._stats_decay) * stats_new, use_locking=True)
|
||||
updateOps.append(update_op)
|
||||
|
||||
with tf.control_dependencies(updateOps):
|
||||
stats_step_op = tf.assign_add(self.stats_step, 1)
|
||||
|
||||
if KFAC_DEBUG:
|
||||
stats_step_op = (tf.Print(stats_step_op,
|
||||
[tf.convert_to_tensor('step:'),
|
||||
self.global_step,
|
||||
tf.convert_to_tensor('fac step:'),
|
||||
self.factor_step,
|
||||
tf.convert_to_tensor('sgd step:'),
|
||||
self.sgd_step,
|
||||
tf.convert_to_tensor('Accum:'),
|
||||
tf.convert_to_tensor(accumulate),
|
||||
tf.convert_to_tensor('Accum coeff:'),
|
||||
tf.convert_to_tensor(accumulateCoeff),
|
||||
tf.convert_to_tensor('stat step:'),
|
||||
self.stats_step, updateOps[0], updateOps[1]]))
|
||||
return [stats_step_op, ]
|
||||
|
||||
def getStatsEigen(self, stats=None):
|
||||
if len(self.stats_eigen) == 0:
|
||||
stats_eigen = {}
|
||||
if stats is None:
|
||||
stats = self.stats
|
||||
|
||||
tmpEigenCache = {}
|
||||
with tf.device('/cpu:0'):
|
||||
for var in stats:
|
||||
for key in ['fprop_concat_stats', 'bprop_concat_stats']:
|
||||
for stats_var in stats[var][key]:
|
||||
if stats_var not in tmpEigenCache:
|
||||
stats_dim = stats_var.get_shape()[1].value
|
||||
e = tf.Variable(tf.ones(
|
||||
[stats_dim]), name='KFAC_FAC/' + stats_var.name.split(':')[0] + '/e', trainable=False)
|
||||
Q = tf.Variable(tf.diag(tf.ones(
|
||||
[stats_dim])), name='KFAC_FAC/' + stats_var.name.split(':')[0] + '/Q', trainable=False)
|
||||
stats_eigen[stats_var] = {'e': e, 'Q': Q}
|
||||
tmpEigenCache[
|
||||
stats_var] = stats_eigen[stats_var]
|
||||
else:
|
||||
stats_eigen[stats_var] = tmpEigenCache[
|
||||
stats_var]
|
||||
self.stats_eigen = stats_eigen
|
||||
return self.stats_eigen
|
||||
|
||||
def computeStatsEigen(self):
|
||||
""" compute the eigen decomp using copied var stats to avoid concurrent read/write from other queue """
|
||||
# TO-DO: figure out why this op has delays (possibly moving
|
||||
# eigenvectors around?)
|
||||
with tf.device('/cpu:0'):
|
||||
def removeNone(tensor_list):
|
||||
local_list = []
|
||||
for item in tensor_list:
|
||||
if item is not None:
|
||||
local_list.append(item)
|
||||
return local_list
|
||||
|
||||
def copyStats(var_list):
|
||||
print("copying stats to buffer tensors before eigen decomp")
|
||||
redundant_stats = {}
|
||||
copied_list = []
|
||||
for item in var_list:
|
||||
if item is not None:
|
||||
if item not in redundant_stats:
|
||||
if self._use_float64:
|
||||
redundant_stats[item] = tf.cast(
|
||||
tf.identity(item), tf.float64)
|
||||
else:
|
||||
redundant_stats[item] = tf.identity(item)
|
||||
copied_list.append(redundant_stats[item])
|
||||
else:
|
||||
copied_list.append(None)
|
||||
return copied_list
|
||||
#stats = [copyStats(self.fStats), copyStats(self.bStats)]
|
||||
#stats = [self.fStats, self.bStats]
|
||||
|
||||
stats_eigen = self.stats_eigen
|
||||
computedEigen = {}
|
||||
eigen_reverse_lookup = {}
|
||||
updateOps = []
|
||||
# sync copied stats
|
||||
# with tf.control_dependencies(removeNone(stats[0]) +
|
||||
# removeNone(stats[1])):
|
||||
with tf.control_dependencies([]):
|
||||
for stats_var in stats_eigen:
|
||||
if stats_var not in computedEigen:
|
||||
eigens = tf.self_adjoint_eig(stats_var)
|
||||
e = eigens[0]
|
||||
Q = eigens[1]
|
||||
if self._use_float64:
|
||||
e = tf.cast(e, tf.float32)
|
||||
Q = tf.cast(Q, tf.float32)
|
||||
updateOps.append(e)
|
||||
updateOps.append(Q)
|
||||
computedEigen[stats_var] = {'e': e, 'Q': Q}
|
||||
eigen_reverse_lookup[e] = stats_eigen[stats_var]['e']
|
||||
eigen_reverse_lookup[Q] = stats_eigen[stats_var]['Q']
|
||||
|
||||
self.eigen_reverse_lookup = eigen_reverse_lookup
|
||||
self.eigen_update_list = updateOps
|
||||
|
||||
if KFAC_DEBUG:
|
||||
self.eigen_update_list = [item for item in updateOps]
|
||||
with tf.control_dependencies(updateOps):
|
||||
updateOps.append(tf.Print(tf.constant(
|
||||
0.), [tf.convert_to_tensor('computed factor eigen')]))
|
||||
|
||||
return updateOps
|
||||
|
||||
def applyStatsEigen(self, eigen_list):
|
||||
updateOps = []
|
||||
print(('updating %d eigenvalue/vectors' % len(eigen_list)))
|
||||
for i, (tensor, mark) in enumerate(zip(eigen_list, self.eigen_update_list)):
|
||||
stats_eigen_var = self.eigen_reverse_lookup[mark]
|
||||
updateOps.append(
|
||||
tf.assign(stats_eigen_var, tensor, use_locking=True))
|
||||
|
||||
with tf.control_dependencies(updateOps):
|
||||
factor_step_op = tf.assign_add(self.factor_step, 1)
|
||||
updateOps.append(factor_step_op)
|
||||
if KFAC_DEBUG:
|
||||
updateOps.append(tf.Print(tf.constant(
|
||||
0.), [tf.convert_to_tensor('updated kfac factors')]))
|
||||
return updateOps
|
||||
|
||||
def getKfacPrecondUpdates(self, gradlist, varlist):
|
||||
updatelist = []
|
||||
vg = 0.
|
||||
|
||||
assert len(self.stats) > 0
|
||||
assert len(self.stats_eigen) > 0
|
||||
assert len(self.factors) > 0
|
||||
counter = 0
|
||||
|
||||
grad_dict = {var: grad for grad, var in zip(gradlist, varlist)}
|
||||
|
||||
for grad, var in zip(gradlist, varlist):
|
||||
GRAD_RESHAPE = False
|
||||
GRAD_TRANSPOSE = False
|
||||
|
||||
fpropFactoredFishers = self.stats[var]['fprop_concat_stats']
|
||||
bpropFactoredFishers = self.stats[var]['bprop_concat_stats']
|
||||
|
||||
if (len(fpropFactoredFishers) + len(bpropFactoredFishers)) > 0:
|
||||
counter += 1
|
||||
GRAD_SHAPE = grad.get_shape()
|
||||
if len(grad.get_shape()) > 2:
|
||||
# reshape conv kernel parameters
|
||||
KW = int(grad.get_shape()[0])
|
||||
KH = int(grad.get_shape()[1])
|
||||
C = int(grad.get_shape()[2])
|
||||
D = int(grad.get_shape()[3])
|
||||
|
||||
if len(fpropFactoredFishers) > 1 and self._channel_fac:
|
||||
# reshape conv kernel parameters into tensor
|
||||
grad = tf.reshape(grad, [KW * KH, C, D])
|
||||
else:
|
||||
# reshape conv kernel parameters into 2D grad
|
||||
grad = tf.reshape(grad, [-1, D])
|
||||
GRAD_RESHAPE = True
|
||||
elif len(grad.get_shape()) == 1:
|
||||
# reshape bias or 1D parameters
|
||||
D = int(grad.get_shape()[0])
|
||||
|
||||
grad = tf.expand_dims(grad, 0)
|
||||
GRAD_RESHAPE = True
|
||||
else:
|
||||
# 2D parameters
|
||||
C = int(grad.get_shape()[0])
|
||||
D = int(grad.get_shape()[1])
|
||||
|
||||
if (self.stats[var]['assnBias'] is not None) and not self._blockdiag_bias:
|
||||
# using homogeneous coordinates only works for 2D grad.
|
||||
# TO-DO: figure out how to factorize bias grad
|
||||
# stack bias grad
|
||||
var_assnBias = self.stats[var]['assnBias']
|
||||
grad = tf.concat(
|
||||
[grad, tf.expand_dims(grad_dict[var_assnBias], 0)], 0)
|
||||
|
||||
# project gradient to eigen space and reshape the eigenvalues
|
||||
# for broadcasting
|
||||
eigVals = []
|
||||
|
||||
for idx, stats in enumerate(self.stats[var]['fprop_concat_stats']):
|
||||
Q = self.stats_eigen[stats]['Q']
|
||||
e = detectMinVal(self.stats_eigen[stats][
|
||||
'e'], var, name='act', debug=KFAC_DEBUG)
|
||||
|
||||
Q, e = factorReshape(Q, e, grad, facIndx=idx, ftype='act')
|
||||
eigVals.append(e)
|
||||
grad = gmatmul(Q, grad, transpose_a=True, reduce_dim=idx)
|
||||
|
||||
for idx, stats in enumerate(self.stats[var]['bprop_concat_stats']):
|
||||
Q = self.stats_eigen[stats]['Q']
|
||||
e = detectMinVal(self.stats_eigen[stats][
|
||||
'e'], var, name='grad', debug=KFAC_DEBUG)
|
||||
|
||||
Q, e = factorReshape(Q, e, grad, facIndx=idx, ftype='grad')
|
||||
eigVals.append(e)
|
||||
grad = gmatmul(grad, Q, transpose_b=False, reduce_dim=idx)
|
||||
##
|
||||
|
||||
#####
|
||||
# whiten using eigenvalues
|
||||
weightDecayCoeff = 0.
|
||||
if var in self._weight_decay_dict:
|
||||
weightDecayCoeff = self._weight_decay_dict[var]
|
||||
if KFAC_DEBUG:
|
||||
print(('weight decay coeff for %s is %f' % (var.name, weightDecayCoeff)))
|
||||
|
||||
if self._factored_damping:
|
||||
if KFAC_DEBUG:
|
||||
print(('use factored damping for %s' % (var.name)))
|
||||
coeffs = 1.
|
||||
num_factors = len(eigVals)
|
||||
# compute the ratio of the trace norms of the left and right
|
||||
# KFac matrices, and their generalization
|
||||
if len(eigVals) == 1:
|
||||
damping = self._epsilon + weightDecayCoeff
|
||||
else:
|
||||
damping = tf.pow(
|
||||
self._epsilon + weightDecayCoeff, 1. / num_factors)
|
||||
eigVals_tnorm_avg = [tf.reduce_mean(
|
||||
tf.abs(e)) for e in eigVals]
|
||||
for e, e_tnorm in zip(eigVals, eigVals_tnorm_avg):
|
||||
eig_tnorm_negList = [
|
||||
item for item in eigVals_tnorm_avg if item != e_tnorm]
|
||||
if len(eigVals) == 1:
|
||||
adjustment = 1.
|
||||
elif len(eigVals) == 2:
|
||||
adjustment = tf.sqrt(
|
||||
e_tnorm / eig_tnorm_negList[0])
|
||||
else:
|
||||
eig_tnorm_negList_prod = reduce(
|
||||
lambda x, y: x * y, eig_tnorm_negList)
|
||||
adjustment = tf.pow(
|
||||
tf.pow(e_tnorm, num_factors - 1.) / eig_tnorm_negList_prod, 1. / num_factors)
|
||||
coeffs *= (e + adjustment * damping)
|
||||
else:
|
||||
coeffs = 1.
|
||||
damping = (self._epsilon + weightDecayCoeff)
|
||||
for e in eigVals:
|
||||
coeffs *= e
|
||||
coeffs += damping
|
||||
|
||||
#grad = tf.Print(grad, [tf.convert_to_tensor('1'), tf.convert_to_tensor(var.name), grad.get_shape()])
|
||||
|
||||
grad /= coeffs
|
||||
|
||||
#grad = tf.Print(grad, [tf.convert_to_tensor('2'), tf.convert_to_tensor(var.name), grad.get_shape()])
|
||||
#####
|
||||
# project gradient back to euclidean space
|
||||
for idx, stats in enumerate(self.stats[var]['fprop_concat_stats']):
|
||||
Q = self.stats_eigen[stats]['Q']
|
||||
grad = gmatmul(Q, grad, transpose_a=False, reduce_dim=idx)
|
||||
|
||||
for idx, stats in enumerate(self.stats[var]['bprop_concat_stats']):
|
||||
Q = self.stats_eigen[stats]['Q']
|
||||
grad = gmatmul(grad, Q, transpose_b=True, reduce_dim=idx)
|
||||
##
|
||||
|
||||
#grad = tf.Print(grad, [tf.convert_to_tensor('3'), tf.convert_to_tensor(var.name), grad.get_shape()])
|
||||
if (self.stats[var]['assnBias'] is not None) and not self._blockdiag_bias:
|
||||
# using homogeneous coordinates only works for 2D grad.
|
||||
# TO-DO: figure out how to factorize bias grad
|
||||
# un-stack bias grad
|
||||
var_assnBias = self.stats[var]['assnBias']
|
||||
C_plus_one = int(grad.get_shape()[0])
|
||||
grad_assnBias = tf.reshape(tf.slice(grad,
|
||||
begin=[
|
||||
C_plus_one - 1, 0],
|
||||
size=[1, -1]), var_assnBias.get_shape())
|
||||
grad_assnWeights = tf.slice(grad,
|
||||
begin=[0, 0],
|
||||
size=[C_plus_one - 1, -1])
|
||||
grad_dict[var_assnBias] = grad_assnBias
|
||||
grad = grad_assnWeights
|
||||
|
||||
#grad = tf.Print(grad, [tf.convert_to_tensor('4'), tf.convert_to_tensor(var.name), grad.get_shape()])
|
||||
if GRAD_RESHAPE:
|
||||
grad = tf.reshape(grad, GRAD_SHAPE)
|
||||
|
||||
grad_dict[var] = grad
|
||||
|
||||
print(('projecting %d gradient matrices' % counter))
|
||||
|
||||
for g, var in zip(gradlist, varlist):
|
||||
grad = grad_dict[var]
|
||||
### clipping ###
|
||||
if KFAC_DEBUG:
|
||||
print(('apply clipping to %s' % (var.name)))
|
||||
tf.Print(grad, [tf.sqrt(tf.reduce_sum(tf.pow(grad, 2)))], "Euclidean norm of new grad")
|
||||
local_vg = tf.reduce_sum(grad * g * (self._lr * self._lr))
|
||||
vg += local_vg
|
||||
|
||||
# rescale everything
|
||||
if KFAC_DEBUG:
|
||||
print('apply vFv clipping')
|
||||
|
||||
scaling = tf.minimum(1., tf.sqrt(self._clip_kl / vg))
|
||||
if KFAC_DEBUG:
|
||||
scaling = tf.Print(scaling, [tf.convert_to_tensor(
|
||||
'clip: '), scaling, tf.convert_to_tensor(' vFv: '), vg])
|
||||
with tf.control_dependencies([tf.assign(self.vFv, vg)]):
|
||||
updatelist = [grad_dict[var] for var in varlist]
|
||||
for i, item in enumerate(updatelist):
|
||||
updatelist[i] = scaling * item
|
||||
|
||||
return updatelist
|
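The loop above applies the inverse of the Kronecker-factored Fisher in the eigenbasis of the factors: project the gradient into the eigenbasis, divide by the products of eigenvalues plus damping, and project back. A minimal dense-layer NumPy sketch of the non-factored-damping path, ignoring the homogeneous-coordinate bias handling and the vFv rescaling; all shapes are illustrative:

```python
import numpy as np

D_in, D_out, damping = 64, 10, 1e-2
A = np.random.randn(D_in, D_in);  A = A @ A.T / D_in     # activation covariance (fprop factor)
G = np.random.randn(D_out, D_out); G = G @ G.T / D_out   # gradient covariance (bprop factor)
grad = np.random.randn(D_in, D_out)                      # gradient of a D_in x D_out weight matrix

e_a, Q_a = np.linalg.eigh(A)
e_g, Q_g = np.linalg.eigh(G)

grad_eig = Q_a.T @ grad @ Q_g                            # project into the eigenbasis (the gmatmul calls above)
grad_eig /= np.outer(e_a, e_g) + damping                 # whiten using the eigenvalues, plus damping
precond_grad = Q_a @ grad_eig @ Q_g.T                    # project back to parameter space
```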
||||
|
||||
def compute_gradients(self, loss, var_list=None):
|
||||
varlist = var_list
|
||||
if varlist is None:
|
||||
varlist = tf.trainable_variables()
|
||||
g = tf.gradients(loss, varlist)
|
||||
|
||||
return [(a, b) for a, b in zip(g, varlist)]
|
||||
|
||||
def apply_gradients_kfac(self, grads):
|
||||
g, varlist = list(zip(*grads))
|
||||
|
||||
if len(self.stats_eigen) == 0:
|
||||
self.getStatsEigen()
|
||||
|
||||
qr = None
|
||||
# launch eigen-decomp on a queue thread
|
||||
if self._async:
|
||||
print('Use async eigen decomp')
|
||||
# get a list of factor loading tensors
|
||||
factorOps_dummy = self.computeStatsEigen()
|
||||
|
||||
# define a queue for the list of factor loading tensors
|
||||
queue = tf.FIFOQueue(1, [item.dtype for item in factorOps_dummy], shapes=[
|
||||
item.get_shape() for item in factorOps_dummy])
|
||||
enqueue_op = tf.cond(tf.logical_and(tf.equal(tf.mod(self.stats_step, self._kfac_update), tf.convert_to_tensor(
|
||||
0)), tf.greater_equal(self.stats_step, self._stats_accum_iter)), lambda: queue.enqueue(self.computeStatsEigen()), tf.no_op)
|
||||
|
||||
def dequeue_op():
|
||||
return queue.dequeue()
|
||||
|
||||
qr = tf.train.QueueRunner(queue, [enqueue_op])
|
||||
|
||||
updateOps = []
|
||||
global_step_op = tf.assign_add(self.global_step, 1)
|
||||
updateOps.append(global_step_op)
|
||||
|
||||
with tf.control_dependencies([global_step_op]):
|
||||
|
||||
# compute updates
|
||||
assert self._update_stats_op is not None
|
||||
updateOps.append(self._update_stats_op)
|
||||
dependency_list = []
|
||||
if not self._async:
|
||||
dependency_list.append(self._update_stats_op)
|
||||
|
||||
with tf.control_dependencies(dependency_list):
|
||||
def no_op_wrapper():
|
||||
return tf.group(*[tf.assign_add(self.cold_step, 1)])
|
||||
|
||||
if not self._async:
|
||||
# synchronous eigen-decomp updates
|
||||
updateFactorOps = tf.cond(tf.logical_and(tf.equal(tf.mod(self.stats_step, self._kfac_update),
|
||||
tf.convert_to_tensor(0)),
|
||||
tf.greater_equal(self.stats_step, self._stats_accum_iter)), lambda: tf.group(*self.applyStatsEigen(self.computeStatsEigen())), no_op_wrapper)
|
||||
else:
|
||||
# asynchronous eigen-decomp updates using queue
|
||||
updateFactorOps = tf.cond(tf.greater_equal(self.stats_step, self._stats_accum_iter),
|
||||
lambda: tf.cond(tf.equal(queue.size(), tf.convert_to_tensor(0)),
|
||||
tf.no_op,
|
||||
|
||||
lambda: tf.group(
|
||||
*self.applyStatsEigen(dequeue_op())),
|
||||
),
|
||||
no_op_wrapper)
|
||||
|
||||
updateOps.append(updateFactorOps)
|
||||
|
||||
with tf.control_dependencies([updateFactorOps]):
|
||||
def gradOp():
|
||||
return list(g)
|
||||
|
||||
def getKfacGradOp():
|
||||
return self.getKfacPrecondUpdates(g, varlist)
|
||||
u = tf.cond(tf.greater(self.factor_step,
|
||||
tf.convert_to_tensor(0)), getKfacGradOp, gradOp)
|
||||
|
||||
optim = tf.train.MomentumOptimizer(
|
||||
self._lr * (1. - self._momentum), self._momentum)
|
||||
#optim = tf.train.AdamOptimizer(self._lr, epsilon=0.01)
|
||||
|
||||
def optimOp():
|
||||
def updateOptimOp():
|
||||
if self._full_stats_init:
|
||||
return tf.cond(tf.greater(self.factor_step, tf.convert_to_tensor(0)), lambda: optim.apply_gradients(list(zip(u, varlist))), tf.no_op)
|
||||
else:
|
||||
return optim.apply_gradients(list(zip(u, varlist)))
|
||||
if self._full_stats_init:
|
||||
return tf.cond(tf.greater_equal(self.stats_step, self._stats_accum_iter), updateOptimOp, tf.no_op)
|
||||
else:
|
||||
return tf.cond(tf.greater_equal(self.sgd_step, self._cold_iter), updateOptimOp, tf.no_op)
|
||||
updateOps.append(optimOp())
|
||||
|
||||
return tf.group(*updateOps), qr
|
||||
|
||||
def apply_gradients(self, grads):
|
||||
coldOptim = tf.train.MomentumOptimizer(
|
||||
self._cold_lr, self._momentum)
|
||||
|
||||
def coldSGDstart():
|
||||
sgd_grads, sgd_var = zip(*grads)
|
||||
|
||||
if self.max_grad_norm is not None:
|
||||
sgd_grads, sgd_grad_norm = tf.clip_by_global_norm(sgd_grads,self.max_grad_norm)
|
||||
|
||||
sgd_grads = list(zip(sgd_grads,sgd_var))
|
||||
|
||||
sgd_step_op = tf.assign_add(self.sgd_step, 1)
|
||||
coldOptim_op = coldOptim.apply_gradients(sgd_grads)
|
||||
if KFAC_DEBUG:
|
||||
with tf.control_dependencies([sgd_step_op, coldOptim_op]):
|
||||
sgd_step_op = tf.Print(
|
||||
sgd_step_op, [self.sgd_step, tf.convert_to_tensor('doing cold sgd step')])
|
||||
return tf.group(*[sgd_step_op, coldOptim_op])
|
||||
|
||||
kfacOptim_op, qr = self.apply_gradients_kfac(grads)
|
||||
|
||||
def warmKFACstart():
|
||||
return kfacOptim_op
|
||||
|
||||
return tf.cond(tf.greater(self.sgd_step, self._cold_iter), warmKFACstart, coldSGDstart), qr
|
||||
|
||||
def minimize(self, loss, loss_sampled, var_list=None):
|
||||
grads = self.compute_gradients(loss, var_list=var_list)
|
||||
update_stats_op = self.compute_and_apply_stats(
|
||||
loss_sampled, var_list=var_list)
|
||||
return self.apply_gradients(grads)
|
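End of kfac.py. A rough, untested sketch of how the optimizer might be driven on a toy regression problem, mirroring the minimize/q_runner pattern that NeuralNetValueFunction uses later in this diff; the model, shapes and hyperparameters here are invented for illustration only:

```python
import numpy as np
import tensorflow as tf
from baselines.acktr import kfac

X = tf.placeholder(tf.float32, [None, 8])
y = tf.placeholder(tf.float32, [None])
h = tf.layers.dense(X, 32, activation=tf.nn.relu)
pred = tf.layers.dense(h, 1)[:, 0]

loss = tf.reduce_mean(tf.square(pred - y))
# "sampled" loss: targets drawn from the model's own predictive distribution,
# the same trick as sample_vpred_n in value_functions.py
loss_sampled = tf.reduce_mean(tf.square(pred - tf.stop_gradient(pred + tf.random_normal(tf.shape(pred)))))

optim = kfac.KfacOptimizer(learning_rate=1e-3, cold_lr=1e-4, momentum=0.9,
                           kfac_update=2, cold_iter=10, max_grad_norm=None)
update_op, q_runner = optim.minimize(loss, loss_sampled)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()
    if q_runner is not None:  # only present when async eigen-decomposition is enabled
        q_runner.create_threads(sess, coord=coord, start=True)
    xb = np.random.randn(64, 8).astype(np.float32)
    yb = np.random.randn(64).astype(np.float32)
    for _ in range(20):
        sess.run(update_op, {X: xb, y: yb})
```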
124
baselines/acktr/kfac_utils.py
Normal file
@@ -0,0 +1,124 @@
|
||||
import tensorflow as tf
|
||||
import numpy as np
|
||||
|
||||
|
||||
def gmatmul(a, b, transpose_a=False, transpose_b=False, reduce_dim=None):
|
||||
if reduce_dim is None:
|
||||
# general batch matmul
|
||||
if len(a.get_shape()) == 3 and len(b.get_shape()) == 3:
|
||||
return tf.batch_matmul(a, b, adj_x=transpose_a, adj_y=transpose_b)
|
||||
elif len(a.get_shape()) == 3 and len(b.get_shape()) == 2:
|
||||
if transpose_b:
|
||||
N = b.get_shape()[0].value
|
||||
else:
|
||||
N = b.get_shape()[1].value
|
||||
B = a.get_shape()[0].value
|
||||
if transpose_a:
|
||||
K = a.get_shape()[1].value
|
||||
a = tf.reshape(tf.transpose(a, [0, 2, 1]), [-1, K])
|
||||
else:
|
||||
K = a.get_shape()[-1].value
|
||||
a = tf.reshape(a, [-1, K])
|
||||
result = tf.matmul(a, b, transpose_b=transpose_b)
|
||||
result = tf.reshape(result, [B, -1, N])
|
||||
return result
|
||||
elif len(a.get_shape()) == 2 and len(b.get_shape()) == 3:
|
||||
if transpose_a:
|
||||
M = a.get_shape()[1].value
|
||||
else:
|
||||
M = a.get_shape()[0].value
|
||||
B = b.get_shape()[0].value
|
||||
if transpose_b:
|
||||
K = b.get_shape()[-1].value
|
||||
b = tf.transpose(tf.reshape(b, [-1, K]), [1, 0])
|
||||
else:
|
||||
K = b.get_shape()[1].value
|
||||
b = tf.transpose(tf.reshape(
|
||||
tf.transpose(b, [0, 2, 1]), [-1, K]), [1, 0])
|
||||
result = tf.matmul(a, b, transpose_a=transpose_a)
|
||||
result = tf.transpose(tf.reshape(result, [M, B, -1]), [1, 0, 2])
|
||||
return result
|
||||
else:
|
||||
return tf.matmul(a, b, transpose_a=transpose_a, transpose_b=transpose_b)
|
||||
else:
|
||||
# weird batch matmul
|
||||
if len(a.get_shape()) == 2 and len(b.get_shape()) > 2:
|
||||
# reshape reduce_dim to the left most dim in b
|
||||
b_shape = b.get_shape()
|
||||
if reduce_dim != 0:
|
||||
b_dims = list(range(len(b_shape)))
|
||||
b_dims.remove(reduce_dim)
|
||||
b_dims.insert(0, reduce_dim)
|
||||
b = tf.transpose(b, b_dims)
|
||||
b_t_shape = b.get_shape()
|
||||
b = tf.reshape(b, [int(b_shape[reduce_dim]), -1])
|
||||
result = tf.matmul(a, b, transpose_a=transpose_a,
|
||||
transpose_b=transpose_b)
|
||||
result = tf.reshape(result, b_t_shape)
|
||||
if reduce_dim != 0:
|
||||
b_dims = list(range(len(b_shape)))
|
||||
b_dims.remove(0)
|
||||
b_dims.insert(reduce_dim, 0)
|
||||
result = tf.transpose(result, b_dims)
|
||||
return result
|
||||
|
||||
elif len(a.get_shape()) > 2 and len(b.get_shape()) == 2:
|
||||
# reshape reduce_dim to the right most dim in a
|
||||
a_shape = a.get_shape()
|
||||
outter_dim = len(a_shape) - 1
|
||||
reduce_dim = len(a_shape) - reduce_dim - 1
|
||||
if reduce_dim != outter_dim:
|
||||
a_dims = list(range(len(a_shape)))
|
||||
a_dims.remove(reduce_dim)
|
||||
a_dims.insert(outter_dim, reduce_dim)
|
||||
a = tf.transpose(a, a_dims)
|
||||
a_t_shape = a.get_shape()
|
||||
a = tf.reshape(a, [-1, int(a_shape[reduce_dim])])
|
||||
result = tf.matmul(a, b, transpose_a=transpose_a,
|
||||
transpose_b=transpose_b)
|
||||
result = tf.reshape(result, a_t_shape)
|
||||
if reduce_dim != outter_dim:
|
||||
a_dims = list(range(len(a_shape)))
|
||||
a_dims.remove(outter_dim)
|
||||
a_dims.insert(reduce_dim, outter_dim)
|
||||
result = tf.transpose(result, a_dims)
|
||||
return result
|
||||
|
||||
elif len(a.get_shape()) == 2 and len(b.get_shape()) == 2:
|
||||
return tf.matmul(a, b, transpose_a=transpose_a, transpose_b=transpose_b)
|
||||
|
||||
assert False, 'something went wrong'
|
||||
|
||||
|
||||
def clipoutNeg(vec, threshold=1e-6):
|
||||
mask = tf.cast(vec > threshold, tf.float32)
|
||||
return mask * vec
|
||||
|
||||
|
||||
def detectMinVal(input_mat, var, threshold=1e-6, name='', debug=False):
|
||||
eigen_min = tf.reduce_min(input_mat)
|
||||
eigen_max = tf.reduce_max(input_mat)
|
||||
eigen_ratio = eigen_max / eigen_min
|
||||
input_mat_clipped = clipoutNeg(input_mat, threshold)
|
||||
|
||||
if debug:
|
||||
input_mat_clipped = tf.cond(tf.logical_or(tf.greater(eigen_ratio, 0.), tf.less(eigen_ratio, -500)), lambda: input_mat_clipped, lambda: tf.Print(
|
||||
input_mat_clipped, [tf.convert_to_tensor('screwed ratio ' + name + ' eigen values!!!'), tf.convert_to_tensor(var.name), eigen_min, eigen_max, eigen_ratio]))
|
||||
|
||||
return input_mat_clipped
|
||||
|
||||
|
||||
def factorReshape(Q, e, grad, facIndx=0, ftype='act'):
|
||||
grad_shape = grad.get_shape()
|
||||
if ftype == 'act':
|
||||
assert e.get_shape()[0] == grad_shape[facIndx]
|
||||
expanded_shape = [1, ] * len(grad_shape)
|
||||
expanded_shape[facIndx] = -1
|
||||
e = tf.reshape(e, expanded_shape)
|
||||
if ftype == 'grad':
|
||||
assert e.get_shape()[0] == grad_shape[len(grad_shape) - facIndx - 1]
|
||||
expanded_shape = [1, ] * len(grad_shape)
|
||||
expanded_shape[len(grad_shape) - facIndx - 1] = -1
|
||||
e = tf.reshape(e, expanded_shape)
|
||||
|
||||
return Q, e
|
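gmatmul's reduce_dim path contracts a 2D matrix against one chosen axis of a higher-rank tensor by transposing and reshaping around an ordinary matmul. A small NumPy check of what that corresponds to, with arbitrary shapes:

```python
import numpy as np

# gmatmul(a, b, reduce_dim=1) with a: [4, 4] and b: [2, 4, 3] contracts a against axis 1 of b
a = np.random.randn(4, 4)
b = np.random.randn(2, 4, 3)
out = np.einsum('ij,bjk->bik', a, b)   # what the transpose/reshape/matmul dance computes
print(out.shape)                        # (2, 4, 3)

# with transpose_a=True (the projection into the eigenbasis above) the contraction
# uses a's columns instead: np.einsum('ji,bjk->bik', a, b)
```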
80
baselines/acktr/policies.py
Normal file
@@ -0,0 +1,80 @@
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from baselines.acktr.utils import conv, fc, dense, conv_to_fc, sample, kl_div
|
||||
from baselines.common.distributions import make_pdtype
|
||||
import baselines.common.tf_util as U
|
||||
import gym
|
||||
|
||||
class CnnPolicy(object):
|
||||
|
||||
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
|
||||
nbatch = nenv*nsteps
|
||||
nh, nw, nc = ob_space.shape
|
||||
ob_shape = (nbatch, nh, nw, nc*nstack)
|
||||
nact = ac_space.n
|
||||
X = tf.placeholder(tf.uint8, ob_shape) #obs
|
||||
with tf.variable_scope("model", reuse=reuse):
|
||||
h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
|
||||
h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
|
||||
h3 = conv(h2, 'c3', nf=32, rf=3, stride=1, init_scale=np.sqrt(2))
|
||||
h3 = conv_to_fc(h3)
|
||||
h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
|
||||
pi = fc(h4, 'pi', nact, act=lambda x:x)
|
||||
vf = fc(h4, 'v', 1, act=lambda x:x)
|
||||
|
||||
v0 = vf[:, 0]
|
||||
a0 = sample(pi)
|
||||
self.initial_state = [] #not stateful
|
||||
|
||||
def step(ob, *_args, **_kwargs):
|
||||
a, v = sess.run([a0, v0], {X:ob})
|
||||
return a, v, [] #dummy state
|
||||
|
||||
def value(ob, *_args, **_kwargs):
|
||||
return sess.run(v0, {X:ob})
|
||||
|
||||
self.X = X
|
||||
self.pi = pi
|
||||
self.vf = vf
|
||||
self.step = step
|
||||
self.value = value
|
||||
|
||||
|
||||
class GaussianMlpPolicy(object):
|
||||
def __init__(self, ob_dim, ac_dim):
|
||||
# Here we'll construct a bunch of expressions, which will be used in two places:
|
||||
# (1) When sampling actions
|
||||
# (2) When computing loss functions, for the policy update
|
||||
# Variables specific to (1) have the word "sampled" in them,
|
||||
# whereas variables specific to (2) have the word "old" in them
|
||||
ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim*2], name="ob") # batch of observations
|
||||
oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac") # batch of previous actions
|
||||
oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim*2], name="oldac_dist") # batch of previous action distributions
|
||||
adv_n = tf.placeholder(tf.float32, shape=[None], name="adv") # advantage function estimate
|
||||
oldlogprob_n = tf.placeholder(tf.float32, shape=[None], name='oldlogprob') # log probability of previous actions
|
||||
wd_dict = {}
|
||||
h1 = tf.nn.tanh(dense(ob_no, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
|
||||
h2 = tf.nn.tanh(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
|
||||
mean_na = dense(h2, ac_dim, "mean", weight_init=U.normc_initializer(0.1), bias_init=0.0, weight_loss_dict=wd_dict) # Mean control output
|
||||
self.wd_dict = wd_dict
|
||||
self.logstd_1a = logstd_1a = tf.get_variable("logstd", [ac_dim], tf.float32, tf.zeros_initializer()) # Variance on outputs
|
||||
logstd_1a = tf.expand_dims(logstd_1a, 0)
|
||||
std_1a = tf.exp(logstd_1a)
|
||||
std_na = tf.tile(std_1a, [tf.shape(mean_na)[0], 1])
|
||||
ac_dist = tf.concat([tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(std_na, [-1, ac_dim])], 1)
|
||||
sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:,ac_dim:])) * ac_dist[:,ac_dim:] + ac_dist[:,:ac_dim] # This is the sampled action we'll perform.
|
||||
logprobsampled_n = - U.sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * U.sum(tf.square(ac_dist[:,:ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of sampled action
|
||||
logprob_n = - U.sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * U.sum(tf.square(ac_dist[:,:ac_dim] - oldac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy)
|
||||
kl = U.mean(kl_div(oldac_dist, ac_dist, ac_dim))
|
||||
#kl = .5 * U.mean(tf.square(logprob_n - oldlogprob_n)) # Approximation of KL divergence between old policy used to generate actions, and new policy used to compute logprob_n
|
||||
surr = - U.mean(adv_n * logprob_n) # Loss function that we'll differentiate to get the policy gradient
|
||||
surr_sampled = - U.mean(logprob_n) # Sampled loss of the policy
|
||||
self._act = U.function([ob_no], [sampled_ac_na, ac_dist, logprobsampled_n]) # Generate a new action and its logprob
|
||||
#self.compute_kl = U.function([ob_no, oldac_na, oldlogprob_n], kl) # Compute (approximate) KL divergence between old policy and new policy
|
||||
self.compute_kl = U.function([ob_no, oldac_dist], kl)
|
||||
self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled) # Input and output variables needed for computing loss
|
||||
U.initialize() # Initialize uninitialized TF variables
|
||||
|
||||
def act(self, ob):
|
||||
ac, ac_dist, logp = self._act(ob[None])
|
||||
return ac[0], ac_dist[0], logp[0]
|
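A short, hypothetical sketch of how CnnPolicy might be instantiated and stepped outside the A2C/ACKTR training loop; the observation/action spaces and batch sizes are illustrative only:

```python
import numpy as np
import tensorflow as tf
from gym import spaces
from baselines.acktr.policies import CnnPolicy

ob_space = spaces.Box(low=0, high=255, shape=(84, 84, 1))  # Atari-style single-channel frames
ac_space = spaces.Discrete(4)

sess = tf.Session()
policy = CnnPolicy(sess, ob_space, ac_space, nenv=2, nsteps=1, nstack=4)
sess.run(tf.global_variables_initializer())

obs = np.zeros((2, 84, 84, 4), dtype=np.uint8)  # nenv environments, nstack stacked frames
actions, values, _ = policy.step(obs)
print(actions.shape, values.shape)              # (2,) (2,)
```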
36
baselines/acktr/run_atari.py
Normal file
@@ -0,0 +1,36 @@
|
||||
#!/usr/bin/env python
|
||||
import os, logging, gym
|
||||
from baselines import logger
|
||||
from baselines.common import set_global_seeds
|
||||
from baselines import bench
|
||||
from baselines.acktr.acktr_disc import learn
|
||||
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
|
||||
from baselines.common.atari_wrappers import wrap_deepmind
|
||||
from baselines.acktr.policies import CnnPolicy
|
||||
|
||||
def train(env_id, num_timesteps, seed, num_cpu):
|
||||
num_timesteps //= 4
|
||||
|
||||
def make_env(rank):
|
||||
def _thunk():
|
||||
env = gym.make(env_id)
|
||||
env.seed(seed + rank)
|
||||
if logger.get_dir():
|
||||
env = bench.Monitor(env, os.path.join(logger.get_dir(), "{}.monitor.json".format(rank)))
|
||||
gym.logger.setLevel(logging.WARN)
|
||||
return wrap_deepmind(env)
|
||||
return _thunk
|
||||
|
||||
set_global_seeds(seed)
|
||||
env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
|
||||
|
||||
policy_fn = CnnPolicy
|
||||
learn(policy_fn, env, seed, total_timesteps=num_timesteps, nprocs=num_cpu)
|
||||
env.close()
|
||||
|
||||
def main():
|
||||
train('BreakoutNoFrameskip-v4', num_timesteps=int(40e6), seed=0, num_cpu=32)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
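A hypothetical way to drive the same train function for a different game with fewer workers (assuming the Atari dependencies are installed):

```python
from baselines.acktr.run_atari import train

train('PongNoFrameskip-v4', num_timesteps=int(10e6), seed=0, num_cpu=8)
```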
42
baselines/acktr/run_mujoco.py
Normal file
@@ -0,0 +1,42 @@
|
||||
#!/usr/bin/env python
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import tensorflow as tf
|
||||
import gym
|
||||
from baselines import logger
|
||||
from baselines.common import set_global_seeds
|
||||
from baselines import bench
|
||||
from baselines.acktr.acktr_cont import learn
|
||||
from baselines.acktr.policies import GaussianMlpPolicy
|
||||
from baselines.acktr.value_functions import NeuralNetValueFunction
|
||||
|
||||
def train(env_id, num_timesteps, seed):
|
||||
env=gym.make(env_id)
|
||||
if logger.get_dir():
|
||||
env = bench.Monitor(env, os.path.join(logger.get_dir(), "monitor.json"))
|
||||
set_global_seeds(seed)
|
||||
env.seed(seed)
|
||||
gym.logger.setLevel(logging.WARN)
|
||||
|
||||
with tf.Session(config=tf.ConfigProto()) as session:
|
||||
ob_dim = env.observation_space.shape[0]
|
||||
ac_dim = env.action_space.shape[0]
|
||||
with tf.variable_scope("vf"):
|
||||
vf = NeuralNetValueFunction(ob_dim, ac_dim)
|
||||
with tf.variable_scope("pi"):
|
||||
policy = GaussianMlpPolicy(ob_dim, ac_dim)
|
||||
|
||||
learn(env, policy=policy, vf=vf,
|
||||
gamma=0.99, lam=0.97, timesteps_per_batch=2500,
|
||||
desired_kl=0.002,
|
||||
num_timesteps=num_timesteps, animate=False)
|
||||
|
||||
env.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
parser = argparse.ArgumentParser(description='Run Mujoco benchmark.')
|
||||
parser.add_argument('--env_id', type=str, default="Reacher-v1")
|
||||
args = parser.parse_args()
|
||||
train(args.env_id, num_timesteps=1e6, seed=1)
|
46
baselines/acktr/running_stat.py
Normal file
@@ -0,0 +1,46 @@
|
||||
import numpy as np
|
||||
|
||||
# http://www.johndcook.com/blog/standard_deviation/
|
||||
class RunningStat(object):
|
||||
def __init__(self, shape):
|
||||
self._n = 0
|
||||
self._M = np.zeros(shape)
|
||||
self._S = np.zeros(shape)
|
||||
def push(self, x):
|
||||
x = np.asarray(x)
|
||||
assert x.shape == self._M.shape
|
||||
self._n += 1
|
||||
if self._n == 1:
|
||||
self._M[...] = x
|
||||
else:
|
||||
oldM = self._M.copy()
|
||||
self._M[...] = oldM + (x - oldM)/self._n
|
||||
self._S[...] = self._S + (x - oldM)*(x - self._M)
|
||||
@property
|
||||
def n(self):
|
||||
return self._n
|
||||
@property
|
||||
def mean(self):
|
||||
return self._M
|
||||
@property
|
||||
def var(self):
|
||||
return self._S/(self._n - 1) if self._n > 1 else np.square(self._M)
|
||||
@property
|
||||
def std(self):
|
||||
return np.sqrt(self.var)
|
||||
@property
|
||||
def shape(self):
|
||||
return self._M.shape
|
||||
|
||||
def test_running_stat():
|
||||
for shp in ((), (3,), (3,4)):
|
||||
li = []
|
||||
rs = RunningStat(shp)
|
||||
for _ in range(5):
|
||||
val = np.random.randn(*shp)
|
||||
rs.push(val)
|
||||
li.append(val)
|
||||
m = np.mean(li, axis=0)
|
||||
assert np.allclose(rs.mean, m)
|
||||
v = np.square(m) if (len(li) == 1) else np.var(li, ddof=1, axis=0)
|
||||
assert np.allclose(rs.var, v)
|
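RunningStat keeps a Welford-style streaming mean and variance. A quick sanity check one could run against NumPy, assuming the module path above:

```python
import numpy as np
from baselines.acktr.running_stat import RunningStat

rs = RunningStat(shape=(3,))
data = np.random.randn(100, 3)
for x in data:
    rs.push(x)

print(np.allclose(rs.mean, data.mean(axis=0)))         # True
print(np.allclose(rs.var, data.var(axis=0, ddof=1)))   # True
```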
200
baselines/acktr/utils.py
Normal file
@@ -0,0 +1,200 @@
|
||||
import os
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
import baselines.common.tf_util as U
|
||||
from collections import deque
|
||||
|
||||
def sample(logits):
|
||||
noise = tf.random_uniform(tf.shape(logits))
|
||||
return tf.argmax(logits - tf.log(-tf.log(noise)), 1)
|
||||
|
||||
def std(x):
|
||||
mean = tf.reduce_mean(x)
|
||||
var = tf.reduce_mean(tf.square(x-mean))
|
||||
return tf.sqrt(var)
|
||||
|
||||
def cat_entropy(logits):
|
||||
a0 = logits - tf.reduce_max(logits, 1, keep_dims=True)
|
||||
ea0 = tf.exp(a0)
|
||||
z0 = tf.reduce_sum(ea0, 1, keep_dims=True)
|
||||
p0 = ea0 / z0
|
||||
return tf.reduce_sum(p0 * (tf.log(z0) - a0), 1)
|
||||
|
||||
def cat_entropy_softmax(p0):
|
||||
return - tf.reduce_sum(p0 * tf.log(p0 + 1e-6), axis = 1)
|
||||
|
||||
def mse(pred, target):
|
||||
return tf.square(pred-target)/2.
|
||||
|
||||
def ortho_init(scale=1.0):
|
||||
def _ortho_init(shape, dtype, partition_info=None):
|
||||
#lasagne ortho init for tf
|
||||
shape = tuple(shape)
|
||||
if len(shape) == 2:
|
||||
flat_shape = shape
|
||||
elif len(shape) == 4: # assumes NHWC
|
||||
flat_shape = (np.prod(shape[:-1]), shape[-1])
|
||||
else:
|
||||
raise NotImplementedError
|
||||
a = np.random.normal(0.0, 1.0, flat_shape)
|
||||
u, _, v = np.linalg.svd(a, full_matrices=False)
|
||||
q = u if u.shape == flat_shape else v # pick the one with the correct shape
|
||||
q = q.reshape(shape)
|
||||
return (scale * q[:shape[0], :shape[1]]).astype(np.float32)
|
||||
return _ortho_init
|
||||
|
||||
def conv(x, scope, nf, rf, stride, pad='VALID', act=tf.nn.relu, init_scale=1.0):
|
||||
with tf.variable_scope(scope):
|
||||
nin = x.get_shape()[3].value
|
||||
w = tf.get_variable("w", [rf, rf, nin, nf], initializer=ortho_init(init_scale))
|
||||
b = tf.get_variable("b", [nf], initializer=tf.constant_initializer(0.0))
|
||||
z = tf.nn.conv2d(x, w, strides=[1, stride, stride, 1], padding=pad)+b
|
||||
h = act(z)
|
||||
return h
|
||||
|
||||
def fc(x, scope, nh, act=tf.nn.relu, init_scale=1.0):
|
||||
with tf.variable_scope(scope):
|
||||
nin = x.get_shape()[1].value
|
||||
w = tf.get_variable("w", [nin, nh], initializer=ortho_init(init_scale))
|
||||
b = tf.get_variable("b", [nh], initializer=tf.constant_initializer(0.0))
|
||||
z = tf.matmul(x, w)+b
|
||||
h = act(z)
|
||||
return h
|
||||
|
||||
def dense(x, size, name, weight_init=None, bias_init=0, weight_loss_dict=None, reuse=None):
|
||||
with tf.variable_scope(name, reuse=reuse):
|
||||
assert (len(U.scope_name().split('/')) == 2)
|
||||
|
||||
w = tf.get_variable("w", [x.get_shape()[1], size], initializer=weight_init)
|
||||
b = tf.get_variable("b", [size], initializer=tf.constant_initializer(bias_init))
|
||||
weight_decay_fc = 3e-4
|
||||
|
||||
if weight_loss_dict is not None:
|
||||
weight_decay = tf.multiply(tf.nn.l2_loss(w), weight_decay_fc, name='weight_decay_loss')
|
||||
if weight_loss_dict is not None:
|
||||
weight_loss_dict[w] = weight_decay_fc
|
||||
weight_loss_dict[b] = 0.0
|
||||
|
||||
tf.add_to_collection(U.scope_name().split('/')[0] + '_' + 'losses', weight_decay)
|
||||
|
||||
return tf.nn.bias_add(tf.matmul(x, w), b)
|
||||
|
||||
def conv_to_fc(x):
|
||||
nh = np.prod([v.value for v in x.get_shape()[1:]])
|
||||
x = tf.reshape(x, [-1, nh])
|
||||
return x
|
||||
|
||||
def kl_div(action_dist1, action_dist2, action_size):
|
||||
mean1, std1 = action_dist1[:, :action_size], action_dist1[:, action_size:]
|
||||
mean2, std2 = action_dist2[:, :action_size], action_dist2[:, action_size:]
|
||||
|
||||
numerator = tf.square(mean1 - mean2) + tf.square(std1) - tf.square(std2)
|
||||
denominator = 2 * tf.square(std2) + 1e-8
|
||||
return tf.reduce_sum(
|
||||
numerator/denominator + tf.log(std2) - tf.log(std1),reduction_indices=-1)
|
||||
|
||||
def discount_with_dones(rewards, dones, gamma):
|
||||
discounted = []
|
||||
r = 0
|
||||
for reward, done in zip(rewards[::-1], dones[::-1]):
|
||||
r = reward + gamma*r*(1.-done) # fixed off by one bug
|
||||
discounted.append(r)
|
||||
return discounted[::-1]
|
||||
|
||||
def find_trainable_variables(key):
|
||||
with tf.variable_scope(key):
|
||||
return tf.trainable_variables()
|
||||
|
||||
def make_path(f):
|
||||
return os.makedirs(f, exist_ok=True)
|
||||
|
||||
def constant(p):
|
||||
return 1
|
||||
|
||||
def linear(p):
|
||||
return 1-p
|
||||
|
||||
|
||||
def middle_drop(p):
|
||||
eps = 0.75
|
||||
if 1-p<eps:
|
||||
return eps*0.1
|
||||
return 1-p
|
||||
|
||||
def double_linear_con(p):
|
||||
p *= 2
|
||||
eps = 0.125
|
||||
if 1-p<eps:
|
||||
return eps
|
||||
return 1-p
|
||||
|
||||
|
||||
def double_middle_drop(p):
|
||||
eps1 = 0.75
|
||||
eps2 = 0.25
|
||||
if 1-p<eps1:
|
||||
if 1-p<eps2:
|
||||
return eps2*0.5
|
||||
return eps1*0.1
|
||||
return 1-p
|
||||
|
||||
|
||||
schedules = {
|
||||
'linear':linear,
|
||||
'constant':constant,
|
||||
'double_linear_con':double_linear_con,
|
||||
'middle_drop':middle_drop,
|
||||
'double_middle_drop':double_middle_drop
|
||||
}
|
||||
|
||||
class Scheduler(object):
|
||||
|
||||
def __init__(self, v, nvalues, schedule):
|
||||
self.n = 0.
|
||||
self.v = v
|
||||
self.nvalues = nvalues
|
||||
self.schedule = schedules[schedule]
|
||||
|
||||
def value(self):
|
||||
current_value = self.v*self.schedule(self.n/self.nvalues)
|
||||
self.n += 1.
|
||||
return current_value
|
||||
|
||||
def value_steps(self, steps):
|
||||
return self.v*self.schedule(steps/self.nvalues)
|
||||
|
||||
|
||||
class EpisodeStats:
|
||||
def __init__(self, nsteps, nenvs):
|
||||
self.episode_rewards = []
|
||||
for i in range(nenvs):
|
||||
self.episode_rewards.append([])
|
||||
self.lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths
|
||||
self.rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards
|
||||
self.nsteps = nsteps
|
||||
self.nenvs = nenvs
|
||||
|
||||
def feed(self, rewards, masks):
|
||||
rewards = np.reshape(rewards, [self.nenvs, self.nsteps])
|
||||
masks = np.reshape(masks, [self.nenvs, self.nsteps])
|
||||
for i in range(0, self.nenvs):
|
||||
for j in range(0, self.nsteps):
|
||||
self.episode_rewards[i].append(rewards[i][j])
|
||||
if masks[i][j]:
|
||||
l = len(self.episode_rewards[i])
|
||||
s = sum(self.episode_rewards[i])
|
||||
self.lenbuffer.append(l)
|
||||
self.rewbuffer.append(s)
|
||||
self.episode_rewards[i] = []
|
||||
|
||||
def mean_length(self):
|
||||
if self.lenbuffer:
|
||||
return np.mean(self.lenbuffer)
|
||||
else:
|
||||
return 0 # on the first params dump, no episodes are finished
|
||||
|
||||
def mean_reward(self):
|
||||
if self.rewbuffer:
|
||||
return np.mean(self.rewbuffer)
|
||||
else:
|
||||
return 0
|
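Two small usage sketches for the helpers above, with made-up numbers: Scheduler anneals the base value according to the chosen schedule each time value() is called, and discount_with_dones resets the discounted return at episode boundaries:

```python
from baselines.acktr.utils import Scheduler, discount_with_dones

lr = Scheduler(v=7e-4, nvalues=100, schedule='linear')
print([lr.value() for _ in range(3)])   # ~[0.0007, 0.000693, 0.000686], decaying toward zero

# the done flag at index 2 stops the discounting from leaking across episodes
rewards, dones = [1., 1., 1., 1.], [0, 0, 1, 0]
print(discount_with_dones(rewards, dones, gamma=0.99))
# [2.9701, 1.99, 1.0, 1.0]
```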
50
baselines/acktr/value_functions.py
Normal file
@@ -0,0 +1,50 @@
|
||||
from baselines import logger
|
||||
import numpy as np
|
||||
from baselines import common
|
||||
from baselines.common import tf_util as U
|
||||
import tensorflow as tf
|
||||
from baselines.acktr import kfac
|
||||
from baselines.acktr.utils import dense
|
||||
|
||||
class NeuralNetValueFunction(object):
|
||||
def __init__(self, ob_dim, ac_dim): #pylint: disable=W0613
|
||||
X = tf.placeholder(tf.float32, shape=[None, ob_dim*2+ac_dim*2+2]) # batch of observations
|
||||
vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg')
|
||||
wd_dict = {}
|
||||
h1 = tf.nn.elu(dense(X, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
|
||||
h2 = tf.nn.elu(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
|
||||
vpred_n = dense(h2, 1, "hfinal", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)[:,0]
|
||||
sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
|
||||
wd_loss = tf.get_collection("vf_losses", None)
|
||||
loss = U.mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
|
||||
loss_sampled = U.mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))
|
||||
self._predict = U.function([X], vpred_n)
|
||||
optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9, \
|
||||
clip_kl=0.3, epsilon=0.1, stats_decay=0.95, \
|
||||
async=1, kfac_update=2, cold_iter=50, \
|
||||
weight_decay_dict=wd_dict, max_grad_norm=None)
|
||||
vf_var_list = []
|
||||
for var in tf.trainable_variables():
|
||||
if "vf" in var.name:
|
||||
vf_var_list.append(var)
|
||||
|
||||
update_op, self.q_runner = optim.minimize(loss, loss_sampled, var_list=vf_var_list)
|
||||
self.do_update = U.function([X, vtarg_n], update_op) #pylint: disable=E1101
|
||||
U.initialize() # Initialize uninitialized TF variables
|
||||
def _preproc(self, path):
|
||||
l = pathlength(path)
|
||||
al = np.arange(l).reshape(-1,1)/10.0
|
||||
act = path["action_dist"].astype('float32')
|
||||
X = np.concatenate([path['observation'], act, al, np.ones((l, 1))], axis=1)
|
||||
return X
|
||||
def predict(self, path):
|
||||
return self._predict(self._preproc(path))
|
||||
def fit(self, paths, targvals):
|
||||
X = np.concatenate([self._preproc(p) for p in paths])
|
||||
y = np.concatenate(targvals)
|
||||
logger.record_tabular("EVBefore", common.explained_variance(self._predict(X), y))
|
||||
for _ in range(25): self.do_update(X, y)
|
||||
logger.record_tabular("EVAfter", common.explained_variance(self._predict(X), y))
|
||||
|
||||
def pathlength(path):
|
||||
return path["reward"].shape[0]
|
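The value-function input built by _preproc concatenates the observation, the action distribution, a scaled timestep and a constant column, which is where the ob_dim*2 + ac_dim*2 + 2 placeholder width comes from. A small NumPy sketch with a hypothetical path dict and Reacher-like sizes:

```python
import numpy as np

ob_dim, ac_dim, T = 11, 2, 5   # illustrative sizes and episode length
path = {
    'observation': np.random.randn(T, ob_dim * 2).astype('float32'),
    'action_dist': np.random.randn(T, ac_dim * 2).astype('float32'),
    'reward': np.random.randn(T).astype('float32'),
}

l = path['reward'].shape[0]
al = np.arange(l).reshape(-1, 1) / 10.0
X = np.concatenate([path['observation'], path['action_dist'], al, np.ones((l, 1))], axis=1)
print(X.shape)   # (5, 28) == (T, ob_dim*2 + ac_dim*2 + 2)
```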
@@ -22,6 +22,13 @@ def get_task(benchmark, env_id):
|
||||
"""Get a task by env_id. Return None if the benchmark doesn't have the env"""
|
||||
return next(filter(lambda task: task['env_id'] == env_id, benchmark['tasks']), None)
|
||||
|
||||
def find_task_for_env_id_in_any_benchmark(env_id):
|
||||
for bm in _BENCHMARKS:
|
||||
for task in bm["tasks"]:
|
||||
if task["env_id"]==env_id:
|
||||
return bm, task
|
||||
return None, None
|
||||
|
||||
_ATARI_SUFFIX = 'NoFrameskip-v4'
|
||||
|
||||
register_benchmark({
|
||||
@@ -49,30 +56,61 @@ register_benchmark({
|
||||
})
|
||||
|
||||
|
||||
# MuJoCo
|
||||
|
||||
_mujocosmall = [
|
||||
'InvertedDoublePendulum-v1', 'InvertedPendulum-v1',
|
||||
'HalfCheetah-v1', 'Hopper-v1', 'Walker2d-v1',
|
||||
'Reacher-v1', 'Swimmer-v1']
|
||||
|
||||
register_benchmark({
|
||||
'name' : 'Mujoco1M',
|
||||
'description' : 'Some small 2D MuJoCo tasks, run for 1M timesteps',
|
||||
'tasks' : [{'env_id' : _envid, 'trials' : 3, 'num_timesteps' : int(1e6)} for _envid in _mujocosmall]
|
||||
})
|
||||
|
||||
_roboschool_mujoco = [
|
||||
'RoboschoolInvertedDoublePendulum-v0', 'RoboschoolInvertedPendulum-v0', # cartpole
|
||||
'RoboschoolHalfCheetah-v0', 'RoboschoolHopper-v0', 'RoboschoolWalker2d-v0', # forward walkers
|
||||
'RoboschoolReacher-v0'
|
||||
register_benchmark({
|
||||
'name' : 'MujocoWalkers',
|
||||
'description' : 'MuJoCo forward walkers, run for 8M, humanoid 100M',
|
||||
'tasks' : [
|
||||
{'env_id' : "Hopper-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
|
||||
{'env_id' : "Walker2d-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
|
||||
{'env_id' : "Humanoid-v1", 'trials' : 4, 'num_timesteps' : 100*1000000 },
|
||||
]
|
||||
})
|
||||
# To reproduce:
|
||||
# python3 baselines/baselines/ppo2/ppo2_run_benchmark.py gce MujocoWalkers myrun_ppo2_whiteobs1_cpu8
|
||||
# (observation input filters necessary)
|
||||
|
||||
|
||||
# Roboschool
|
||||
|
||||
register_benchmark({
|
||||
'name' : 'RoboschoolMujoco2M',
|
||||
'description' : 'Same small 2D tasks, still improving up to 2M',
|
||||
'tasks' : [{'env_id' : _envid, 'trials' : 3, 'num_timesteps' : int(2e6)} for _envid in _roboschool_mujoco]
|
||||
'name' : 'Roboschool8M',
|
||||
'description' : 'Small 2D tasks, up to 30 minutes to complete on 8 cores',
|
||||
'tasks' : [
|
||||
{'env_id' : "RoboschoolReacher-v1", 'trials' : 4, 'num_timesteps' : 2*1000000 },
|
||||
{'env_id' : "RoboschoolAnt-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
|
||||
{'env_id' : "RoboschoolHalfCheetah-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
|
||||
{'env_id' : "RoboschoolHopper-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
|
||||
{'env_id' : "RoboschoolWalker2d-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
|
||||
]
|
||||
})
|
||||
register_benchmark({
|
||||
'name' : 'RoboschoolHarder',
|
||||
'description' : 'Test your might!!! Up to 12 hours on 32 cores',
|
||||
'tasks' : [
|
||||
{'env_id' : "RoboschoolHumanoid-v1", 'trials' : 4, 'num_timesteps' : 100*1000000 },
|
||||
{'env_id' : "RoboschoolHumanoidFlagrun-v1", 'trials' : 4, 'num_timesteps' : 200*1000000 },
|
||||
{'env_id' : "RoboschoolHumanoidFlagrunHarder-v1", 'trials' : 4, 'num_timesteps' : 400*1000000 },
|
||||
]
|
||||
})
|
||||
# To reproduce:
|
||||
# python3 baselines/baselines/ppo2/ppo2_run_benchmark.py gce Roboschool8M myrun_ppo2_cpu8
|
||||
# python3 baselines/baselines/ppo2/ppo2_run_benchmark.py gce RoboschoolHarder myrun_ppo2_cpu32_large_samples65536
|
||||
# (Large network, train on 65536 samples each iteration. Also, _large is really necessary only for Harder)
|
||||
|
||||
|
||||
# Other
|
||||
|
||||
_atari50 = [ # actually 49
|
||||
'Alien', 'Amidar', 'Assault', 'Asterix', 'Asteroids',
|
||||
'Atlantis', 'BankHeist', 'BattleZone', 'BeamRider', 'Bowling',
|
||||
@@ -91,3 +129,12 @@ register_benchmark({
|
||||
'description' :'7 Atari games from Mnih et al. (2013), with pixel observations, 40M frames',
|
||||
'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 3, 'num_timesteps' : int(40e6)} for _game in _atari50]
|
||||
})
|
||||
|
||||
def env_shortname(s):
|
||||
"Make typical names above shorter, while keeping recognizable"
|
||||
s = s.replace("NoFrameskip", "")
|
||||
if s[:10]=="Roboschool": s = s[10:]
|
||||
i = s.rfind("-v")
|
||||
if i!=-1: s = s[:i]
|
||||
|
||||
return s.lower()
|
||||
|
@@ -117,7 +117,7 @@ class LoadMonitorResultsError(Exception):
|
||||
def get_monitor_files(dir):
|
||||
return glob(path.join(dir, "*" + Monitor.EXT))
|
||||
|
||||
def load_results(dir):
|
||||
def load_results(dir, raw_episodes=False):
|
||||
fnames = get_monitor_files(dir)
|
||||
if not fnames:
|
||||
raise LoadMonitorResultsError("no monitor files of the form *%s found in %s" % (Monitor.EXT, dir))
|
||||
@@ -137,10 +137,13 @@ def load_results(dir):
|
||||
for header in headers[1:]:
|
||||
assert header['env_id'] == header0['env_id'], "mixing data from two envs"
|
||||
episodes = sorted(episodes, key=lambda e: e['abstime'])
|
||||
return {
|
||||
'env_info': {'env_id': header0['env_id'], 'gym_version': header0['gym_version']},
|
||||
'episode_end_times': [e['abstime'] for e in episodes],
|
||||
'episode_lengths': [e['l'] for e in episodes],
|
||||
'episode_rewards': [e['r'] for e in episodes],
|
||||
'initial_reset_time': min([min(header['t_start'] for header in headers)])
|
||||
}
|
||||
if raw_episodes:
|
||||
return episodes
|
||||
else:
|
||||
return {
|
||||
'env_info': {'env_id': header0['env_id'], 'gym_version': header0['gym_version']},
|
||||
'episode_end_times': [e['abstime'] for e in episodes],
|
||||
'episode_lengths': [e['l'] for e in episodes],
|
||||
'episode_rewards': [e['r'] for e in episodes],
|
||||
'initial_reset_time': min([min(header['t_start'] for header in headers)])
|
||||
}
|
||||
|
@@ -108,7 +108,7 @@ class BernoulliPdType(PdType):
|
||||
# def flatparam(self):
|
||||
# return self.logits
|
||||
# def mode(self):
|
||||
# return U.argmax(self.logits, axis=1)
|
||||
# return U.argmax(self.logits, axis=-1)
|
||||
# def logp(self, x):
|
||||
# return -tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, x)
|
||||
# def kl(self, other):
|
||||
@@ -118,7 +118,7 @@ class BernoulliPdType(PdType):
|
||||
# return tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps)
|
||||
# def sample(self):
|
||||
# u = tf.random_uniform(tf.shape(self.logits))
|
||||
# return U.argmax(self.logits - tf.log(-tf.log(u)), axis=1)
|
||||
# return U.argmax(self.logits - tf.log(-tf.log(u)), axis=-1)
|
||||
|
||||
class CategoricalPd(Pd):
|
||||
def __init__(self, logits):
|
||||
@@ -126,27 +126,33 @@ class CategoricalPd(Pd):
    def flatparam(self):
        return self.logits
    def mode(self):
        return U.argmax(self.logits, axis=1)
        return U.argmax(self.logits, axis=-1)
    def neglogp(self, x):
        return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x)
        # return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x)
        # Note: we can't use sparse_softmax_cross_entropy_with_logits because
        # the implementation does not allow second-order derivatives...
        one_hot_actions = tf.one_hot(x, self.logits.get_shape().as_list()[-1])
        return tf.nn.softmax_cross_entropy_with_logits(
            logits=self.logits,
            labels=one_hot_actions)
    def kl(self, other):
        a0 = self.logits - U.max(self.logits, axis=1, keepdims=True)
        a1 = other.logits - U.max(other.logits, axis=1, keepdims=True)
        a0 = self.logits - U.max(self.logits, axis=-1, keepdims=True)
        a1 = other.logits - U.max(other.logits, axis=-1, keepdims=True)
        ea0 = tf.exp(a0)
        ea1 = tf.exp(a1)
        z0 = U.sum(ea0, axis=1, keepdims=True)
        z1 = U.sum(ea1, axis=1, keepdims=True)
        z0 = U.sum(ea0, axis=-1, keepdims=True)
        z1 = U.sum(ea1, axis=-1, keepdims=True)
        p0 = ea0 / z0
        return U.sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=1)
        return U.sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=-1)
    def entropy(self):
        a0 = self.logits - U.max(self.logits, axis=1, keepdims=True)
        a0 = self.logits - U.max(self.logits, axis=-1, keepdims=True)
        ea0 = tf.exp(a0)
        z0 = U.sum(ea0, axis=1, keepdims=True)
        z0 = U.sum(ea0, axis=-1, keepdims=True)
        p0 = ea0 / z0
        return U.sum(p0 * (tf.log(z0) - a0), axis=1)
        return U.sum(p0 * (tf.log(z0) - a0), axis=-1)
    def sample(self):
        u = tf.random_uniform(tf.shape(self.logits))
        return tf.argmax(self.logits - tf.log(-tf.log(u)), axis=1)
        return tf.argmax(self.logits - tf.log(-tf.log(u)), axis=-1)
    @classmethod
    def fromflat(cls, flat):
        return cls(flat)
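For reference (not part of the diff): besides switching every reduction to `axis=-1`, this hunk routes `neglogp` through a dense one-hot cross-entropy because, per the inline comment, the sparse op does not support second-order derivatives (plausibly needed for ACKTR's natural-gradient computations). Writing `a = logits - max(logits)`, `Z = sum(exp(a))` and `p = exp(a) / Z`, the remaining methods implement the stable forms

```latex
H(p) = -\sum_i p_i \log p_i = \sum_i p_i\,(\log Z - a_i),
\qquad
\mathrm{KL}(p\,\|\,q) = \sum_i p_i\,\big(a_i - \log Z_p - b_i + \log Z_q\big)
```

with `b`, `Z_q` the analogous quantities for `other.logits`; `sample()` uses the Gumbel-max trick, `argmax_i(logit_i - log(-log(u_i)))` with `u_i ~ Uniform(0,1)`, which draws exactly from `softmax(logits)`.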
@@ -177,7 +183,7 @@ class MultiCategoricalPd(Pd):
class DiagGaussianPd(Pd):
    def __init__(self, flat):
        self.flat = flat
        mean, logstd = tf.split(axis=len(flat.get_shape()) - 1, num_or_size_splits=2, value=flat)
        mean, logstd = tf.split(axis=len(flat.shape)-1, num_or_size_splits=2, value=flat)
        self.mean = mean
        self.logstd = logstd
        self.std = tf.exp(logstd)
@@ -186,14 +192,14 @@ class DiagGaussianPd(Pd):
    def mode(self):
        return self.mean
    def neglogp(self, x):
        return 0.5 * U.sum(tf.square((x - self.mean) / self.std), axis=len(x.get_shape()) - 1) \
        return 0.5 * U.sum(tf.square((x - self.mean) / self.std), axis=-1) \
               + 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[-1]) \
               + U.sum(self.logstd, axis=len(x.get_shape()) - 1)
               + U.sum(self.logstd, axis=-1)
    def kl(self, other):
        assert isinstance(other, DiagGaussianPd)
        return U.sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / (2.0 * tf.square(other.std)) - 0.5, axis=-1)
    def entropy(self):
        return U.sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), -1)
        return U.sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), axis=-1)
    def sample(self):
        return self.mean + self.std * tf.random_normal(tf.shape(self.mean))
    @classmethod
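Likewise for the diagonal Gaussian (reference only, not in the diff): with per-dimension mean `mu_i`, standard deviation `sigma_i = exp(logstd_i)` and `d` dimensions, the methods above compute

```latex
-\log p(x) = \tfrac{1}{2}\sum_i \Big(\tfrac{x_i-\mu_i}{\sigma_i}\Big)^2 + \tfrac{d}{2}\log(2\pi) + \sum_i \log \sigma_i,
\qquad
\mathrm{KL}(p\,\|\,q) = \sum_i \Big[\log\tfrac{\sigma_{q,i}}{\sigma_{p,i}} + \tfrac{\sigma_{p,i}^2 + (\mu_{p,i}-\mu_{q,i})^2}{2\sigma_{q,i}^2} - \tfrac{1}{2}\Big],
\qquad
H(p) = \sum_i \Big[\log \sigma_i + \tfrac{1}{2}\log(2\pi e)\Big]
```

with the `axis=-1` change reducing each expression over the last (action) dimension.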
@@ -209,11 +215,11 @@ class BernoulliPd(Pd):
    def mode(self):
        return tf.round(self.ps)
    def neglogp(self, x):
        return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=1)
        return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=-1)
    def kl(self, other):
        return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=1) - U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=1)
        return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=-1) - U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1)
    def entropy(self):
        return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=1)
        return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1)
    def sample(self):
        u = tf.random_uniform(tf.shape(self.ps))
        return tf.to_float(math_ops.less(u, self.ps))
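And for the Bernoulli case (reference only): with component probabilities `p_i = sigmoid(logit_i)`, the `sigmoid_cross_entropy_with_logits` calls above evaluate

```latex
-\log P(x) = -\sum_i \big[x_i \log p_i + (1-x_i)\log(1-p_i)\big],
\qquad
H = -\sum_i \big[p_i \log p_i + (1-p_i)\log(1-p_i)\big],
\qquad
\mathrm{KL}(p\,\|\,q) = \sum_i \Big[p_i \log\tfrac{p_i}{q_i} + (1-p_i)\log\tfrac{1-p_i}{1-q_i}\Big]
```

with the KL formed as the cross-entropy of `self.ps` against `other.logits` minus the entropy term.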
@@ -286,4 +292,3 @@ def validate_probtype(probtype, pdparam):
    klval_ll = - entval - logliks.mean() #pylint: disable=E1101
    klval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101
    assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr # within 3 sigmas


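The check above leans on a standard identity, spelled out here for clarity (`klval`, `entval` and `logliks` are defined just above this hunk): for samples `x ~ q`,

```latex
\mathrm{KL}(q\,\|\,p) = \mathbb{E}_{x\sim q}\big[\log q(x) - \log p(x)\big] = -H(q) - \mathbb{E}_{x\sim q}\big[\log p(x)\big]
```

so `klval_ll` is a Monte-Carlo estimate of the analytic KL, and the assert requires agreement within three standard errors of the mean.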
19
baselines/common/vec_env/__init__.py
Normal file
@@ -0,0 +1,19 @@
class VecEnv(object):
    """
    Vectorized environment base class
    """
    def step(self, vac):
        """
        Apply sequence of actions to sequence of environments
        actions -> (observations, rewards, news)

        where 'news' is a boolean vector indicating whether each element is new.
        """
        raise NotImplementedError
    def reset(self):
        """
        Reset all environments
        """
        raise NotImplementedError
    def close(self):
        pass
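A minimal in-process implementation may make the contract concrete (a sketch for illustration only, not part of the commit; it returns the three-tuple described in the docstring, whereas `SubprocVecEnv` in the next file additionally returns per-env infos):

```python
import numpy as np

class SimpleVecEnv(VecEnv):
    """Runs the wrapped envs serially in the current process -- handy for debugging."""
    def __init__(self, env_fns):
        self.envs = [fn() for fn in env_fns]
        self.action_space = self.envs[0].action_space
        self.observation_space = self.envs[0].observation_space

    def step(self, vac):
        obs, rews, news = [], [], []
        for env, a in zip(self.envs, vac):
            ob, rew, done, _ = env.step(a)
            if done:
                ob = env.reset()  # auto-reset, mirroring the worker() loop below
            obs.append(ob); rews.append(rew); news.append(done)
        return np.stack(obs), np.array(rews), np.array(news)

    def reset(self):
        return np.stack([env.reset() for env in self.envs])
```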
74
baselines/common/vec_env/subproc_vec_env.py
Normal file
@@ -0,0 +1,74 @@
import numpy as np
from multiprocessing import Process, Pipe
from baselines.common.vec_env import VecEnv

def worker(remote, env_fn_wrapper):
    env = env_fn_wrapper.x()
    while True:
        cmd, data = remote.recv()
        if cmd == 'step':
            ob, reward, done, info = env.step(data)
            if done:
                ob = env.reset()
            remote.send((ob, reward, done, info))
        elif cmd == 'reset':
            ob = env.reset()
            remote.send(ob)
        elif cmd == 'close':
            remote.close()
            break
        elif cmd == 'get_spaces':
            remote.send((env.action_space, env.observation_space))
        else:
            raise NotImplementedError

class CloudpickleWrapper(object):
    """
    Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle)
    """
    def __init__(self, x):
        self.x = x
    def __getstate__(self):
        import cloudpickle
        return cloudpickle.dumps(self.x)
    def __setstate__(self, ob):
        import pickle
        self.x = pickle.loads(ob)

class SubprocVecEnv(VecEnv):
    def __init__(self, env_fns):
        """
        envs: list of gym environments to run in subprocesses
        """
        nenvs = len(env_fns)
        self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)])
        self.ps = [Process(target=worker, args=(work_remote, CloudpickleWrapper(env_fn)))
            for (work_remote, env_fn) in zip(self.work_remotes, env_fns)]
        for p in self.ps:
            p.start()

        self.remotes[0].send(('get_spaces', None))
        self.action_space, self.observation_space = self.remotes[0].recv()


    def step(self, actions):
        for remote, action in zip(self.remotes, actions):
            remote.send(('step', action))
        results = [remote.recv() for remote in self.remotes]
        obs, rews, dones, infos = zip(*results)
        return np.stack(obs), np.stack(rews), np.stack(dones), infos

    def reset(self):
        for remote in self.remotes:
            remote.send(('reset', None))
        return np.stack([remote.recv() for remote in self.remotes])

    def close(self):
        for remote in self.remotes:
            remote.send(('close', None))
        for p in self.ps:
            p.join()

    @property
    def num_envs(self):
        return len(self.remotes)
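A hedged usage sketch for `SubprocVecEnv` (not in the commit; the environment id and seeds are placeholders):

```python
import gym
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

def make_env(seed):
    def _thunk():
        env = gym.make("CartPole-v0")   # placeholder environment
        env.seed(seed)
        return env
    return _thunk                        # a closure, serialized via CloudpickleWrapper

venv = SubprocVecEnv([make_env(i) for i in range(4)])
obs = venv.reset()                                     # stacked: (4, obs_dim)
actions = [venv.action_space.sample() for _ in range(4)]
obs, rews, dones, infos = venv.step(actions)           # one transition per worker
venv.close()
```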
@@ -15,7 +15,7 @@ python -m baselines.deepq.experiments.enjoy_cartpole
```


Be sure to check out the source code of [both](experiments/train_cartpole.py) [files](experiments/enjoy_cartpole.py)!
Be sure to check out the source code of [both](baselines/deepq/experiments/train_cartpole.py) [files](baselines/deepq/experiments/enjoy_cartpole.py)!

## If you wish to apply DQN to solve a problem.

@@ -49,4 +49,4 @@ Once you pick a model, you can download it and visualize the learned policy. Be
python -m baselines.deepq.experiments.atari.download_model --blob model-atari-duel-pong-1 --model-dir /tmp/models
python -m baselines.deepq.experiments.atari.enjoy --model-dir /tmp/models/model-atari-duel-pong-1 --env Pong --dueling

```
```
@@ -43,7 +43,6 @@ def parse_args():
    parser.add_argument("--target-update-freq", type=int, default=40000, help="number of iterations between every target network update")
    parser.add_argument("--param-noise-update-freq", type=int, default=50, help="number of iterations between every re-scaling of the parameter noise")
    parser.add_argument("--param-noise-reset-freq", type=int, default=10000, help="maximum number of steps to take per episode before re-perturbing the exploration policy")
    parser.add_argument("--param-noise-threshold", type=float, default=0.05, help="the desired KL divergence between perturbed and non-perturbed policy. set to < 0 to use a KL divergence relative to the eps-greedy exploration")
    # Bells and whistles
    boolean_flag(parser, "double-q", default=True, help="whether or not to use double q learning")
    boolean_flag(parser, "dueling", default=False, help="whether or not to use dueling model")
@@ -202,14 +201,11 @@ if __name__ == '__main__':
                reset = True

                update_eps = 0.01 # ensures that we cannot get stuck completely
                if args.param_noise_threshold >= 0.:
                    update_param_noise_threshold = args.param_noise_threshold
                else:
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = -np.log(1. - exploration.value(num_iters) + exploration.value(num_iters) / float(env.action_space.n))
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = -np.log(1. - exploration.value(num_iters) + exploration.value(num_iters) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = (num_iters % args.param_noise_update_freq == 0)

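The hard-coded formula that replaces the `--param-noise-threshold` flag matches a simple identity, noted here for context (not part of the diff): the KL divergence between a greedy policy and its epsilon-greedy counterpart over `|A|` actions is

```latex
D_{\mathrm{KL}}\big(\pi_{\text{greedy}} \,\|\, \pi_{\epsilon\text{-greedy}}\big) = -\log\!\Big(1 - \epsilon + \frac{\epsilon}{|A|}\Big)
```

so with `eps = exploration.value(num_iters)` and `|A| = env.action_space.n`, the parameter-noise scale tracks the current exploration schedule (cf. Appendix C.1 of Plappert et al., 2017) instead of a user-supplied constant.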
@@ -95,7 +95,6 @@ def learn(env,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          param_noise=False,
          param_noise_threshold=0.05,
          callback=None):
    """Train a deepq model.

@@ -225,14 +224,11 @@ def learn(env,
            update_param_noise_threshold = 0.
        else:
            update_eps = 0.
            if param_noise_threshold >= 0.:
                update_param_noise_threshold = param_noise_threshold
            else:
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n))
            # Compute the threshold such that the KL divergence between perturbed and non-perturbed
            # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
            # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
            # for detailed explanation.
            update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n))
            kwargs['reset'] = reset
            kwargs['update_param_noise_threshold'] = update_param_noise_threshold
            kwargs['update_param_noise_scale'] = True

@@ -1,13 +1,3 @@
"""

See README.md for a description of the logging API.

OFF state corresponds to having Logger.CURRENT == Logger.DEFAULT
ON state is otherwise

"""

from collections import OrderedDict
import os
import sys
import shutil
@@ -17,7 +7,7 @@ import time
import datetime
import tempfile

LOG_OUTPUT_FORMATS = ['stdout', 'log', 'json', 'tensorboard']
LOG_OUTPUT_FORMATS = ['stdout', 'log', 'json']

DEBUG = 10
INFO = 20
@@ -49,9 +39,12 @@ class HumanOutputFormat(OutputFormat):

    def writekvs(self, kvs):
        # Create strings for printing
        key2str = OrderedDict()
        key2str = {}
        for (key, val) in kvs.items():
            valstr = '%-8.3g' % (val,) if hasattr(val, '__float__') else val
            if isinstance(val, float):
                valstr = '%-8.3g' % (val,)
            else:
                valstr = str(val)
            key2str[self._truncate(key)] = self._truncate(valstr)

        # Find max widths
@@ -61,7 +54,7 @@ class HumanOutputFormat(OutputFormat):
        # Write out the data
        dashes = '-' * (keywidth + valwidth + 7)
        lines = [dashes]
        for (key, val) in key2str.items():
        for (key, val) in sorted(key2str.items()):
            lines.append('| %s%s | %s%s |' % (
                key,
                ' ' * (keywidth - len(key)),
@@ -150,7 +143,6 @@ def make_output_format(format, ev_dir):
# API
# ================================================================


def logkv(key, val):
    """
    Log a value of some diagnostic
@@ -158,6 +150,12 @@ def logkv(key, val):
    """
    Logger.CURRENT.logkv(key, val)

def logkvs(d):
    """
    Log a dictionary of key-value pairs
    """
    for (k, v) in d.items():
        logkv(k, v)

def dumpkvs():
    """
@@ -168,10 +166,8 @@ def dumpkvs():
    """
    Logger.CURRENT.dumpkvs()


# for backwards compatibility
record_tabular = logkv
dump_tabular = dumpkvs
def getkvs():
    return Logger.CURRENT.name2val


def log(*args, level=INFO):
@@ -203,7 +199,6 @@ def set_level(level):
    """
    Logger.CURRENT.set_level(level)


def get_dir():
    """
    Get directory that log files are being written to.
@@ -211,18 +206,20 @@ def get_dir():
    """
    return Logger.CURRENT.get_dir()

record_tabular = logkv
dump_tabular = dumpkvs

# ================================================================
# Backend
# ================================================================


class Logger(object):
    DEFAULT = None # A logger with no output files. (See right below class definition)
                   # So that you can still log to the terminal without setting up any output files
    CURRENT = None # Current logger being used by the free functions above

    def __init__(self, dir, output_formats):
        self.name2val = OrderedDict() # values this iteration
        self.name2val = {} # values this iteration
        self.level = INFO
        self.dir = dir
        self.output_formats = output_formats
@@ -233,6 +230,7 @@ class Logger(object):
        self.name2val[key] = val

    def dumpkvs(self):
        if self.level == DISABLED: return
        for fmt in self.output_formats:
            fmt.writekvs(self.name2val)
        self.name2val.clear()
@@ -259,57 +257,30 @@ class Logger(object):
        for fmt in self.output_formats:
            fmt.writeseq(args)

Logger.DEFAULT = Logger.CURRENT = Logger(dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

def configure(dir=None, format_strs=None):
    assert Logger.CURRENT is Logger.DEFAULT,\
        "Only call logger.configure() when it's in the default state. Try calling logger.reset() first."
    prevlogger = Logger.CURRENT
    if dir is None:
        dir = os.getenv('OPENAI_LOGDIR')
    if dir is None:
        dir = osp.join(tempfile.gettempdir(),
            datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"))
    if format_strs is None:
        format_strs = LOG_OUTPUT_FORMATS
    output_formats = [make_output_format(f, dir) for f in format_strs]
    Logger.CURRENT = Logger(dir=dir, output_formats=output_formats)
    log('Logging to %s'%dir)

def reset():
    Logger.CURRENT = Logger.DEFAULT
    log('Reset logger')


# ================================================================

Logger.DEFAULT = Logger(output_formats=[HumanOutputFormat(sys.stdout)], dir=None)
Logger.CURRENT = Logger.DEFAULT


class session(object):
    """
    Context manager that sets up the loggers for an experiment.
    """

    CURRENT = None # Set to a LoggerContext object using enter/exit or context manager

    def __init__(self, dir=None, format_strs=None):
        if dir is None:
            dir = os.getenv('OPENAI_LOGDIR')
        if dir is None:
            dir = osp.join(tempfile.gettempdir(),
                datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"))
        self.dir = dir
        if format_strs is None:
            format_strs = LOG_OUTPUT_FORMATS
        output_formats = [make_output_format(f, dir) for f in format_strs]
        Logger.CURRENT = Logger(dir=dir, output_formats=output_formats)
        print('Logging to', dir)

    def __enter__(self):
        os.makedirs(self.evaluation_dir(), exist_ok=True)
        output_formats = [make_output_format(f, self.evaluation_dir())
                          for f in LOG_OUTPUT_FORMATS]
        Logger.CURRENT = Logger(dir=self.dir, output_formats=output_formats)
        os.environ['OPENAI_LOGDIR'] = self.evaluation_dir()

    def __exit__(self, *args):
        Logger.CURRENT.close()
        Logger.CURRENT = Logger.DEFAULT

    def evaluation_dir(self):
        return self.dir

def _setup():
    logdir = os.getenv('OPENAI_LOGDIR')
    if logdir:
        session(logdir).__enter__()

_setup()

# ================================================================


def _demo():
    info("hi")
    debug("shouldn't appear")
@@ -319,19 +290,19 @@ def _demo():
    if os.path.exists(dir):
        shutil.rmtree(dir)
    with session(dir=dir):
        record_tabular("a", 3)
        record_tabular("b", 2.5)
        dump_tabular()
        record_tabular("b", -2.5)
        record_tabular("a", 5.5)
        dump_tabular()
        logkv("a", 3)
        logkv("b", 2.5)
        dumpkvs()
        logkv("b", -2.5)
        logkv("a", 5.5)
        dumpkvs()
        info("^^^ should see a = 5.5")

        record_tabular("b", -2.5)
        dump_tabular()
        logkv("b", -2.5)
        dumpkvs()

        record_tabular("a", "longasslongasslongasslongasslongasslongassvalue")
        dump_tabular()
        logkv("a", "longasslongasslongasslongasslongasslongassvalue")
        dumpkvs()


if __name__ == "__main__":

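A hedged usage sketch mirroring the `_demo` flow above (the directory is a placeholder; `logger.configure(dir=...)` appears in the same diff as a non-context-manager alternative):

```python
# Sketch only -- follows the _demo() pattern shown above.
from baselines import logger

with logger.session(dir="/tmp/my-run"):      # placeholder directory
    for i in range(3):
        logger.logkv("iteration", i)
        logger.logkv("reward", 1.5 * i)
        logger.dumpkvs()                      # record_tabular/dump_tabular remain as aliases
```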
@@ -22,7 +22,6 @@ def train(env_id, num_timesteps, seed, num_cpu):
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    logger.session().__enter__()
    if rank != 0: logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)

@@ -9,7 +9,6 @@ import sys
def train(env_id, num_timesteps, seed):
    from baselines.pposgd import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    logger.session().__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space):

@@ -24,7 +24,6 @@ def train(env_id, num_timesteps, seed, num_cpu):
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    logger.session().__enter__()
    if rank != 0:
        logger.set_level(logger.DISABLED)


@@ -19,7 +19,6 @@ def train(env_id, num_timesteps, seed):
    if whoami == "parent":
        return
    import baselines.common.tf_util as U
    logger.session().__enter__()
    sess = U.single_threaded_session()
    sess.__enter__()
