import tensorflow as tf
import functools

from baselines.common.tf_util import get_session, save_variables, load_variables
from baselines.common.tf_util import initialize

try:
    from baselines.common.mpi_adam_optimizer import MpiAdamOptimizer
    from mpi4py import MPI
    from baselines.common.mpi_util import sync_from_root
except ImportError:
    MPI = None


class Model(object):
    """
    We use this object to:

    __init__:
    - Creates the act_model (step model used for sampling)
    - Creates the train_model (used to train our policy)

    train():
    - Runs one training step (feedforward and backpropagation of the gradients)

    save/load():
    - Saves and loads the model
    """
    def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                nsteps, ent_coef, vf_coef, max_grad_norm, microbatch_size=None):
        self.sess = sess = get_session()

        with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(nbatch_act, 1, sess)

            # Train model for training
            if microbatch_size is None:
                train_model = policy(nbatch_train, nsteps, sess)
            else:
                train_model = policy(microbatch_size, nsteps, sess)
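            # When microbatch_size is given, the training graph is built for only
            # microbatch_size samples at a time, so callers (e.g. a microbatched
            # subclass of this Model) can accumulate gradients over several
            # microbatches when a full nbatch_train batch does not fit in memory.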

        # CREATE THE PLACEHOLDERS
        self.A = A = train_model.pdtype.sample_placeholder([None])
        self.ADV = ADV = tf.placeholder(tf.float32, [None])
        self.R = R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of old critic
        self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
        self.LR = LR = tf.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting premature convergence to a suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())
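        # (For a categorical policy this is -sum_a pi(a|s) * log pi(a|s), averaged over the batch.)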

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value to reduce variability during Critic training
        # Get the predicted value
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE)
        # Unclipped value
        vf_losses1 = tf.square(vpred - R)
        # Clipped value
        vf_losses2 = tf.square(vpredclipped - R)

        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
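        # i.e. vf_loss = 0.5 * E[max((V(s) - R)^2, (clip(V(s), V_old - CLIPRANGE, V_old + CLIPRANGE) - R)^2)]
        # The clipped term keeps the value function from moving too far from its previous estimate in one update.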

        # Calculate ratio (pi current policy / pi old policy)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

        # Defining loss = -J is equivalent to maximizing J
        pg_losses = -ADV * ratio

        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)

        # Final PG loss
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
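        # Together these implement the PPO clipped surrogate objective
        #   L_clip = E[min(ratio * ADV, clip(ratio, 1 - CLIPRANGE, 1 + CLIPRANGE) * ADV)]
        # (written above as the max of the negated terms, since we minimize the loss).
        # approxkl is a rough estimate of the KL divergence between the old and new policy,
        # and clipfrac is the fraction of samples for which the ratio was clipped.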

        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        params = tf.trainable_variables('ppo2_model')
        # 2. Build our trainer
        if MPI is not None:
            self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5)
        else:
            self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = self.trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
        self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]

        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

        initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")
        if MPI is not None:
            sync_from_root(sess, global_variables) #pylint: disable=E1101
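            # (sync_from_root broadcasts the freshly initialized parameters from the
            # rank-0 worker so every MPI worker starts from identical weights.)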

    def train(self, lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
        # Here we calculate advantage A(s,a) = R + gamma * V(s') - V(s)
        # Returns = R + gamma * V(s')
        advs = returns - values

        # Normalize the advantages
        advs = (advs - advs.mean()) / (advs.std() + 1e-8)
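        # (zero mean / unit variance within the batch; the 1e-8 avoids division by zero)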

        td_map = {
            self.train_model.X : obs,
            self.A : actions,
            self.ADV : advs,
            self.R : returns,
            self.LR : lr,
            self.CLIPRANGE : cliprange,
            self.OLDNEGLOGPAC : neglogpacs,
            self.OLDVPRED : values
        }
        if states is not None:
            td_map[self.train_model.S] = states
            td_map[self.train_model.M] = masks

        return self.sess.run(
            self.stats_list + [self._train_op],
            td_map
        )[:-1]
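

if __name__ == '__main__':
    # Minimal usage sketch (not part of the original module): it assumes a Gym
    # environment plus baselines' build_policy and DummyVecEnv helpers, and the
    # network type and hyperparameters below are arbitrary illustrative choices.
    import gym
    from baselines.common.policies import build_policy
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    env = DummyVecEnv([lambda: gym.make('CartPole-v0')])
    example_policy = build_policy(env, 'mlp')
    nsteps = 16

    model = Model(policy=example_policy, ob_space=env.observation_space,
                  ac_space=env.action_space, nbatch_act=env.num_envs,
                  nbatch_train=env.num_envs * nsteps, nsteps=nsteps,
                  ent_coef=0.0, vf_coef=0.5, max_grad_norm=0.5)

    # Sample one action from the act_model; train() would then be called on
    # flattened rollout arrays (obs, returns, masks, actions, values, neglogpacs).
    obs = env.reset()
    actions, values, states, neglogpacs = model.step(obs)
    print('sampled actions:', actions, 'value estimates:', values)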