refactor acktr (#560)
* refactor acktr * setup.cfg now tests style/syntax in acktr as well * flake8 complaints * added note about continuous action spaces for acktr into the README.md
This commit is contained in:
@@ -5,4 +5,5 @@
|
||||
- `python -m baselines.run --alg=acktr --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on an Atari Pong. See help (`-h`) for more options.
|
||||
- also refer to the repo-wide [README.md](../../README.md#training-models)
|
||||
|
||||
|
||||
## ACKTR with continuous action spaces
|
||||
The code of ACKTR has been refactored to handle both discrete and continuous action spaces uniformly. In the original version, discrete and continuous action spaces were handled by different code (actkr_disc.py and acktr_cont.py) with little overlap. If interested in the original version of the acktr for continuous action spaces, use `old_acktr_cont` branch. Note that original code performs better on the mujoco tasks than the refactored version; we are still investigating why.
|
||||
|
@@ -1 +1,152 @@
|
||||
from baselines.acktr.acktr_disc import *
|
||||
import os.path as osp
|
||||
import time
|
||||
import functools
|
||||
import tensorflow as tf
|
||||
from baselines import logger
|
||||
|
||||
from baselines.common import set_global_seeds, explained_variance
|
||||
from baselines.common.policies import build_policy
|
||||
from baselines.common.tf_util import get_session, save_variables, load_variables
|
||||
|
||||
from baselines.a2c.runner import Runner
|
||||
from baselines.a2c.utils import Scheduler, find_trainable_variables
|
||||
from baselines.acktr import kfac
|
||||
|
||||
|
||||
class Model(object):
|
||||
|
||||
def __init__(self, policy, ob_space, ac_space, nenvs,total_timesteps, nprocs=32, nsteps=20,
|
||||
ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
|
||||
kfac_clip=0.001, lrschedule='linear', is_async=True):
|
||||
|
||||
self.sess = sess = get_session()
|
||||
nbatch = nenvs * nsteps
|
||||
A = tf.placeholder(ac_space.dtype, [nbatch,] + list(ac_space.shape))
|
||||
ADV = tf.placeholder(tf.float32, [nbatch])
|
||||
R = tf.placeholder(tf.float32, [nbatch])
|
||||
PG_LR = tf.placeholder(tf.float32, [])
|
||||
VF_LR = tf.placeholder(tf.float32, [])
|
||||
|
||||
with tf.variable_scope('acktr_model', reuse=tf.AUTO_REUSE):
|
||||
self.model = step_model = policy(nenvs, 1, sess=sess)
|
||||
self.model2 = train_model = policy(nenvs*nsteps, nsteps, sess=sess)
|
||||
|
||||
neglogpac = train_model.pd.neglogp(A)
|
||||
self.logits = train_model.pi
|
||||
|
||||
##training loss
|
||||
pg_loss = tf.reduce_mean(ADV*neglogpac)
|
||||
entropy = tf.reduce_mean(train_model.pd.entropy())
|
||||
pg_loss = pg_loss - ent_coef * entropy
|
||||
vf_loss = tf.losses.mean_squared_error(tf.squeeze(train_model.vf), R)
|
||||
train_loss = pg_loss + vf_coef * vf_loss
|
||||
|
||||
|
||||
##Fisher loss construction
|
||||
self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(neglogpac)
|
||||
sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf))
|
||||
self.vf_fisher = vf_fisher_loss = - vf_fisher_coef*tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
|
||||
self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss
|
||||
|
||||
self.params=params = find_trainable_variables("acktr_model")
|
||||
|
||||
self.grads_check = grads = tf.gradients(train_loss,params)
|
||||
|
||||
with tf.device('/gpu:0'):
|
||||
self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\
|
||||
momentum=0.9, kfac_update=1, epsilon=0.01,\
|
||||
stats_decay=0.99, is_async=is_async, cold_iter=10, max_grad_norm=max_grad_norm)
|
||||
|
||||
# update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
|
||||
optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
|
||||
train_op, q_runner = optim.apply_gradients(list(zip(grads,params)))
|
||||
self.q_runner = q_runner
|
||||
self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)
|
||||
|
||||
def train(obs, states, rewards, masks, actions, values):
|
||||
advs = rewards - values
|
||||
for step in range(len(obs)):
|
||||
cur_lr = self.lr.value()
|
||||
|
||||
td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, PG_LR:cur_lr, VF_LR:cur_lr}
|
||||
if states is not None:
|
||||
td_map[train_model.S] = states
|
||||
td_map[train_model.M] = masks
|
||||
|
||||
policy_loss, value_loss, policy_entropy, _ = sess.run(
|
||||
[pg_loss, vf_loss, entropy, train_op],
|
||||
td_map
|
||||
)
|
||||
return policy_loss, value_loss, policy_entropy
|
||||
|
||||
|
||||
self.train = train
|
||||
self.save = functools.partial(save_variables, sess=sess)
|
||||
self.load = functools.partial(load_variables, sess=sess)
|
||||
self.train_model = train_model
|
||||
self.step_model = step_model
|
||||
self.step = step_model.step
|
||||
self.value = step_model.value
|
||||
self.initial_state = step_model.initial_state
|
||||
tf.global_variables_initializer().run(session=sess)
|
||||
|
||||
def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20,
|
||||
ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
|
||||
kfac_clip=0.001, save_interval=None, lrschedule='linear', load_path=None, is_async=True, **network_kwargs):
|
||||
set_global_seeds(seed)
|
||||
|
||||
|
||||
if network == 'cnn':
|
||||
network_kwargs['one_dim_bias'] = True
|
||||
|
||||
policy = build_policy(env, network, **network_kwargs)
|
||||
|
||||
nenvs = env.num_envs
|
||||
ob_space = env.observation_space
|
||||
ac_space = env.action_space
|
||||
make_model = lambda : Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs, nsteps
|
||||
=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, vf_fisher_coef=
|
||||
vf_fisher_coef, lr=lr, max_grad_norm=max_grad_norm, kfac_clip=kfac_clip,
|
||||
lrschedule=lrschedule, is_async=is_async)
|
||||
if save_interval and logger.get_dir():
|
||||
import cloudpickle
|
||||
with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
|
||||
fh.write(cloudpickle.dumps(make_model))
|
||||
model = make_model()
|
||||
|
||||
if load_path is not None:
|
||||
model.load(load_path)
|
||||
|
||||
runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
|
||||
nbatch = nenvs*nsteps
|
||||
tstart = time.time()
|
||||
coord = tf.train.Coordinator()
|
||||
if is_async:
|
||||
enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True)
|
||||
else:
|
||||
enqueue_threads = []
|
||||
|
||||
for update in range(1, total_timesteps//nbatch+1):
|
||||
obs, states, rewards, masks, actions, values = runner.run()
|
||||
policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
|
||||
model.old_obs = obs
|
||||
nseconds = time.time()-tstart
|
||||
fps = int((update*nbatch)/nseconds)
|
||||
if update % log_interval == 0 or update == 1:
|
||||
ev = explained_variance(values, rewards)
|
||||
logger.record_tabular("nupdates", update)
|
||||
logger.record_tabular("total_timesteps", update*nbatch)
|
||||
logger.record_tabular("fps", fps)
|
||||
logger.record_tabular("policy_entropy", float(policy_entropy))
|
||||
logger.record_tabular("policy_loss", float(policy_loss))
|
||||
logger.record_tabular("value_loss", float(value_loss))
|
||||
logger.record_tabular("explained_variance", float(ev))
|
||||
logger.dump_tabular()
|
||||
|
||||
if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
|
||||
savepath = osp.join(logger.get_dir(), 'checkpoint%.5i'%update)
|
||||
print('Saving to', savepath)
|
||||
model.save(savepath)
|
||||
coord.request_stop()
|
||||
coord.join(enqueue_threads)
|
||||
return model
|
||||
|
@@ -1,142 +0,0 @@
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from baselines import logger
|
||||
import baselines.common as common
|
||||
from baselines.common import tf_util as U
|
||||
from baselines.acktr import kfac
|
||||
from baselines.common.filters import ZFilter
|
||||
|
||||
def pathlength(path):
|
||||
return path["reward"].shape[0]# Loss function that we'll differentiate to get the policy gradient
|
||||
|
||||
def rollout(env, policy, max_pathlength, animate=False, obfilter=None):
|
||||
"""
|
||||
Simulate the env and policy for max_pathlength steps
|
||||
"""
|
||||
ob = env.reset()
|
||||
prev_ob = np.float32(np.zeros(ob.shape))
|
||||
if obfilter: ob = obfilter(ob)
|
||||
terminated = False
|
||||
|
||||
obs = []
|
||||
acs = []
|
||||
ac_dists = []
|
||||
logps = []
|
||||
rewards = []
|
||||
for _ in range(max_pathlength):
|
||||
if animate:
|
||||
env.render()
|
||||
state = np.concatenate([ob, prev_ob], -1)
|
||||
obs.append(state)
|
||||
ac, ac_dist, logp = policy.act(state)
|
||||
acs.append(ac)
|
||||
ac_dists.append(ac_dist)
|
||||
logps.append(logp)
|
||||
prev_ob = np.copy(ob)
|
||||
scaled_ac = env.action_space.low + (ac + 1.) * 0.5 * (env.action_space.high - env.action_space.low)
|
||||
scaled_ac = np.clip(scaled_ac, env.action_space.low, env.action_space.high)
|
||||
ob, rew, done, _ = env.step(scaled_ac)
|
||||
if obfilter: ob = obfilter(ob)
|
||||
rewards.append(rew)
|
||||
if done:
|
||||
terminated = True
|
||||
break
|
||||
return {"observation" : np.array(obs), "terminated" : terminated,
|
||||
"reward" : np.array(rewards), "action" : np.array(acs),
|
||||
"action_dist": np.array(ac_dists), "logp" : np.array(logps)}
|
||||
|
||||
def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps,
|
||||
animate=False, callback=None, desired_kl=0.002):
|
||||
|
||||
obfilter = ZFilter(env.observation_space.shape)
|
||||
|
||||
max_pathlength = env.spec.timestep_limit
|
||||
stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize')
|
||||
inputs, loss, loss_sampled = policy.update_info
|
||||
optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=0.9, kfac_update=2,\
|
||||
epsilon=1e-2, stats_decay=0.99, async_=1, cold_iter=1,
|
||||
weight_decay_dict=policy.wd_dict, max_grad_norm=None)
|
||||
pi_var_list = []
|
||||
for var in tf.trainable_variables():
|
||||
if "pi" in var.name:
|
||||
pi_var_list.append(var)
|
||||
|
||||
update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list)
|
||||
do_update = U.function(inputs, update_op)
|
||||
U.initialize()
|
||||
|
||||
# start queue runners
|
||||
enqueue_threads = []
|
||||
coord = tf.train.Coordinator()
|
||||
for qr in [q_runner, vf.q_runner]:
|
||||
assert (qr != None)
|
||||
enqueue_threads.extend(qr.create_threads(tf.get_default_session(), coord=coord, start=True))
|
||||
|
||||
i = 0
|
||||
timesteps_so_far = 0
|
||||
while True:
|
||||
if timesteps_so_far > num_timesteps:
|
||||
break
|
||||
logger.log("********** Iteration %i ************"%i)
|
||||
|
||||
# Collect paths until we have enough timesteps
|
||||
timesteps_this_batch = 0
|
||||
paths = []
|
||||
while True:
|
||||
path = rollout(env, policy, max_pathlength, animate=(len(paths)==0 and (i % 10 == 0) and animate), obfilter=obfilter)
|
||||
paths.append(path)
|
||||
n = pathlength(path)
|
||||
timesteps_this_batch += n
|
||||
timesteps_so_far += n
|
||||
if timesteps_this_batch > timesteps_per_batch:
|
||||
break
|
||||
|
||||
# Estimate advantage function
|
||||
vtargs = []
|
||||
advs = []
|
||||
for path in paths:
|
||||
rew_t = path["reward"]
|
||||
return_t = common.discount(rew_t, gamma)
|
||||
vtargs.append(return_t)
|
||||
vpred_t = vf.predict(path)
|
||||
vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1])
|
||||
delta_t = rew_t + gamma*vpred_t[1:] - vpred_t[:-1]
|
||||
adv_t = common.discount(delta_t, gamma * lam)
|
||||
advs.append(adv_t)
|
||||
# Update value function
|
||||
vf.fit(paths, vtargs)
|
||||
|
||||
# Build arrays for policy update
|
||||
ob_no = np.concatenate([path["observation"] for path in paths])
|
||||
action_na = np.concatenate([path["action"] for path in paths])
|
||||
oldac_dist = np.concatenate([path["action_dist"] for path in paths])
|
||||
adv_n = np.concatenate(advs)
|
||||
standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)
|
||||
|
||||
# Policy update
|
||||
do_update(ob_no, action_na, standardized_adv_n)
|
||||
|
||||
min_stepsize = np.float32(1e-8)
|
||||
max_stepsize = np.float32(1e0)
|
||||
# Adjust stepsize
|
||||
kl = policy.compute_kl(ob_no, oldac_dist)
|
||||
if kl > desired_kl * 2:
|
||||
logger.log("kl too high")
|
||||
tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval()
|
||||
elif kl < desired_kl / 2:
|
||||
logger.log("kl too low")
|
||||
tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval()
|
||||
else:
|
||||
logger.log("kl just right!")
|
||||
|
||||
logger.record_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths]))
|
||||
logger.record_tabular("EpRewSEM", np.std([path["reward"].sum()/np.sqrt(len(paths)) for path in paths]))
|
||||
logger.record_tabular("EpLenMean", np.mean([pathlength(path) for path in paths]))
|
||||
logger.record_tabular("KL", kl)
|
||||
if callback:
|
||||
callback()
|
||||
logger.dump_tabular()
|
||||
i += 1
|
||||
|
||||
coord.request_stop()
|
||||
coord.join(enqueue_threads)
|
@@ -1,150 +0,0 @@
|
||||
import os.path as osp
|
||||
import time
|
||||
import functools
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from baselines import logger
|
||||
|
||||
from baselines.common import set_global_seeds, explained_variance
|
||||
from baselines.common.policies import build_policy
|
||||
from baselines.common.tf_util import get_session, save_variables, load_variables
|
||||
|
||||
from baselines.a2c.runner import Runner
|
||||
from baselines.a2c.utils import discount_with_dones
|
||||
from baselines.a2c.utils import Scheduler, find_trainable_variables
|
||||
from baselines.acktr import kfac
|
||||
|
||||
|
||||
class Model(object):
|
||||
|
||||
def __init__(self, policy, ob_space, ac_space, nenvs,total_timesteps, nprocs=32, nsteps=20,
|
||||
ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
|
||||
kfac_clip=0.001, lrschedule='linear'):
|
||||
|
||||
self.sess = sess = get_session()
|
||||
nact = ac_space.n
|
||||
nbatch = nenvs * nsteps
|
||||
A = tf.placeholder(tf.int32, [nbatch])
|
||||
ADV = tf.placeholder(tf.float32, [nbatch])
|
||||
R = tf.placeholder(tf.float32, [nbatch])
|
||||
PG_LR = tf.placeholder(tf.float32, [])
|
||||
VF_LR = tf.placeholder(tf.float32, [])
|
||||
|
||||
with tf.variable_scope('acktr_model', reuse=tf.AUTO_REUSE):
|
||||
self.model = step_model = policy(nenvs, 1, sess=sess)
|
||||
self.model2 = train_model = policy(nenvs*nsteps, nsteps, sess=sess)
|
||||
|
||||
neglogpac = train_model.pd.neglogp(A)
|
||||
self.logits = logits = train_model.pi
|
||||
|
||||
##training loss
|
||||
pg_loss = tf.reduce_mean(ADV*neglogpac)
|
||||
entropy = tf.reduce_mean(train_model.pd.entropy())
|
||||
pg_loss = pg_loss - ent_coef * entropy
|
||||
vf_loss = tf.losses.mean_squared_error(tf.squeeze(train_model.vf), R)
|
||||
train_loss = pg_loss + vf_coef * vf_loss
|
||||
|
||||
|
||||
##Fisher loss construction
|
||||
self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(neglogpac)
|
||||
sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf))
|
||||
self.vf_fisher = vf_fisher_loss = - vf_fisher_coef*tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
|
||||
self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss
|
||||
|
||||
self.params=params = find_trainable_variables("acktr_model")
|
||||
|
||||
self.grads_check = grads = tf.gradients(train_loss,params)
|
||||
|
||||
with tf.device('/gpu:0'):
|
||||
self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\
|
||||
momentum=0.9, kfac_update=1, epsilon=0.01,\
|
||||
stats_decay=0.99, async_=1, cold_iter=10, max_grad_norm=max_grad_norm)
|
||||
|
||||
update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
|
||||
train_op, q_runner = optim.apply_gradients(list(zip(grads,params)))
|
||||
self.q_runner = q_runner
|
||||
self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)
|
||||
|
||||
def train(obs, states, rewards, masks, actions, values):
|
||||
advs = rewards - values
|
||||
for step in range(len(obs)):
|
||||
cur_lr = self.lr.value()
|
||||
|
||||
td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, PG_LR:cur_lr}
|
||||
if states is not None:
|
||||
td_map[train_model.S] = states
|
||||
td_map[train_model.M] = masks
|
||||
|
||||
policy_loss, value_loss, policy_entropy, _ = sess.run(
|
||||
[pg_loss, vf_loss, entropy, train_op],
|
||||
td_map
|
||||
)
|
||||
return policy_loss, value_loss, policy_entropy
|
||||
|
||||
|
||||
self.train = train
|
||||
self.save = functools.partial(save_variables, sess=sess)
|
||||
self.load = functools.partial(load_variables, sess=sess)
|
||||
self.train_model = train_model
|
||||
self.step_model = step_model
|
||||
self.step = step_model.step
|
||||
self.value = step_model.value
|
||||
self.initial_state = step_model.initial_state
|
||||
tf.global_variables_initializer().run(session=sess)
|
||||
|
||||
def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20,
|
||||
ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
|
||||
kfac_clip=0.001, save_interval=None, lrschedule='linear', load_path=None, **network_kwargs):
|
||||
set_global_seeds(seed)
|
||||
|
||||
|
||||
if network == 'cnn':
|
||||
network_kwargs['one_dim_bias'] = True
|
||||
|
||||
policy = build_policy(env, network, **network_kwargs)
|
||||
|
||||
nenvs = env.num_envs
|
||||
ob_space = env.observation_space
|
||||
ac_space = env.action_space
|
||||
make_model = lambda : Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs, nsteps
|
||||
=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, vf_fisher_coef=
|
||||
vf_fisher_coef, lr=lr, max_grad_norm=max_grad_norm, kfac_clip=kfac_clip,
|
||||
lrschedule=lrschedule)
|
||||
if save_interval and logger.get_dir():
|
||||
import cloudpickle
|
||||
with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
|
||||
fh.write(cloudpickle.dumps(make_model))
|
||||
model = make_model()
|
||||
|
||||
if load_path is not None:
|
||||
model.load(load_path)
|
||||
|
||||
runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
|
||||
nbatch = nenvs*nsteps
|
||||
tstart = time.time()
|
||||
coord = tf.train.Coordinator()
|
||||
enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True)
|
||||
for update in range(1, total_timesteps//nbatch+1):
|
||||
obs, states, rewards, masks, actions, values = runner.run()
|
||||
policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
|
||||
model.old_obs = obs
|
||||
nseconds = time.time()-tstart
|
||||
fps = int((update*nbatch)/nseconds)
|
||||
if update % log_interval == 0 or update == 1:
|
||||
ev = explained_variance(values, rewards)
|
||||
logger.record_tabular("nupdates", update)
|
||||
logger.record_tabular("total_timesteps", update*nbatch)
|
||||
logger.record_tabular("fps", fps)
|
||||
logger.record_tabular("policy_entropy", float(policy_entropy))
|
||||
logger.record_tabular("policy_loss", float(policy_loss))
|
||||
logger.record_tabular("value_loss", float(value_loss))
|
||||
logger.record_tabular("explained_variance", float(ev))
|
||||
logger.dump_tabular()
|
||||
|
||||
if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
|
||||
savepath = osp.join(logger.get_dir(), 'checkpoint%.5i'%update)
|
||||
print('Saving to', savepath)
|
||||
model.save(savepath)
|
||||
coord.request_stop()
|
||||
coord.join(enqueue_threads)
|
||||
return model
|
5
baselines/acktr/defaults.py
Normal file
5
baselines/acktr/defaults.py
Normal file
@@ -0,0 +1,5 @@
|
||||
def mujoco():
|
||||
return dict(
|
||||
nsteps=2500,
|
||||
value_network='copy'
|
||||
)
|
@@ -1,6 +1,8 @@
|
||||
import tensorflow as tf
|
||||
import numpy as np
|
||||
import re
|
||||
|
||||
# flake8: noqa F403, F405
|
||||
from baselines.acktr.kfac_utils import *
|
||||
from functools import reduce
|
||||
|
||||
@@ -10,14 +12,14 @@ KFAC_DEBUG = False
|
||||
|
||||
class KfacOptimizer():
|
||||
|
||||
def __init__(self, learning_rate=0.01, momentum=0.9, clip_kl=0.01, kfac_update=2, stats_accum_iter=60, full_stats_init=False, cold_iter=100, cold_lr=None, async_=False, async_stats=False, epsilon=1e-2, stats_decay=0.95, blockdiag_bias=False, channel_fac=False, factored_damping=False, approxT2=False, use_float64=False, weight_decay_dict={},max_grad_norm=0.5):
|
||||
def __init__(self, learning_rate=0.01, momentum=0.9, clip_kl=0.01, kfac_update=2, stats_accum_iter=60, full_stats_init=False, cold_iter=100, cold_lr=None, is_async=False, async_stats=False, epsilon=1e-2, stats_decay=0.95, blockdiag_bias=False, channel_fac=False, factored_damping=False, approxT2=False, use_float64=False, weight_decay_dict={},max_grad_norm=0.5):
|
||||
self.max_grad_norm = max_grad_norm
|
||||
self._lr = learning_rate
|
||||
self._momentum = momentum
|
||||
self._clip_kl = clip_kl
|
||||
self._channel_fac = channel_fac
|
||||
self._kfac_update = kfac_update
|
||||
self._async = async_
|
||||
self._async = is_async
|
||||
self._async_stats = async_stats
|
||||
self._epsilon = epsilon
|
||||
self._stats_decay = stats_decay
|
||||
|
@@ -1,42 +0,0 @@
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from baselines.acktr.utils import dense, kl_div
|
||||
import baselines.common.tf_util as U
|
||||
|
||||
class GaussianMlpPolicy(object):
|
||||
def __init__(self, ob_dim, ac_dim):
|
||||
# Here we'll construct a bunch of expressions, which will be used in two places:
|
||||
# (1) When sampling actions
|
||||
# (2) When computing loss functions, for the policy update
|
||||
# Variables specific to (1) have the word "sampled" in them,
|
||||
# whereas variables specific to (2) have the word "old" in them
|
||||
ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim*2], name="ob") # batch of observations
|
||||
oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac") # batch of actions previous actions
|
||||
oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim*2], name="oldac_dist") # batch of actions previous action distributions
|
||||
adv_n = tf.placeholder(tf.float32, shape=[None], name="adv") # advantage function estimate
|
||||
wd_dict = {}
|
||||
h1 = tf.nn.tanh(dense(ob_no, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
|
||||
h2 = tf.nn.tanh(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
|
||||
mean_na = dense(h2, ac_dim, "mean", weight_init=U.normc_initializer(0.1), bias_init=0.0, weight_loss_dict=wd_dict) # Mean control output
|
||||
self.wd_dict = wd_dict
|
||||
self.logstd_1a = logstd_1a = tf.get_variable("logstd", [ac_dim], tf.float32, tf.zeros_initializer()) # Variance on outputs
|
||||
logstd_1a = tf.expand_dims(logstd_1a, 0)
|
||||
std_1a = tf.exp(logstd_1a)
|
||||
std_na = tf.tile(std_1a, [tf.shape(mean_na)[0], 1])
|
||||
ac_dist = tf.concat([tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(std_na, [-1, ac_dim])], 1)
|
||||
sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:,ac_dim:])) * ac_dist[:,ac_dim:] + ac_dist[:,:ac_dim] # This is the sampled action we'll perform.
|
||||
logprobsampled_n = - tf.reduce_sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * tf.reduce_sum(tf.square(ac_dist[:,:ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of sampled action
|
||||
logprob_n = - tf.reduce_sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * tf.reduce_sum(tf.square(ac_dist[:,:ac_dim] - oldac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy)
|
||||
kl = tf.reduce_mean(kl_div(oldac_dist, ac_dist, ac_dim))
|
||||
#kl = .5 * tf.reduce_mean(tf.square(logprob_n - oldlogprob_n)) # Approximation of KL divergence between old policy used to generate actions, and new policy used to compute logprob_n
|
||||
surr = - tf.reduce_mean(adv_n * logprob_n) # Loss function that we'll differentiate to get the policy gradient
|
||||
surr_sampled = - tf.reduce_mean(logprob_n) # Sampled loss of the policy
|
||||
self._act = U.function([ob_no], [sampled_ac_na, ac_dist, logprobsampled_n]) # Generate a new action and its logprob
|
||||
#self.compute_kl = U.function([ob_no, oldac_na, oldlogprob_n], kl) # Compute (approximate) KL divergence between old policy and new policy
|
||||
self.compute_kl = U.function([ob_no, oldac_dist], kl)
|
||||
self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled) # Input and output variables needed for computing loss
|
||||
U.initialize() # Initialize uninitialized TF variables
|
||||
|
||||
def act(self, ob):
|
||||
ac, ac_dist, logp = self._act(ob[None])
|
||||
return ac[0], ac_dist[0], logp[0]
|
@@ -1,34 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import tensorflow as tf
|
||||
from baselines import logger
|
||||
from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser
|
||||
from baselines.acktr.acktr_cont import learn
|
||||
from baselines.acktr.policies import GaussianMlpPolicy
|
||||
from baselines.acktr.value_functions import NeuralNetValueFunction
|
||||
|
||||
def train(env_id, num_timesteps, seed):
|
||||
env = make_mujoco_env(env_id, seed)
|
||||
|
||||
with tf.Session(config=tf.ConfigProto()):
|
||||
ob_dim = env.observation_space.shape[0]
|
||||
ac_dim = env.action_space.shape[0]
|
||||
with tf.variable_scope("vf"):
|
||||
vf = NeuralNetValueFunction(ob_dim, ac_dim)
|
||||
with tf.variable_scope("pi"):
|
||||
policy = GaussianMlpPolicy(ob_dim, ac_dim)
|
||||
|
||||
learn(env, policy=policy, vf=vf,
|
||||
gamma=0.99, lam=0.97, timesteps_per_batch=2500,
|
||||
desired_kl=0.002,
|
||||
num_timesteps=num_timesteps, animate=False)
|
||||
|
||||
env.close()
|
||||
|
||||
def main():
|
||||
args = mujoco_arg_parser().parse_args()
|
||||
logger.configure()
|
||||
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
@@ -1,50 +0,0 @@
|
||||
from baselines import logger
|
||||
import numpy as np
|
||||
import baselines.common as common
|
||||
from baselines.common import tf_util as U
|
||||
import tensorflow as tf
|
||||
from baselines.acktr import kfac
|
||||
from baselines.acktr.utils import dense
|
||||
|
||||
class NeuralNetValueFunction(object):
|
||||
def __init__(self, ob_dim, ac_dim): #pylint: disable=W0613
|
||||
X = tf.placeholder(tf.float32, shape=[None, ob_dim*2+ac_dim*2+2]) # batch of observations
|
||||
vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg')
|
||||
wd_dict = {}
|
||||
h1 = tf.nn.elu(dense(X, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
|
||||
h2 = tf.nn.elu(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
|
||||
vpred_n = dense(h2, 1, "hfinal", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)[:,0]
|
||||
sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
|
||||
wd_loss = tf.get_collection("vf_losses", None)
|
||||
loss = tf.reduce_mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
|
||||
loss_sampled = tf.reduce_mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))
|
||||
self._predict = U.function([X], vpred_n)
|
||||
optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9, \
|
||||
clip_kl=0.3, epsilon=0.1, stats_decay=0.95, \
|
||||
async_=1, kfac_update=2, cold_iter=50, \
|
||||
weight_decay_dict=wd_dict, max_grad_norm=None)
|
||||
vf_var_list = []
|
||||
for var in tf.trainable_variables():
|
||||
if "vf" in var.name:
|
||||
vf_var_list.append(var)
|
||||
|
||||
update_op, self.q_runner = optim.minimize(loss, loss_sampled, var_list=vf_var_list)
|
||||
self.do_update = U.function([X, vtarg_n], update_op) #pylint: disable=E1101
|
||||
U.initialize() # Initialize uninitialized TF variables
|
||||
def _preproc(self, path):
|
||||
l = pathlength(path)
|
||||
al = np.arange(l).reshape(-1,1)/10.0
|
||||
act = path["action_dist"].astype('float32')
|
||||
X = np.concatenate([path['observation'], act, al, np.ones((l, 1))], axis=1)
|
||||
return X
|
||||
def predict(self, path):
|
||||
return self._predict(self._preproc(path))
|
||||
def fit(self, paths, targvals):
|
||||
X = np.concatenate([self._preproc(p) for p in paths])
|
||||
y = np.concatenate(targvals)
|
||||
logger.record_tabular("EVBefore", common.explained_variance(self._predict(X), y))
|
||||
for _ in range(25): self.do_update(X, y)
|
||||
logger.record_tabular("EVAfter", common.explained_variance(self._predict(X), y))
|
||||
|
||||
def pathlength(path):
|
||||
return path["reward"].shape[0]
|
Reference in New Issue
Block a user