refactor acktr (#560)

* refactor acktr

* setup.cfg now tests style/syntax in acktr as well

* flake8 complaints

* added note about continuous action spaces for acktr into the README.md
This commit is contained in:
pzhokhov
2018-09-20 16:05:26 -07:00
committed by GitHub
parent 0e7048b89f
commit 0f4ae2fb2a
10 changed files with 163 additions and 428 deletions

View File

@@ -5,4 +5,5 @@
- `python -m baselines.run --alg=acktr --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on an Atari Pong. See help (`-h`) for more options.
- also refer to the repo-wide [README.md](../../README.md#training-models)
## ACKTR with continuous action spaces
The code of ACKTR has been refactored to handle both discrete and continuous action spaces uniformly. In the original version, discrete and continuous action spaces were handled by different code (actkr_disc.py and acktr_cont.py) with little overlap. If interested in the original version of the acktr for continuous action spaces, use `old_acktr_cont` branch. Note that original code performs better on the mujoco tasks than the refactored version; we are still investigating why.

View File

@@ -1 +1,152 @@
from baselines.acktr.acktr_disc import *
import os.path as osp
import time
import functools
import tensorflow as tf
from baselines import logger
from baselines.common import set_global_seeds, explained_variance
from baselines.common.policies import build_policy
from baselines.common.tf_util import get_session, save_variables, load_variables
from baselines.a2c.runner import Runner
from baselines.a2c.utils import Scheduler, find_trainable_variables
from baselines.acktr import kfac
class Model(object):
def __init__(self, policy, ob_space, ac_space, nenvs,total_timesteps, nprocs=32, nsteps=20,
ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
kfac_clip=0.001, lrschedule='linear', is_async=True):
self.sess = sess = get_session()
nbatch = nenvs * nsteps
A = tf.placeholder(ac_space.dtype, [nbatch,] + list(ac_space.shape))
ADV = tf.placeholder(tf.float32, [nbatch])
R = tf.placeholder(tf.float32, [nbatch])
PG_LR = tf.placeholder(tf.float32, [])
VF_LR = tf.placeholder(tf.float32, [])
with tf.variable_scope('acktr_model', reuse=tf.AUTO_REUSE):
self.model = step_model = policy(nenvs, 1, sess=sess)
self.model2 = train_model = policy(nenvs*nsteps, nsteps, sess=sess)
neglogpac = train_model.pd.neglogp(A)
self.logits = train_model.pi
##training loss
pg_loss = tf.reduce_mean(ADV*neglogpac)
entropy = tf.reduce_mean(train_model.pd.entropy())
pg_loss = pg_loss - ent_coef * entropy
vf_loss = tf.losses.mean_squared_error(tf.squeeze(train_model.vf), R)
train_loss = pg_loss + vf_coef * vf_loss
##Fisher loss construction
self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(neglogpac)
sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf))
self.vf_fisher = vf_fisher_loss = - vf_fisher_coef*tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss
self.params=params = find_trainable_variables("acktr_model")
self.grads_check = grads = tf.gradients(train_loss,params)
with tf.device('/gpu:0'):
self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\
momentum=0.9, kfac_update=1, epsilon=0.01,\
stats_decay=0.99, is_async=is_async, cold_iter=10, max_grad_norm=max_grad_norm)
# update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
train_op, q_runner = optim.apply_gradients(list(zip(grads,params)))
self.q_runner = q_runner
self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)
def train(obs, states, rewards, masks, actions, values):
advs = rewards - values
for step in range(len(obs)):
cur_lr = self.lr.value()
td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, PG_LR:cur_lr, VF_LR:cur_lr}
if states is not None:
td_map[train_model.S] = states
td_map[train_model.M] = masks
policy_loss, value_loss, policy_entropy, _ = sess.run(
[pg_loss, vf_loss, entropy, train_op],
td_map
)
return policy_loss, value_loss, policy_entropy
self.train = train
self.save = functools.partial(save_variables, sess=sess)
self.load = functools.partial(load_variables, sess=sess)
self.train_model = train_model
self.step_model = step_model
self.step = step_model.step
self.value = step_model.value
self.initial_state = step_model.initial_state
tf.global_variables_initializer().run(session=sess)
def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20,
ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
kfac_clip=0.001, save_interval=None, lrschedule='linear', load_path=None, is_async=True, **network_kwargs):
set_global_seeds(seed)
if network == 'cnn':
network_kwargs['one_dim_bias'] = True
policy = build_policy(env, network, **network_kwargs)
nenvs = env.num_envs
ob_space = env.observation_space
ac_space = env.action_space
make_model = lambda : Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs, nsteps
=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, vf_fisher_coef=
vf_fisher_coef, lr=lr, max_grad_norm=max_grad_norm, kfac_clip=kfac_clip,
lrschedule=lrschedule, is_async=is_async)
if save_interval and logger.get_dir():
import cloudpickle
with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
fh.write(cloudpickle.dumps(make_model))
model = make_model()
if load_path is not None:
model.load(load_path)
runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
nbatch = nenvs*nsteps
tstart = time.time()
coord = tf.train.Coordinator()
if is_async:
enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True)
else:
enqueue_threads = []
for update in range(1, total_timesteps//nbatch+1):
obs, states, rewards, masks, actions, values = runner.run()
policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
model.old_obs = obs
nseconds = time.time()-tstart
fps = int((update*nbatch)/nseconds)
if update % log_interval == 0 or update == 1:
ev = explained_variance(values, rewards)
logger.record_tabular("nupdates", update)
logger.record_tabular("total_timesteps", update*nbatch)
logger.record_tabular("fps", fps)
logger.record_tabular("policy_entropy", float(policy_entropy))
logger.record_tabular("policy_loss", float(policy_loss))
logger.record_tabular("value_loss", float(value_loss))
logger.record_tabular("explained_variance", float(ev))
logger.dump_tabular()
if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
savepath = osp.join(logger.get_dir(), 'checkpoint%.5i'%update)
print('Saving to', savepath)
model.save(savepath)
coord.request_stop()
coord.join(enqueue_threads)
return model

View File

@@ -1,142 +0,0 @@
import numpy as np
import tensorflow as tf
from baselines import logger
import baselines.common as common
from baselines.common import tf_util as U
from baselines.acktr import kfac
from baselines.common.filters import ZFilter
def pathlength(path):
return path["reward"].shape[0]# Loss function that we'll differentiate to get the policy gradient
def rollout(env, policy, max_pathlength, animate=False, obfilter=None):
"""
Simulate the env and policy for max_pathlength steps
"""
ob = env.reset()
prev_ob = np.float32(np.zeros(ob.shape))
if obfilter: ob = obfilter(ob)
terminated = False
obs = []
acs = []
ac_dists = []
logps = []
rewards = []
for _ in range(max_pathlength):
if animate:
env.render()
state = np.concatenate([ob, prev_ob], -1)
obs.append(state)
ac, ac_dist, logp = policy.act(state)
acs.append(ac)
ac_dists.append(ac_dist)
logps.append(logp)
prev_ob = np.copy(ob)
scaled_ac = env.action_space.low + (ac + 1.) * 0.5 * (env.action_space.high - env.action_space.low)
scaled_ac = np.clip(scaled_ac, env.action_space.low, env.action_space.high)
ob, rew, done, _ = env.step(scaled_ac)
if obfilter: ob = obfilter(ob)
rewards.append(rew)
if done:
terminated = True
break
return {"observation" : np.array(obs), "terminated" : terminated,
"reward" : np.array(rewards), "action" : np.array(acs),
"action_dist": np.array(ac_dists), "logp" : np.array(logps)}
def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps,
animate=False, callback=None, desired_kl=0.002):
obfilter = ZFilter(env.observation_space.shape)
max_pathlength = env.spec.timestep_limit
stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize')
inputs, loss, loss_sampled = policy.update_info
optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=0.9, kfac_update=2,\
epsilon=1e-2, stats_decay=0.99, async_=1, cold_iter=1,
weight_decay_dict=policy.wd_dict, max_grad_norm=None)
pi_var_list = []
for var in tf.trainable_variables():
if "pi" in var.name:
pi_var_list.append(var)
update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list)
do_update = U.function(inputs, update_op)
U.initialize()
# start queue runners
enqueue_threads = []
coord = tf.train.Coordinator()
for qr in [q_runner, vf.q_runner]:
assert (qr != None)
enqueue_threads.extend(qr.create_threads(tf.get_default_session(), coord=coord, start=True))
i = 0
timesteps_so_far = 0
while True:
if timesteps_so_far > num_timesteps:
break
logger.log("********** Iteration %i ************"%i)
# Collect paths until we have enough timesteps
timesteps_this_batch = 0
paths = []
while True:
path = rollout(env, policy, max_pathlength, animate=(len(paths)==0 and (i % 10 == 0) and animate), obfilter=obfilter)
paths.append(path)
n = pathlength(path)
timesteps_this_batch += n
timesteps_so_far += n
if timesteps_this_batch > timesteps_per_batch:
break
# Estimate advantage function
vtargs = []
advs = []
for path in paths:
rew_t = path["reward"]
return_t = common.discount(rew_t, gamma)
vtargs.append(return_t)
vpred_t = vf.predict(path)
vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1])
delta_t = rew_t + gamma*vpred_t[1:] - vpred_t[:-1]
adv_t = common.discount(delta_t, gamma * lam)
advs.append(adv_t)
# Update value function
vf.fit(paths, vtargs)
# Build arrays for policy update
ob_no = np.concatenate([path["observation"] for path in paths])
action_na = np.concatenate([path["action"] for path in paths])
oldac_dist = np.concatenate([path["action_dist"] for path in paths])
adv_n = np.concatenate(advs)
standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)
# Policy update
do_update(ob_no, action_na, standardized_adv_n)
min_stepsize = np.float32(1e-8)
max_stepsize = np.float32(1e0)
# Adjust stepsize
kl = policy.compute_kl(ob_no, oldac_dist)
if kl > desired_kl * 2:
logger.log("kl too high")
tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval()
elif kl < desired_kl / 2:
logger.log("kl too low")
tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval()
else:
logger.log("kl just right!")
logger.record_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths]))
logger.record_tabular("EpRewSEM", np.std([path["reward"].sum()/np.sqrt(len(paths)) for path in paths]))
logger.record_tabular("EpLenMean", np.mean([pathlength(path) for path in paths]))
logger.record_tabular("KL", kl)
if callback:
callback()
logger.dump_tabular()
i += 1
coord.request_stop()
coord.join(enqueue_threads)

View File

@@ -1,150 +0,0 @@
import os.path as osp
import time
import functools
import numpy as np
import tensorflow as tf
from baselines import logger
from baselines.common import set_global_seeds, explained_variance
from baselines.common.policies import build_policy
from baselines.common.tf_util import get_session, save_variables, load_variables
from baselines.a2c.runner import Runner
from baselines.a2c.utils import discount_with_dones
from baselines.a2c.utils import Scheduler, find_trainable_variables
from baselines.acktr import kfac
class Model(object):
def __init__(self, policy, ob_space, ac_space, nenvs,total_timesteps, nprocs=32, nsteps=20,
ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
kfac_clip=0.001, lrschedule='linear'):
self.sess = sess = get_session()
nact = ac_space.n
nbatch = nenvs * nsteps
A = tf.placeholder(tf.int32, [nbatch])
ADV = tf.placeholder(tf.float32, [nbatch])
R = tf.placeholder(tf.float32, [nbatch])
PG_LR = tf.placeholder(tf.float32, [])
VF_LR = tf.placeholder(tf.float32, [])
with tf.variable_scope('acktr_model', reuse=tf.AUTO_REUSE):
self.model = step_model = policy(nenvs, 1, sess=sess)
self.model2 = train_model = policy(nenvs*nsteps, nsteps, sess=sess)
neglogpac = train_model.pd.neglogp(A)
self.logits = logits = train_model.pi
##training loss
pg_loss = tf.reduce_mean(ADV*neglogpac)
entropy = tf.reduce_mean(train_model.pd.entropy())
pg_loss = pg_loss - ent_coef * entropy
vf_loss = tf.losses.mean_squared_error(tf.squeeze(train_model.vf), R)
train_loss = pg_loss + vf_coef * vf_loss
##Fisher loss construction
self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(neglogpac)
sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf))
self.vf_fisher = vf_fisher_loss = - vf_fisher_coef*tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss
self.params=params = find_trainable_variables("acktr_model")
self.grads_check = grads = tf.gradients(train_loss,params)
with tf.device('/gpu:0'):
self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\
momentum=0.9, kfac_update=1, epsilon=0.01,\
stats_decay=0.99, async_=1, cold_iter=10, max_grad_norm=max_grad_norm)
update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
train_op, q_runner = optim.apply_gradients(list(zip(grads,params)))
self.q_runner = q_runner
self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)
def train(obs, states, rewards, masks, actions, values):
advs = rewards - values
for step in range(len(obs)):
cur_lr = self.lr.value()
td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, PG_LR:cur_lr}
if states is not None:
td_map[train_model.S] = states
td_map[train_model.M] = masks
policy_loss, value_loss, policy_entropy, _ = sess.run(
[pg_loss, vf_loss, entropy, train_op],
td_map
)
return policy_loss, value_loss, policy_entropy
self.train = train
self.save = functools.partial(save_variables, sess=sess)
self.load = functools.partial(load_variables, sess=sess)
self.train_model = train_model
self.step_model = step_model
self.step = step_model.step
self.value = step_model.value
self.initial_state = step_model.initial_state
tf.global_variables_initializer().run(session=sess)
def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20,
ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
kfac_clip=0.001, save_interval=None, lrschedule='linear', load_path=None, **network_kwargs):
set_global_seeds(seed)
if network == 'cnn':
network_kwargs['one_dim_bias'] = True
policy = build_policy(env, network, **network_kwargs)
nenvs = env.num_envs
ob_space = env.observation_space
ac_space = env.action_space
make_model = lambda : Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs, nsteps
=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, vf_fisher_coef=
vf_fisher_coef, lr=lr, max_grad_norm=max_grad_norm, kfac_clip=kfac_clip,
lrschedule=lrschedule)
if save_interval and logger.get_dir():
import cloudpickle
with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
fh.write(cloudpickle.dumps(make_model))
model = make_model()
if load_path is not None:
model.load(load_path)
runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
nbatch = nenvs*nsteps
tstart = time.time()
coord = tf.train.Coordinator()
enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True)
for update in range(1, total_timesteps//nbatch+1):
obs, states, rewards, masks, actions, values = runner.run()
policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
model.old_obs = obs
nseconds = time.time()-tstart
fps = int((update*nbatch)/nseconds)
if update % log_interval == 0 or update == 1:
ev = explained_variance(values, rewards)
logger.record_tabular("nupdates", update)
logger.record_tabular("total_timesteps", update*nbatch)
logger.record_tabular("fps", fps)
logger.record_tabular("policy_entropy", float(policy_entropy))
logger.record_tabular("policy_loss", float(policy_loss))
logger.record_tabular("value_loss", float(value_loss))
logger.record_tabular("explained_variance", float(ev))
logger.dump_tabular()
if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
savepath = osp.join(logger.get_dir(), 'checkpoint%.5i'%update)
print('Saving to', savepath)
model.save(savepath)
coord.request_stop()
coord.join(enqueue_threads)
return model

View File

@@ -0,0 +1,5 @@
def mujoco():
return dict(
nsteps=2500,
value_network='copy'
)

View File

@@ -1,6 +1,8 @@
import tensorflow as tf
import numpy as np
import re
# flake8: noqa F403, F405
from baselines.acktr.kfac_utils import *
from functools import reduce
@@ -10,14 +12,14 @@ KFAC_DEBUG = False
class KfacOptimizer():
def __init__(self, learning_rate=0.01, momentum=0.9, clip_kl=0.01, kfac_update=2, stats_accum_iter=60, full_stats_init=False, cold_iter=100, cold_lr=None, async_=False, async_stats=False, epsilon=1e-2, stats_decay=0.95, blockdiag_bias=False, channel_fac=False, factored_damping=False, approxT2=False, use_float64=False, weight_decay_dict={},max_grad_norm=0.5):
def __init__(self, learning_rate=0.01, momentum=0.9, clip_kl=0.01, kfac_update=2, stats_accum_iter=60, full_stats_init=False, cold_iter=100, cold_lr=None, is_async=False, async_stats=False, epsilon=1e-2, stats_decay=0.95, blockdiag_bias=False, channel_fac=False, factored_damping=False, approxT2=False, use_float64=False, weight_decay_dict={},max_grad_norm=0.5):
self.max_grad_norm = max_grad_norm
self._lr = learning_rate
self._momentum = momentum
self._clip_kl = clip_kl
self._channel_fac = channel_fac
self._kfac_update = kfac_update
self._async = async_
self._async = is_async
self._async_stats = async_stats
self._epsilon = epsilon
self._stats_decay = stats_decay

View File

@@ -1,42 +0,0 @@
import numpy as np
import tensorflow as tf
from baselines.acktr.utils import dense, kl_div
import baselines.common.tf_util as U
class GaussianMlpPolicy(object):
def __init__(self, ob_dim, ac_dim):
# Here we'll construct a bunch of expressions, which will be used in two places:
# (1) When sampling actions
# (2) When computing loss functions, for the policy update
# Variables specific to (1) have the word "sampled" in them,
# whereas variables specific to (2) have the word "old" in them
ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim*2], name="ob") # batch of observations
oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac") # batch of actions previous actions
oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim*2], name="oldac_dist") # batch of actions previous action distributions
adv_n = tf.placeholder(tf.float32, shape=[None], name="adv") # advantage function estimate
wd_dict = {}
h1 = tf.nn.tanh(dense(ob_no, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
h2 = tf.nn.tanh(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
mean_na = dense(h2, ac_dim, "mean", weight_init=U.normc_initializer(0.1), bias_init=0.0, weight_loss_dict=wd_dict) # Mean control output
self.wd_dict = wd_dict
self.logstd_1a = logstd_1a = tf.get_variable("logstd", [ac_dim], tf.float32, tf.zeros_initializer()) # Variance on outputs
logstd_1a = tf.expand_dims(logstd_1a, 0)
std_1a = tf.exp(logstd_1a)
std_na = tf.tile(std_1a, [tf.shape(mean_na)[0], 1])
ac_dist = tf.concat([tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(std_na, [-1, ac_dim])], 1)
sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:,ac_dim:])) * ac_dist[:,ac_dim:] + ac_dist[:,:ac_dim] # This is the sampled action we'll perform.
logprobsampled_n = - tf.reduce_sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * tf.reduce_sum(tf.square(ac_dist[:,:ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of sampled action
logprob_n = - tf.reduce_sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * tf.reduce_sum(tf.square(ac_dist[:,:ac_dim] - oldac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy)
kl = tf.reduce_mean(kl_div(oldac_dist, ac_dist, ac_dim))
#kl = .5 * tf.reduce_mean(tf.square(logprob_n - oldlogprob_n)) # Approximation of KL divergence between old policy used to generate actions, and new policy used to compute logprob_n
surr = - tf.reduce_mean(adv_n * logprob_n) # Loss function that we'll differentiate to get the policy gradient
surr_sampled = - tf.reduce_mean(logprob_n) # Sampled loss of the policy
self._act = U.function([ob_no], [sampled_ac_na, ac_dist, logprobsampled_n]) # Generate a new action and its logprob
#self.compute_kl = U.function([ob_no, oldac_na, oldlogprob_n], kl) # Compute (approximate) KL divergence between old policy and new policy
self.compute_kl = U.function([ob_no, oldac_dist], kl)
self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled) # Input and output variables needed for computing loss
U.initialize() # Initialize uninitialized TF variables
def act(self, ob):
ac, ac_dist, logp = self._act(ob[None])
return ac[0], ac_dist[0], logp[0]

View File

@@ -1,34 +0,0 @@
#!/usr/bin/env python3
import tensorflow as tf
from baselines import logger
from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser
from baselines.acktr.acktr_cont import learn
from baselines.acktr.policies import GaussianMlpPolicy
from baselines.acktr.value_functions import NeuralNetValueFunction
def train(env_id, num_timesteps, seed):
env = make_mujoco_env(env_id, seed)
with tf.Session(config=tf.ConfigProto()):
ob_dim = env.observation_space.shape[0]
ac_dim = env.action_space.shape[0]
with tf.variable_scope("vf"):
vf = NeuralNetValueFunction(ob_dim, ac_dim)
with tf.variable_scope("pi"):
policy = GaussianMlpPolicy(ob_dim, ac_dim)
learn(env, policy=policy, vf=vf,
gamma=0.99, lam=0.97, timesteps_per_batch=2500,
desired_kl=0.002,
num_timesteps=num_timesteps, animate=False)
env.close()
def main():
args = mujoco_arg_parser().parse_args()
logger.configure()
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
if __name__ == "__main__":
main()

View File

@@ -1,50 +0,0 @@
from baselines import logger
import numpy as np
import baselines.common as common
from baselines.common import tf_util as U
import tensorflow as tf
from baselines.acktr import kfac
from baselines.acktr.utils import dense
class NeuralNetValueFunction(object):
def __init__(self, ob_dim, ac_dim): #pylint: disable=W0613
X = tf.placeholder(tf.float32, shape=[None, ob_dim*2+ac_dim*2+2]) # batch of observations
vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg')
wd_dict = {}
h1 = tf.nn.elu(dense(X, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
h2 = tf.nn.elu(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
vpred_n = dense(h2, 1, "hfinal", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)[:,0]
sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
wd_loss = tf.get_collection("vf_losses", None)
loss = tf.reduce_mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
loss_sampled = tf.reduce_mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))
self._predict = U.function([X], vpred_n)
optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9, \
clip_kl=0.3, epsilon=0.1, stats_decay=0.95, \
async_=1, kfac_update=2, cold_iter=50, \
weight_decay_dict=wd_dict, max_grad_norm=None)
vf_var_list = []
for var in tf.trainable_variables():
if "vf" in var.name:
vf_var_list.append(var)
update_op, self.q_runner = optim.minimize(loss, loss_sampled, var_list=vf_var_list)
self.do_update = U.function([X, vtarg_n], update_op) #pylint: disable=E1101
U.initialize() # Initialize uninitialized TF variables
def _preproc(self, path):
l = pathlength(path)
al = np.arange(l).reshape(-1,1)/10.0
act = path["action_dist"].astype('float32')
X = np.concatenate([path['observation'], act, al, np.ones((l, 1))], axis=1)
return X
def predict(self, path):
return self._predict(self._preproc(path))
def fit(self, paths, targvals):
X = np.concatenate([self._preproc(p) for p in paths])
y = np.concatenate(targvals)
logger.record_tabular("EVBefore", common.explained_variance(self._predict(X), y))
for _ in range(25): self.do_update(X, y)
logger.record_tabular("EVAfter", common.explained_variance(self._predict(X), y))
def pathlength(path):
return path["reward"].shape[0]

View File

@@ -7,9 +7,3 @@ exclude =
baselines/ddpg,
baselines/ppo1,
baselines/bench,
baselines/acktr,