fix gail tf_util usage

Author: Oleg Klimov
Date:   2018-02-05 07:51:27 -08:00
parent 16d7d23b7d
commit 2793971c10

4 changed files with 22 additions and 22 deletions

View File

@@ -41,7 +41,7 @@ class TransitionClassifier(object):
         expert_loss = tf.reduce_mean(expert_loss)
         # Build entropy loss
         logits = tf.concat([generator_logits, expert_logits], 0)
-        entropy = tf.reduce_mean(U.logit_bernoulli_entropy(logits))
+        entropy = tf.reduce_mean(logit_bernoulli_entropy(logits))
         entropy_loss = -entcoeff*entropy
         # Loss + Accuracy terms
         self.losses = [generator_loss, expert_loss, entropy, entropy_loss, generator_acc, expert_acc]
@@ -77,7 +77,7 @@ class TransitionClassifier(object):
         return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)

     def get_reward(self, obs, acs):
-        sess = U.get_session()
+        sess = tf.get_default_session()
         if len(obs.shape) == 1:
             obs = np.expand_dims(obs, 0)
         if len(acs.shape) == 1:
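Two tf_util dependencies are dropped here: U.get_session() becomes tf.get_default_session(), and logit_bernoulli_entropy loses its U. prefix, so it is now expected to resolve to a module-level helper rather than a tf_util export. For reference, a minimal sketch of that entropy term, assuming the standard logits-parameterized Bernoulli entropy used in the GAIL/imitation reference code (the function bodies below are illustrative, not necessarily the repo's exact code):

import tensorflow as tf

def logsigmoid(a):
    # Numerically stable log(sigmoid(a)).
    return -tf.nn.softplus(-a)

def logit_bernoulli_entropy(logits):
    # Entropy of Bernoulli(p = sigmoid(x)), written directly in terms of the logits:
    # H = (1 - sigmoid(x)) * x - log(sigmoid(x))
    return (1.0 - tf.nn.sigmoid(logits)) * logits - logsigmoid(logits)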

View File

@@ -9,6 +9,7 @@ import gym
 import baselines.common.tf_util as U
 from baselines.common.mpi_running_mean_std import RunningMeanStd
 from baselines.common.distributions import make_pdtype
+from baselines.acktr.utils import dense


 class MlpPolicy(object):
@@ -35,18 +36,19 @@ class MlpPolicy(object):
         obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

         last_out = obz
         for i in range(num_hid_layers):
-            last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i" % (i+1), weight_init=U.normc_initializer(1.0)))
-        self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]
+            last_out = tf.nn.tanh(dense(last_out, hid_size, "vffc%i" % (i+1), weight_init=U.normc_initializer(1.0)))
+        self.vpred = dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]

         last_out = obz
         for i in range(num_hid_layers):
-            last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i" % (i+1), weight_init=U.normc_initializer(1.0)))
+            last_out = tf.nn.tanh(dense(last_out, hid_size, "polfc%i" % (i+1), weight_init=U.normc_initializer(1.0)))
         if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
-            mean = U.dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01))
+            mean = dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01))
             logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
-            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
+            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
         else:
-            pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))
+            pdparam = dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

         self.pd = pdtype.pdfromflat(pdparam)
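In this file the layer helper U.dense is swapped for dense imported from baselines.acktr.utils, and U.concatenate for tf.concat, while U.normc_initializer stays as the weight initializer. A minimal sketch of what such a layer amounts to, assuming the usual normc scheme (Gaussian weights rescaled column-wise) and a plain affine map; this is illustrative, not the baselines implementation verbatim:

import numpy as np
import tensorflow as tf

def normc_initializer(std=1.0):
    # Draw Gaussian weights and rescale each column to norm `std`.
    def _initializer(shape, dtype=None, partition_info=None):
        out = np.random.randn(*shape).astype(np.float32)
        out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
        return tf.constant(out)
    return _initializer

def dense(x, size, name, weight_init=None):
    # Affine layer y = xW + b, with variables scoped under `name`.
    with tf.variable_scope(name):
        w = tf.get_variable("w", [x.get_shape()[1], size], initializer=weight_init)
        b = tf.get_variable("b", [size], initializer=tf.zeros_initializer())
        return tf.matmul(x, w) + b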

View File

@@ -22,7 +22,7 @@ from baselines.gail.adversary import TransitionClassifier

 def argsparser():
     parser = argparse.ArgumentParser("Tensorflow Implementation of GAIL")
-    parser.add_argument('--env_id', help='environment ID', default='Hopper-v1')
+    parser.add_argument('--env_id', help='environment ID', default='Hopper-v2')
     parser.add_argument('--seed', help='RNG seed', type=int, default=0)
     parser.add_argument('--expert_path', type=str, default='data/deterministic.trpo.Hopper.0.00.npz')
     parser.add_argument('--checkpoint_dir', help='the directory to save model', default='checkpoint')

View File

@@ -130,14 +130,14 @@ def learn(env, policy_func, reward_giver, expert_dataset, rank,
     kloldnew = oldpi.pd.kl(pi.pd)
     ent = pi.pd.entropy()
-    meankl = tf_util.reduce_mean(kloldnew)
-    meanent = tf_util.reduce_mean(ent)
+    meankl = tf.reduce_mean(kloldnew)
+    meanent = tf.reduce_mean(ent)
     entbonus = entcoeff * meanent

-    vferr = tf_util.reduce_mean(tf.square(pi.vpred - ret))
+    vferr = tf.reduce_mean(tf.square(pi.vpred - ret))

     ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold
-    surrgain = tf_util.reduce_mean(ratio * atarg)
+    surrgain = tf.reduce_mean(ratio * atarg)

     optimgain = surrgain + entbonus
     losses = [optimgain, meankl, entbonus, surrgain, meanent]
@@ -146,8 +146,8 @@ def learn(env, policy_func, reward_giver, expert_dataset, rank,
     dist = meankl

     all_var_list = pi.get_trainable_variables()
-    var_list = [v for v in all_var_list if v.name.split("/")[1] == "pol"]
-    vf_var_list = [v for v in all_var_list if v.name.split("/")[1] == "vf"]
+    var_list = [v for v in all_var_list if v.name.startswith("pi/pol") or v.name.startswith("pi/logstd")]
+    vf_var_list = [v for v in all_var_list if v.name.startswith("pi/vff")]
     assert len(var_list) == len(vf_var_list) + 1
     d_adam = MpiAdam(reward_giver.get_trainable_variables())
     vfadam = MpiAdam(vf_var_list)
@@ -163,7 +163,7 @@ def learn(env, policy_func, reward_giver, expert_dataset, rank,
         sz = U.intprod(shape)
         tangents.append(tf.reshape(flat_tangent[start:start+sz], shape))
         start += sz
-    gvp = tf.add_n([U.sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) # pylint: disable=E1111
+    gvp = tf.add_n([tf.reduce_sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) # pylint: disable=E1111
     fvp = U.flatgrad(gvp, var_list)

     assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv)
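Here U.sum is replaced by tf.reduce_sum inside the Fisher-vector-product construction used by TRPO's conjugate-gradient step. For context, a standalone sketch of what gvp/fvp compute (the Hessian-vector product of the mean KL with a probe vector); the variable names mirror the hunk, but the wrapper function itself is illustrative:

import numpy as np
import tensorflow as tf

def fisher_vector_product_graph(meankl, var_list, flat_tangent):
    # Gradient of the mean KL w.r.t. each policy variable.
    klgrads = tf.gradients(meankl, var_list)
    # Unflatten the probe vector into per-variable tangents.
    shapes = [v.get_shape().as_list() for v in var_list]
    start, tangents = 0, []
    for shape in shapes:
        sz = int(np.prod(shape))
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    # gvp = grad(KL) . v ; differentiating it again gives (approximately) F v.
    gvp = tf.add_n([tf.reduce_sum(g * t) for g, t in zip(klgrads, tangents)])
    fvp_parts = tf.gradients(gvp, var_list)
    return tf.concat([tf.reshape(g, [-1]) for g in fvp_parts], axis=0)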
@@ -190,7 +190,6 @@ def learn(env, policy_func, reward_giver, expert_dataset, rank,
         out /= nworkers
         return out

-    writer = U.file_writer(log_dir)
     U.initialize()
     th_init = get_flat()
     MPI.COMM_WORLD.Bcast(th_init, root=0)
@@ -232,7 +231,10 @@ def learn(env, policy_func, reward_giver, expert_dataset, rank,

         # Save model
         if rank == 0 and iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
-            U.save_state(os.path.join(ckpt_dir, task_name))
+            fname = os.path.join(ckpt_dir, task_name)
+            os.makedirs(os.path.dirname(fname), exist_ok=True)
+            saver = tf.train.Saver()
+            saver.save(tf.get_default_session(), fname)

         logger.log("********** Iteration %i ************" % iters_so_far)
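The tf_util checkpoint helper U.save_state is replaced with a direct tf.train.Saver over the default session. For completeness, a hedged sketch of the matching restore side, assuming the same graph has already been rebuilt and a default session is active; load_checkpoint is a hypothetical helper, not part of this commit:

import os
import tensorflow as tf

def load_checkpoint(ckpt_dir, task_name):
    # Restore variables previously written by saver.save(tf.get_default_session(), fname).
    fname = os.path.join(ckpt_dir, task_name)
    saver = tf.train.Saver()
    saver.restore(tf.get_default_session(), fname)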
@@ -346,10 +348,6 @@ def learn(env, policy_func, reward_giver, expert_dataset, rank,
         if rank == 0:
             logger.dump_tabular()
-            g_loss_stats.add_all_summary(writer, g_losses, iters_so_far)
-            d_loss_stats.add_all_summary(writer, np.mean(d_losses, axis=0), iters_so_far)
-            ep_stats.add_all_summary(writer, [np.mean(true_rewbuffer), np.mean(rewbuffer),
-                                              np.mean(lenbuffer)], iters_so_far)


 def flatten_lists(listoflists):