fix gail tf_util usage
@@ -41,7 +41,7 @@ class TransitionClassifier(object):
         expert_loss = tf.reduce_mean(expert_loss)
         # Build entropy loss
         logits = tf.concat([generator_logits, expert_logits], 0)
-        entropy = tf.reduce_mean(U.logit_bernoulli_entropy(logits))
+        entropy = tf.reduce_mean(logit_bernoulli_entropy(logits))
         entropy_loss = -entcoeff*entropy
         # Loss + Accuracy terms
         self.losses = [generator_loss, expert_loss, entropy, entropy_loss, generator_acc, expert_acc]
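
For context, logit_bernoulli_entropy is the entropy of a Bernoulli distribution parameterised in logit space; the hunk only switches from the tf_util-qualified name to the module-level helper. A minimal, hypothetical sketch of such a function (TF1 style; the exact helper in adversary.py may differ):

import tensorflow as tf

def logit_bernoulli_entropy_sketch(logits):
    # Entropy of Bernoulli(p) with p = sigmoid(logits):
    #   H = -p*log(p) - (1-p)*log(1-p)
    # which simplifies to (1 - sigmoid(x)) * x + softplus(-x).
    return (1. - tf.nn.sigmoid(logits)) * logits + tf.nn.softplus(-logits)

# Usage mirroring the hunk above (generator_logits/expert_logits assumed given):
# logits = tf.concat([generator_logits, expert_logits], 0)
# entropy = tf.reduce_mean(logit_bernoulli_entropy_sketch(logits))
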
@@ -77,7 +77,7 @@ class TransitionClassifier(object):
         return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)

     def get_reward(self, obs, acs):
-        sess = U.get_session()
+        sess = tf.get_default_session()
         if len(obs.shape) == 1:
             obs = np.expand_dims(obs, 0)
         if len(acs.shape) == 1:

@@ -9,6 +9,7 @@ import gym
 import baselines.common.tf_util as U
 from baselines.common.mpi_running_mean_std import RunningMeanStd
 from baselines.common.distributions import make_pdtype
+from baselines.acktr.utils import dense


 class MlpPolicy(object):
@@ -35,18 +36,19 @@ class MlpPolicy(object):
         obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
         last_out = obz
         for i in range(num_hid_layers):
-            last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i" % (i+1), weight_init=U.normc_initializer(1.0)))
-        self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]
+            last_out = tf.nn.tanh(dense(last_out, hid_size, "vffc%i" % (i+1), weight_init=U.normc_initializer(1.0)))
+        self.vpred = dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]

         last_out = obz
         for i in range(num_hid_layers):
-            last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i" % (i+1), weight_init=U.normc_initializer(1.0)))
+            last_out = tf.nn.tanh(dense(last_out, hid_size, "polfc%i" % (i+1), weight_init=U.normc_initializer(1.0)))

         if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
-            mean = U.dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01))
+            mean = dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01))
             logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
-            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
+            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
         else:
-            pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))
+            pdparam = dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

         self.pd = pdtype.pdfromflat(pdparam)
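
Here dense refers to the fully connected layer helper imported above from baselines.acktr.utils. A minimal sketch of a compatible layer, with the signature inferred from the call sites (this is an illustration, not the actual baselines implementation):

import tensorflow as tf

def dense_sketch(x, size, name, weight_init=None, bias_init=0.):
    # Accepts calls of the form dense(last_out, hid_size, "vffc1", weight_init=U.normc_initializer(1.0)).
    with tf.variable_scope(name):
        w = tf.get_variable("w", [x.get_shape()[1], size], initializer=weight_init)
        b = tf.get_variable("b", [size], initializer=tf.constant_initializer(bias_init))
        return tf.matmul(x, w) + b
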
@@ -22,7 +22,7 @@ from baselines.gail.adversary import TransitionClassifier

 def argsparser():
     parser = argparse.ArgumentParser("Tensorflow Implementation of GAIL")
-    parser.add_argument('--env_id', help='environment ID', default='Hopper-v1')
+    parser.add_argument('--env_id', help='environment ID', default='Hopper-v2')
     parser.add_argument('--seed', help='RNG seed', type=int, default=0)
     parser.add_argument('--expert_path', type=str, default='data/deterministic.trpo.Hopper.0.00.npz')
     parser.add_argument('--checkpoint_dir', help='the directory to save model', default='checkpoint')

@@ -130,14 +130,14 @@ def learn(env, policy_func, reward_giver, expert_dataset, rank,

     kloldnew = oldpi.pd.kl(pi.pd)
     ent = pi.pd.entropy()
-    meankl = tf_util.reduce_mean(kloldnew)
-    meanent = tf_util.reduce_mean(ent)
+    meankl = tf.reduce_mean(kloldnew)
+    meanent = tf.reduce_mean(ent)
     entbonus = entcoeff * meanent

-    vferr = tf_util.reduce_mean(tf.square(pi.vpred - ret))
+    vferr = tf.reduce_mean(tf.square(pi.vpred - ret))

     ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # advantage * pnew / pold
-    surrgain = tf_util.reduce_mean(ratio * atarg)
+    surrgain = tf.reduce_mean(ratio * atarg)

     optimgain = surrgain + entbonus
     losses = [optimgain, meankl, entbonus, surrgain, meanent]
@@ -146,8 +146,8 @@ def learn(env, policy_func, reward_giver, expert_dataset, rank,
     dist = meankl

     all_var_list = pi.get_trainable_variables()
-    var_list = [v for v in all_var_list if v.name.split("/")[1] == "pol"]
-    vf_var_list = [v for v in all_var_list if v.name.split("/")[1] == "vf"]
+    var_list = [v for v in all_var_list if v.name.startswith("pi/pol") or v.name.startswith("pi/logstd")]
+    vf_var_list = [v for v in all_var_list if v.name.startswith("pi/vff")]
     assert len(var_list) == len(vf_var_list) + 1
     d_adam = MpiAdam(reward_giver.get_trainable_variables())
     vfadam = MpiAdam(vf_var_list)
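
The new filters rely on TF1 variable naming: parameters created under the policy's "pi" scope get names such as pi/polfc1/w:0, pi/vffc1/w:0 and pi/logstd:0, so a prefix match separates policy parameters from value-function parameters; the lone logstd variable presumably accounts for the + 1 in the assert. A small self-contained sketch of that naming behaviour (the layer names below are hypothetical stand-ins):

import tensorflow as tf

with tf.variable_scope("pi"):
    with tf.variable_scope("polfc1"):      # stand-in policy layer
        tf.get_variable("w", [4, 8])
    with tf.variable_scope("vffc1"):       # stand-in value-function layer
        tf.get_variable("w", [4, 8])
    tf.get_variable("logstd", [1, 2])

all_var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "pi")
var_list = [v for v in all_var_list if v.name.startswith("pi/pol") or v.name.startswith("pi/logstd")]
vf_var_list = [v for v in all_var_list if v.name.startswith("pi/vff")]
# var_list  -> [pi/polfc1/w:0, pi/logstd:0]
# vf_var_list -> [pi/vffc1/w:0]
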
@@ -163,7 +163,7 @@ def learn(env, policy_func, reward_giver, expert_dataset, rank,
         sz = U.intprod(shape)
         tangents.append(tf.reshape(flat_tangent[start:start+sz], shape))
         start += sz
-    gvp = tf.add_n([U.sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)])  # pylint: disable=E1111
+    gvp = tf.add_n([tf.reduce_sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)])  # pylint: disable=E1111
     fvp = U.flatgrad(gvp, var_list)

     assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv)
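
The gvp/fvp pair is the standard Hessian-vector product trick: take the gradient of the KL, form its inner product with a fixed tangent vector, and differentiate again, which yields the Fisher-vector product needed by conjugate gradient without ever materialising the matrix. A toy, self-contained illustration (not the baselines code) with a quadratic in place of the mean KL:

import tensorflow as tf

theta = tf.Variable([1.0, 2.0, 3.0], name="theta_demo")
kl_surrogate = tf.reduce_sum(tf.square(theta))                 # stand-in for meankl
tangent = tf.placeholder(tf.float32, [3], name="tangent_demo")

grads = tf.gradients(kl_surrogate, [theta])                    # dKL/dtheta
gvp = tf.add_n([tf.reduce_sum(g * tangent) for g in grads])    # gradient . tangent
hvp = tf.gradients(gvp, [theta])                               # Hessian-vector product
# For this quadratic, hvp evaluates to 2 * tangent.
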
@@ -190,7 +190,6 @@ def learn(env, policy_func, reward_giver, expert_dataset, rank,
         out /= nworkers
         return out

-    writer = U.file_writer(log_dir)
     U.initialize()
     th_init = get_flat()
     MPI.COMM_WORLD.Bcast(th_init, root=0)
@@ -232,7 +231,10 @@ def learn(env, policy_func, reward_giver, expert_dataset, rank,

         # Save model
         if rank == 0 and iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
-            U.save_state(os.path.join(ckpt_dir, task_name))
+            fname = os.path.join(ckpt_dir, task_name)
+            os.makedirs(os.path.dirname(fname), exist_ok=True)
+            saver = tf.train.Saver()
+            saver.save(tf.get_default_session(), fname)

         logger.log("********** Iteration %i ************" % iters_so_far)

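
The replacement checkpointing uses stock TF1 APIs (tf.train.Saver plus the default session). A minimal standalone sketch of the same save/restore pattern, with a placeholder path and a dummy variable:

import os
import tensorflow as tf

fname = os.path.join("checkpoint", "some_task")    # placeholder path
os.makedirs(os.path.dirname(fname), exist_ok=True)

v = tf.Variable(0.0, name="dummy")
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.save(sess, fname)     # writes fname.index, fname.data-*, and a 'checkpoint' file
    saver.restore(sess, fname)  # restores the saved values into the same graph
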
@@ -346,10 +348,6 @@ def learn(env, policy_func, reward_giver, expert_dataset, rank,

         if rank == 0:
             logger.dump_tabular()
-            g_loss_stats.add_all_summary(writer, g_losses, iters_so_far)
-            d_loss_stats.add_all_summary(writer, np.mean(d_losses, axis=0), iters_so_far)
-            ep_stats.add_all_summary(writer, [np.mean(true_rewbuffer), np.mean(rewbuffer),
-                                              np.mean(lenbuffer)], iters_so_far)


 def flatten_lists(listoflists):