Fix trpo_mpi bug where logstd wasn’t included in the policy variable list
This commit is contained in:
@@ -146,8 +146,9 @@ def learn(env, policy_func, reward_giver, expert_dataset, rank,
|
|||||||
dist = meankl
|
dist = meankl
|
||||||
|
|
||||||
all_var_list = pi.get_trainable_variables()
|
all_var_list = pi.get_trainable_variables()
|
||||||
var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
|
var_list = [v for v in all_var_list if v.name.split("/")[1] == "pol"]
|
||||||
vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
|
vf_var_list = [v for v in all_var_list if v.name.split("/")[1] == "vf"]
|
||||||
|
assert len(var_list) == len(vf_var_list) + 1
|
||||||
d_adam = MpiAdam(reward_giver.get_trainable_variables())
|
d_adam = MpiAdam(reward_giver.get_trainable_variables())
|
||||||
vfadam = MpiAdam(vf_var_list)
|
vfadam = MpiAdam(vf_var_list)
|
||||||
|
|
||||||
|
@@ -22,21 +22,23 @@ class MlpPolicy(object):
|
|||||||
with tf.variable_scope("obfilter"):
|
with tf.variable_scope("obfilter"):
|
||||||
self.ob_rms = RunningMeanStd(shape=ob_space.shape)
|
self.ob_rms = RunningMeanStd(shape=ob_space.shape)
|
||||||
|
|
||||||
obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
|
with tf.variable_scope('vf'):
|
||||||
last_out = obz
|
obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
|
||||||
for i in range(num_hid_layers):
|
last_out = obz
|
||||||
last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="vffc%i"%(i+1), kernel_initializer=U.normc_initializer(1.0)))
|
for i in range(num_hid_layers):
|
||||||
self.vpred = tf.layers.dense(last_out, 1, name='vffinal', kernel_initializer=U.normc_initializer(1.0))[:,0]
|
last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="fc%i"%(i+1), kernel_initializer=U.normc_initializer(1.0)))
|
||||||
|
self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:,0]
|
||||||
|
|
||||||
last_out = obz
|
with tf.variable_scope('pol'):
|
||||||
for i in range(num_hid_layers):
|
last_out = obz
|
||||||
last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='polfc%i'%(i+1), kernel_initializer=U.normc_initializer(1.0)))
|
for i in range(num_hid_layers):
|
||||||
if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
|
last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i'%(i+1), kernel_initializer=U.normc_initializer(1.0)))
|
||||||
mean = tf.layers.dense(last_out, pdtype.param_shape()[0]//2, name='polfinal', kernel_initializer=U.normc_initializer(0.01))
|
if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
|
||||||
logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
|
mean = tf.layers.dense(last_out, pdtype.param_shape()[0]//2, name='final', kernel_initializer=U.normc_initializer(0.01))
|
||||||
pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
|
logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
|
||||||
else:
|
pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
|
||||||
pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='polfinal', kernel_initializer=U.normc_initializer(0.01))
|
else:
|
||||||
|
pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01))
|
||||||
|
|
||||||
self.pd = pdtype.pdfromflat(pdparam)
|
self.pd = pdtype.pdfromflat(pdparam)
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user