diff --git a/baselines/ppo1/cnn_policy.py b/baselines/ppo1/cnn_policy.py
index b823e76..57160a2 100644
--- a/baselines/ppo1/cnn_policy.py
+++ b/baselines/ppo1/cnn_policy.py
@@ -18,7 +18,7 @@ class CnnPolicy(object):
         sequence_length = None
 
         ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
-
+
         x = ob / 255.0
         if kind == 'small': # from A3C paper
             x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
@@ -49,7 +49,7 @@ class CnnPolicy(object):
         ac1, vpred1 = self._act(stochastic, ob[None])
         return ac1[0], vpred1[0]
     def get_variables(self):
-        return tf.get_collection(tf.GraphKeys.VARIABLES, self.scope)
+        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)
     def get_trainable_variables(self):
         return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
     def get_initial_state(self):
diff --git a/baselines/ppo1/mlp_policy.py b/baselines/ppo1/mlp_policy.py
index 9211891..4be3b19 100644
--- a/baselines/ppo1/mlp_policy.py
+++ b/baselines/ppo1/mlp_policy.py
@@ -18,7 +18,7 @@ class MlpPolicy(object):
         sequence_length = None
 
         ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
-
+
         with tf.variable_scope("obfilter"):
             self.ob_rms = RunningMeanStd(shape=ob_space.shape)
 
@@ -27,12 +27,12 @@ class MlpPolicy(object):
         for i in range(num_hid_layers):
             last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
         self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:,0]
-
+
         last_out = obz
         for i in range(num_hid_layers):
             last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
         if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
-            mean = U.dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01))
+            mean = U.dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01))
             logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
             pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
         else:
@@ -51,7 +51,7 @@ class MlpPolicy(object):
         ac1, vpred1 = self._act(stochastic, ob[None])
         return ac1[0], vpred1[0]
     def get_variables(self):
-        return tf.get_collection(tf.GraphKeys.VARIABLES, self.scope)
+        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)
     def get_trainable_variables(self):
         return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
     def get_initial_state(self):
diff --git a/baselines/trpo_mpi/nosharing_cnn_policy.py b/baselines/trpo_mpi/nosharing_cnn_policy.py
index 49fc3b8..6f96cd5 100644
--- a/baselines/trpo_mpi/nosharing_cnn_policy.py
+++ b/baselines/trpo_mpi/nosharing_cnn_policy.py
@@ -18,7 +18,7 @@ class CnnPolicy(object):
         sequence_length = None
 
         ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
-
+
         obscaled = ob / 255.0
 
         with tf.variable_scope("pol"):
@@ -49,7 +49,7 @@ class CnnPolicy(object):
         ac1, vpred1 = self._act(stochastic, ob[None])
         return ac1[0], vpred1[0]
     def get_variables(self):
-        return tf.get_collection(tf.GraphKeys.VARIABLES, self.scope)
+        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)
     def get_trainable_variables(self):
         return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
     def get_initial_state(self):
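
Note on the non-whitespace change above: tf.GraphKeys.VARIABLES was a deprecated alias for tf.GraphKeys.GLOBAL_VARIABLES in early TensorFlow 1.x and was removed in later releases, so get_variables() in these policies fails on newer TF without this rename. Below is a minimal sketch (not part of the patch; assumes TensorFlow 1.x graph mode, and the scope/variable names are illustrative only) of what the two collections used by get_variables() and get_trainable_variables() contain:

```python
# Minimal sketch, assuming TensorFlow 1.x graph mode. The scope "pi" and the
# variable names are hypothetical, chosen only to illustrate the collections.
import tensorflow as tf

with tf.variable_scope("pi"):
    w = tf.get_variable("w", shape=[4, 2])            # trainable weight
    ob_mean = tf.get_variable("ob_mean", shape=[4],
                              trainable=False)        # non-trainable statistic

# Everything created under the scope, trainable or not
# (what get_variables() returns after this patch).
all_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="pi")

# Only the parameters an optimizer would update
# (what get_trainable_variables() returns).
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="pi")

print([v.name for v in all_vars])    # ['pi/w:0', 'pi/ob_mean:0']
print([v.name for v in train_vars])  # ['pi/w:0']
```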