diff --git a/baselines/ppo1/pposgd_simple.py b/baselines/ppo1/pposgd_simple.py
index 2ce734f..60d1d3a 100644
--- a/baselines/ppo1/pposgd_simple.py
+++ b/baselines/ppo1/pposgd_simple.py
@@ -97,7 +97,6 @@ def learn(env, policy_fn, *,
     ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return
 
     lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule
-    clip_param = clip_param * lrmult # Annealed clipping parameter epsilon
 
     ob = U.get_placeholder_cached(name="ob")
     ac = pi.pdtype.sample_placeholder([None])
diff --git a/baselines/ppo1/run_humanoid.py b/baselines/ppo1/run_humanoid.py
index 17b42b5..15b2ad0 100644
--- a/baselines/ppo1/run_humanoid.py
+++ b/baselines/ppo1/run_humanoid.py
@@ -19,16 +19,17 @@ def train(num_timesteps, seed, model_path=None):
     # these are good enough to make humanoid walk, but whether those are
     # an absolute best or not is not certain
     env = RewScale(env, 0.1)
+    logger.log("NOTE: reward will be scaled by a factor of 10 in logged stats. Check the monitor for unscaled reward.")
     pi = pposgd_simple.learn(env, policy_fn,
             max_timesteps=num_timesteps,
             timesteps_per_actorbatch=2048,
-            clip_param=0.2, entcoeff=0.0,
+            clip_param=0.1, entcoeff=0.0,
             optim_epochs=10,
-            optim_stepsize=3e-4,
+            optim_stepsize=1e-4,
             optim_batchsize=64,
             gamma=0.99,
             lam=0.95,
-            schedule='linear',
+            schedule='constant',
         )
     env.close()
     if model_path:
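
For context (not part of the patch): the removed line in pposgd_simple.py multiplied clip_param by lrmult, annealing the clip range of PPO's surrogate objective over training; with that line gone and schedule='constant', the clip range stays fixed at the clip_param passed in (0.1 here). A minimal NumPy sketch of the clipped surrogate that clip_param controls is shown below; the function name and sample arrays are illustrative only, not code from the repository.

# Minimal sketch (not the baselines implementation) of PPO's clipped
# surrogate objective, which clip_param parameterizes.
import numpy as np

def clipped_surrogate(ratio, adv, clip_param):
    # ratio = pi_new(a|s) / pi_old(a|s), adv = estimated advantage.
    # PPO takes the elementwise minimum of the unclipped and clipped
    # objectives, so overly large policy updates stop being rewarded.
    unclipped = ratio * adv
    clipped = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv
    return np.mean(np.minimum(unclipped, clipped))

# Made-up example values, just to show the shapes involved.
ratio = np.array([0.8, 1.0, 1.3])
adv = np.array([1.0, -0.5, 2.0])
print(clipped_surrogate(ratio, adv, clip_param=0.1))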