Fix atari wrapper (affecting a2c perf) and pposgd mujoco performance

- removed value-function (vf) clipping in pposgd; it was severely degrading performance on MuJoCo because it didn’t account for the scale of the returns
- switched adam epsilon in pposgd_simple
- brought back no-ops in atari wrapper (oops)
- added readmes
- revamped run_X_benchmark scripts to have standard form
- cleaned up DDPG a little, removed deprecated SimpleMonitor and non-idiomatic usage of logger
This commit is contained in:
John Schulman
2017-08-27 22:13:48 -07:00
parent 3f676f7d1e
commit d9f194f797
22 changed files with 133 additions and 229 deletions

5
baselines/ddpg/README.md Normal file
View File

@@ -0,0 +1,5 @@
# DDPG
- Original paper: https://arxiv.org/abs/1509.02971
- Baselines post: https://blog.openai.com/better-exploration-with-parameter-noise/
- `python -m baselines.ddpg.main` runs the algorithm for 1M frames = 10M timesteps on a MuJoCo environment. Run it with `-h` to see the help text for more options.

View File

@@ -1,19 +1,11 @@
import argparse
import time
import os
from tempfile import mkdtemp
import sys
import subprocess
import threading
import json
from baselines.common.mpi_fork import mpi_fork
from baselines import logger
from baselines.logger import Logger
import logging
from baselines import logger, bench
from baselines.common.misc_util import (
set_global_seeds,
boolean_flag,
SimpleMonitor
)
import baselines.ddpg.training as training
from baselines.ddpg.models import Actor, Critic
@@ -24,42 +16,22 @@ import gym
import tensorflow as tf
from mpi4py import MPI
def run(env_id, seed, noise_type, num_cpu, layer_norm, logdir, gym_monitor, evaluation, bind_to_core, **kwargs):
kwargs['logdir'] = logdir
whoami = mpi_fork(num_cpu, bind_to_core=bind_to_core)
if whoami == 'parent':
sys.exit(0)
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
# Configure things.
rank = MPI.COMM_WORLD.Get_rank()
if rank != 0:
# Write to temp directory for all non-master workers.
actual_dir = None
Logger.CURRENT.close()
Logger.CURRENT = Logger(dir=mkdtemp(), output_formats=[])
logger.set_level(logger.DISABLED)
# Create envs.
if rank == 0:
env = gym.make(env_id)
if gym_monitor and logdir:
env = gym.wrappers.Monitor(env, os.path.join(logdir, 'gym_train'), force=True)
env = SimpleMonitor(env)
if rank != 0: logger.set_level(logger.DISABLED)
if evaluation:
eval_env = gym.make(env_id)
if gym_monitor and logdir:
eval_env = gym.wrappers.Monitor(eval_env, os.path.join(logdir, 'gym_eval'), force=True)
eval_env = SimpleMonitor(eval_env)
else:
eval_env = None
# Create envs.
env = gym.make(env_id)
env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), "%i.monitor.json"%rank))
gym.logger.setLevel(logging.WARN)
if evaluation and rank==0:
eval_env = gym.make(env_id)
eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
env = bench.Monitor(env, None)
else:
env = gym.make(env_id)
if evaluation:
eval_env = gym.make(env_id)
else:
eval_env = None
eval_env = None
# Parse noise_type
action_noise = None
@@ -103,22 +75,20 @@ def run(env_id, seed, noise_type, num_cpu, layer_norm, logdir, gym_monitor, eval
env.close()
if eval_env is not None:
eval_env.close()
Logger.CURRENT.close()
if rank == 0:
logger.info('total runtime: {}s'.format(time.time() - start_time))
def parse_args():
parser = argparse.ArgumentParser()
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--env-id', type=str, default='HalfCheetah-v1')
boolean_flag(parser, 'render-eval', default=False)
boolean_flag(parser, 'layer-norm', default=True)
boolean_flag(parser, 'render', default=False)
parser.add_argument('--num-cpu', type=int, default=1)
boolean_flag(parser, 'normalize-returns', default=False)
boolean_flag(parser, 'normalize-observations', default=True)
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
parser.add_argument('--critic-l2-reg', type=float, default=1e-2)
parser.add_argument('--batch-size', type=int, default=64) # per MPI worker
parser.add_argument('--actor-lr', type=float, default=1e-4)
@@ -133,29 +103,11 @@ def parse_args():
parser.add_argument('--nb-eval-steps', type=int, default=100) # per epoch cycle and MPI worker
parser.add_argument('--nb-rollout-steps', type=int, default=100) # per epoch cycle and MPI worker
parser.add_argument('--noise-type', type=str, default='adaptive-param_0.2') # choices are adaptive-param_xx, ou_xx, normal_xx, none
parser.add_argument('--logdir', type=str, default=None)
boolean_flag(parser, 'gym-monitor', default=False)
boolean_flag(parser, 'evaluation', default=True)
boolean_flag(parser, 'bind-to-core', default=False)
boolean_flag(parser, 'evaluation', default=False)
return vars(parser.parse_args())
if __name__ == '__main__':
args = parse_args()
# Figure out what logdir to use.
if args['logdir'] is None:
args['logdir'] = os.getenv('OPENAI_LOGDIR')
# Print and save arguments.
logger.info('Arguments:')
for key in sorted(args.keys()):
logger.info('{}: {}'.format(key, args[key]))
logger.info('')
if args['logdir']:
with open(os.path.join(args['logdir'], 'args.json'), 'w') as f:
json.dump(args, f)
# Run actual script.
run(**args)

View File

@@ -14,7 +14,7 @@ from mpi4py import MPI
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic,
normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, logdir,
normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise,
popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory,
tau=0.01, eval_env=None, param_noise_adaption_interval=50):
rank = MPI.COMM_WORLD.Get_rank()
@@ -178,7 +178,7 @@ def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, pa
logger.record_tabular(key, combined_stats[key])
logger.dump_tabular()
logger.info('')
logdir = logger.get_dir()
if rank == 0 and logdir:
if hasattr(env, 'get_state'):
with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: