From 36ee5d17071424f30071bcdc72ff11b18c577529 Mon Sep 17 00:00:00 2001
From: pzhokhov
Date: Wed, 6 Jun 2018 11:39:13 -0700
Subject: [PATCH] Import internal changes (#422)

* import rl-algs from 2e3a166 commit

* extra import of the baselines badge

* exported commit with identity test

* proper rng seeding in the test_identity

* import internal
---
 baselines/common/cmd_util.py                |  6 +-
 baselines/common/tf_util.py                 |  1 -
 baselines/common/vec_env/__init__.py        |  2 +-
 baselines/common/vec_env/dummy_vec_env.py   |  4 +-
 baselines/common/vec_env/subproc_vec_env.py | 20 +++++-
 baselines/logger.py                         | 21 +++---
 baselines/ppo1/README.md                    |  2 +
 baselines/ppo1/pposgd_simple.py             |  2 +
 baselines/ppo1/run_humanoid.py              | 75 +++++++++++++++++++++
 baselines/ppo1/run_robotics.py              | 40 +++++++++++
 10 files changed, 152 insertions(+), 21 deletions(-)
 create mode 100644 baselines/ppo1/run_humanoid.py
 create mode 100644 baselines/ppo1/run_robotics.py

diff --git a/baselines/common/cmd_util.py b/baselines/common/cmd_util.py
index 64df993..5707695 100644
--- a/baselines/common/cmd_util.py
+++ b/baselines/common/cmd_util.py
@@ -3,6 +3,7 @@
 Helpers for scripts like run_atari.py.
 """
 import os
+from mpi4py import MPI
 import gym
 from gym.wrappers import FlattenDictWrapper
 from baselines import logger
@@ -30,9 +31,10 @@ def make_mujoco_env(env_id, seed):
     """
     Create a wrapped, monitored gym.Env for MuJoCo.
     """
-    set_global_seeds(seed)
+    rank = MPI.COMM_WORLD.Get_rank()
+    set_global_seeds(seed + 10000 * rank)
     env = gym.make(env_id)
-    env = Monitor(env, logger.get_dir())
+    env = Monitor(env, os.path.join(logger.get_dir(), str(rank)))
     env.seed(seed)
     return env
 
diff --git a/baselines/common/tf_util.py b/baselines/common/tf_util.py
index fbc9fae..afcd593 100644
--- a/baselines/common/tf_util.py
+++ b/baselines/common/tf_util.py
@@ -55,7 +55,6 @@ def make_session(num_cpu=None, make_default=False, graph=None):
     tf_config = tf.ConfigProto(
         inter_op_parallelism_threads=num_cpu,
         intra_op_parallelism_threads=num_cpu)
-    tf_config.gpu_options.allocator_type = 'BFC'
     if make_default:
         return tf.InteractiveSession(config=tf_config, graph=graph)
     else:
diff --git a/baselines/common/vec_env/__init__.py b/baselines/common/vec_env/__init__.py
index 146ca87..eb07310 100644
--- a/baselines/common/vec_env/__init__.py
+++ b/baselines/common/vec_env/__init__.py
@@ -77,7 +77,7 @@ class VecEnv(ABC):
         self.step_async(actions)
         return self.step_wait()
 
-    def render(self):
+    def render(self, mode='human'):
         logger.warn('Render not defined for %s'%self)
 
     @property
diff --git a/baselines/common/vec_env/dummy_vec_env.py b/baselines/common/vec_env/dummy_vec_env.py
index d5851e1..d0ae455 100644
--- a/baselines/common/vec_env/dummy_vec_env.py
+++ b/baselines/common/vec_env/dummy_vec_env.py
@@ -50,8 +50,8 @@ class DummyVecEnv(VecEnv):
     def close(self):
         return
 
-    def render(self):
-        return [e.render() for e in self.envs]
+    def render(self, mode='human'):
+        return [e.render(mode=mode) for e in self.envs]
 
     def _save_obs(self, e, obs):
         for k in self.keys:
diff --git a/baselines/common/vec_env/subproc_vec_env.py b/baselines/common/vec_env/subproc_vec_env.py
index f19d83a..fb55df4 100644
--- a/baselines/common/vec_env/subproc_vec_env.py
+++ b/baselines/common/vec_env/subproc_vec_env.py
@@ -1,6 +1,7 @@
 import numpy as np
 from multiprocessing import Process, Pipe
 from baselines.common.vec_env import VecEnv, CloudpickleWrapper
+from baselines.common.tile_images import tile_images
 
 
 def worker(remote, parent_remote, env_fn_wrapper):
@@ -16,9 +17,8 @@ def worker(remote, parent_remote, env_fn_wrapper):
         elif cmd == 'reset':
             ob = env.reset()
             remote.send(ob)
-        elif cmd == 'reset_task':
-            ob = env.reset_task()
-            remote.send(ob)
+        elif cmd == 'render':
+            remote.send(env.render(mode='rgb_array'))
         elif cmd == 'close':
             remote.close()
             break
@@ -81,3 +81,17 @@ class SubprocVecEnv(VecEnv):
         for p in self.ps:
             p.join()
         self.closed = True
+
+    def render(self, mode='human'):
+        for pipe in self.remotes:
+            pipe.send(('render', None))
+        imgs = [pipe.recv() for pipe in self.remotes]
+        bigimg = tile_images(imgs)
+        if mode == 'human':
+            import cv2
+            cv2.imshow('vecenv', bigimg[:,:,::-1])
+            cv2.waitKey(1)
+        elif mode == 'rgb_array':
+            return bigimg
+        else:
+            raise NotImplementedError
\ No newline at end of file
diff --git a/baselines/logger.py b/baselines/logger.py
index 888db76..0abad0e 100644
--- a/baselines/logger.py
+++ b/baselines/logger.py
@@ -8,10 +8,6 @@ import datetime
 import tempfile
 from collections import defaultdict
 
-LOG_OUTPUT_FORMATS = ['stdout', 'log', 'csv']
-LOG_OUTPUT_FORMATS_MPI = ['log']
-# Also valid: json, tensorboard
-
 DEBUG = 10
 INFO = 20
 WARN = 30
@@ -75,8 +71,11 @@ class HumanOutputFormat(KVWriter, SeqWriter):
         return s[:20] + '...' if len(s) > 23 else s
 
     def writeseq(self, seq):
-        for arg in seq:
-            self.file.write(arg)
+        seq = list(seq)
+        for (i, elem) in enumerate(seq):
+            self.file.write(elem)
+            if i < len(seq) - 1: # add space unless this is the last one
+                self.file.write(' ')
         self.file.write('\n')
         self.file.flush()
 
@@ -363,13 +362,11 @@ def configure(dir=None, format_strs=None):
         log_suffix = "-rank%03i" % rank
 
     if format_strs is None:
-        strs, strs_mpi = os.getenv('OPENAI_LOG_FORMAT'), os.getenv('OPENAI_LOG_FORMAT_MPI')
-        format_strs = strs_mpi if rank>0 else strs
-        if format_strs is not None:
-            format_strs = format_strs.split(',')
+        if rank == 0:
+            format_strs = os.getenv('OPENAI_LOG_FORMAT', 'stdout,log,csv').split(',')
         else:
-            format_strs = LOG_OUTPUT_FORMATS_MPI if rank>0 else LOG_OUTPUT_FORMATS
-
+            format_strs = os.getenv('OPENAI_LOG_FORMAT_MPI', 'log').split(',')
+    format_strs = filter(None, format_strs)
     output_formats = [make_output_format(f, dir, log_suffix) for f in format_strs]
 
     Logger.CURRENT = Logger(dir=dir, output_formats=output_formats)
diff --git a/baselines/ppo1/README.md b/baselines/ppo1/README.md
index f45d141..1faf5ad 100644
--- a/baselines/ppo1/README.md
+++ b/baselines/ppo1/README.md
@@ -5,3 +5,5 @@
 - `mpirun -np 8 python -m baselines.ppo1.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options.
 - `python -m baselines.ppo1.run_mujoco` runs the algorithm for 1M frames on a Mujoco environment.
+- Train mujoco 3d humanoid (with optimal-ish hyperparameters): `mpirun -np 16 python -m baselines.ppo1.run_humanoid --model-path=/path/to/model`
+- Render the 3d humanoid: `python -m baselines.ppo1.run_humanoid --play --model-path=/path/to/model`
diff --git a/baselines/ppo1/pposgd_simple.py b/baselines/ppo1/pposgd_simple.py
index 4853464..f2f13a6 100644
--- a/baselines/ppo1/pposgd_simple.py
+++ b/baselines/ppo1/pposgd_simple.py
@@ -212,5 +212,7 @@ def learn(env, policy_fn, *,
         if MPI.COMM_WORLD.Get_rank()==0:
             logger.dump_tabular()
 
+    return pi
+
 def flatten_lists(listoflists):
     return [el for list_ in listoflists for el in list_]
diff --git a/baselines/ppo1/run_humanoid.py b/baselines/ppo1/run_humanoid.py
new file mode 100644
index 0000000..d7d8f5a
--- /dev/null
+++ b/baselines/ppo1/run_humanoid.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+import os
+from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser
+from baselines.common import tf_util as U
+from baselines import logger
+
+import gym
+
+def train(num_timesteps, seed, model_path=None):
+    env_id = 'Humanoid-v2'
+    from baselines.ppo1 import mlp_policy, pposgd_simple
+    U.make_session(num_cpu=1).__enter__()
+    def policy_fn(name, ob_space, ac_space):
+        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
+            hid_size=64, num_hid_layers=2)
+    env = make_mujoco_env(env_id, seed)
+
+    # parameters below were the best found in a simple random search
+    # these are good enough to make humanoid walk, but whether those are
+    # an absolute best or not is not certain
+    env = RewScale(env, 0.1)
+    pi = pposgd_simple.learn(env, policy_fn,
+            max_timesteps=num_timesteps,
+            timesteps_per_actorbatch=2048,
+            clip_param=0.2, entcoeff=0.0,
+            optim_epochs=10,
+            optim_stepsize=3e-4,
+            optim_batchsize=64,
+            gamma=0.99,
+            lam=0.95,
+            schedule='linear',
+        )
+    env.close()
+    if model_path:
+        U.save_state(model_path)
+
+    return pi
+
+class RewScale(gym.RewardWrapper):
+    def __init__(self, env, scale):
+        gym.RewardWrapper.__init__(self, env)
+        self.scale = scale
+    def reward(self, r):
+        return r * self.scale
+
+def main():
+    logger.configure()
+    parser = mujoco_arg_parser()
+    parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'humanoid_policy'))
+    parser.set_defaults(num_timesteps=int(2e7))
+
+    args = parser.parse_args()
+
+    if not args.play:
+        # train the model
+        train(num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path)
+    else:
+        # construct the model object, load pre-trained model and render
+        pi = train(num_timesteps=1, seed=args.seed)
+        U.load_state(args.model_path)
+        env = make_mujoco_env('Humanoid-v2', seed=0)
+
+        ob = env.reset()
+        while True:
+            action = pi.act(stochastic=False, ob=ob)[0]
+            ob, _, done, _ = env.step(action)
+            env.render()
+            if done:
+                ob = env.reset()
+
+
+
+
+if __name__ == '__main__':
+    main()
diff --git a/baselines/ppo1/run_robotics.py b/baselines/ppo1/run_robotics.py
new file mode 100644
index 0000000..7d84185
--- /dev/null
+++ b/baselines/ppo1/run_robotics.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+
+from mpi4py import MPI
+from baselines.common import set_global_seeds
+from baselines import logger
+from baselines.common.cmd_util import make_robotics_env, robotics_arg_parser
+import mujoco_py
+
+
+def train(env_id, num_timesteps, seed):
+    from baselines.ppo1 import mlp_policy, pposgd_simple
+    import baselines.common.tf_util as U
+    rank = MPI.COMM_WORLD.Get_rank()
+    sess = U.single_threaded_session()
+    sess.__enter__()
+    mujoco_py.ignore_mujoco_warnings().__enter__()
+    workerseed = seed + 10000 * rank
+    set_global_seeds(workerseed)
+    env = make_robotics_env(env_id, workerseed, rank=rank)
+    def policy_fn(name, ob_space, ac_space):
+        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
+            hid_size=256, num_hid_layers=3)
+
+    pposgd_simple.learn(env, policy_fn,
+        max_timesteps=num_timesteps,
+        timesteps_per_actorbatch=2048,
+        clip_param=0.2, entcoeff=0.0,
+        optim_epochs=5, optim_stepsize=3e-4, optim_batchsize=256,
+        gamma=0.99, lam=0.95, schedule='linear',
+    )
+    env.close()
+
+
+def main():
+    args = robotics_arg_parser().parse_args()
+    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
+
+
+if __name__ == '__main__':
+    main()
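
Review note (not part of the commit): the following is a minimal sketch of how the vectorized render API introduced by this patch could be exercised. It assumes gym and numpy are installed (plus OpenCV for mode='human'); the make_env helper and the 'CartPole-v0' environment id are illustrative choices, not taken from the diff.

# Hedged usage sketch for the new SubprocVecEnv.render added in this patch.
# make_env and 'CartPole-v0' are hypothetical/illustrative, not from the commit.
import gym
import numpy as np
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

def make_env(seed):
    def _thunk():
        env = gym.make('CartPole-v0')
        env.seed(seed)
        return env
    return _thunk

if __name__ == '__main__':
    venv = SubprocVecEnv([make_env(i) for i in range(4)])  # 4 worker processes
    obs = venv.reset()
    for _ in range(100):
        # one action per sub-environment
        actions = np.array([venv.action_space.sample() for _ in range(venv.num_envs)])
        obs, rews, dones, infos = venv.step(actions)
        # per the patch: workers render with mode='rgb_array' and the frames are
        # tiled into a single image; mode='human' would instead show a cv2 window
        frame = venv.render(mode='rgb_array')
    venv.close()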