From d90638b56505b6d61b4253fc89e5b47da364a2e1 Mon Sep 17 00:00:00 2001
From: Matthias Plappert
Date: Mon, 26 Feb 2018 10:32:11 +0100
Subject: [PATCH] Minor improvements

---
 baselines/her/README.md           | 35 +++++++++++++++++++++++++++++++
 baselines/her/experiment/train.py | 29 +++++++++++++------------
 2 files changed, 51 insertions(+), 13 deletions(-)
 create mode 100644 baselines/her/README.md

diff --git a/baselines/her/README.md b/baselines/her/README.md
new file mode 100644
index 0000000..cffdb12
--- /dev/null
+++ b/baselines/her/README.md
@@ -0,0 +1,35 @@
+# Hindsight Experience Replay
+For details on Hindsight Experience Replay (HER), please read the [paper](https://arxiv.org/pdf/1707.01495.pdf).
+
+## How to use Hindsight Experience Replay
+
+### Getting started
+Training an agent is very simple:
+```bash
+python -m baselines.her.experiment.train
+```
+This will train a DDPG+HER agent on the `FetchReach` environment.
+You should see the success rate go up quickly to `1.0`, which means that the agent achieves the
+desired goal in 100% of the cases.
+The training script logs other diagnostics as well and pickles the best policy so far (w.r.t. its test success rate),
+the latest policy, and, if enabled, a history of policies every K epochs.
+
+To inspect what the agent has learned, use the play script:
+```bash
+python -m baselines.her.experiment.play /path/to/an/experiment/policy_best.pkl
+```
+You can try it right now with the results of the training step (the script prints out the path for you).
+This should visualize the current policy for 10 episodes and also print statistics.
+
+
+### Advanced usage
+The train script comes with advanced features such as MPI support, which allows training to scale across all cores of a single machine.
+To see all available options, simply run this command:
+```bash
+python -m baselines.her.experiment.train --help
+```
+To run on, say, 20 CPU cores, you can use the following command:
+```bash
+python -m baselines.her.experiment.train --num_cpu 20
+```
+That's it; you are now running rollouts using 20 MPI workers and averaging gradients for network updates across all 20 cores.
diff --git a/baselines/her/experiment/train.py b/baselines/her/experiment/train.py
index c8d2405..6ccf17d 100644
--- a/baselines/her/experiment/train.py
+++ b/baselines/her/experiment/train.py
@@ -94,11 +94,14 @@ def launch(
     rank = MPI.COMM_WORLD.Get_rank()
 
     # Configure logging
-    if rank == 0 and (logdir or logger.get_dir() is None):
-        logger.configure(dir=logdir)
-    logdir = logger.get_dir()
-    os.makedirs(logdir, exist_ok=True)
-    assert logger.get_dir() is not None
+    if rank == 0:
+        if logdir or logger.get_dir() is None:
+            logger.configure(dir=logdir)
+        else:
+            logger.configure()
+        logdir = logger.get_dir()
+        assert logdir is not None
+        os.makedirs(logdir, exist_ok=True)
 
     # Seed everything.
     rank_seed = seed + 1000000 * rank
@@ -153,14 +156,14 @@ def launch(
 
 @click.command()
-@click.option('--env_name', type=str, default='FetchReach-v0')
-@click.option('--logdir', type=str, default=None)
-@click.option('--n_epochs', type=int, default=50)
-@click.option('--num_cpu', type=int, default=1)
-@click.option('--seed', type=int, default=0)
-@click.option('--policy_save_interval', type=int, default=5)
-@click.option('--replay_strategy', type=click.Choice(['future', 'none']), default='future')
-@click.option('--clip_return', type=int, default=1)
+@click.option('--env_name', type=str, default='FetchReach-v0', help='the name of the OpenAI Gym environment that you want to train on')
+@click.option('--logdir', type=str, default=None, help='the path to where logs and policy pickles should go. If not specified, creates a folder in /tmp/')
+@click.option('--n_epochs', type=int, default=50, help='the number of training epochs to run')
+@click.option('--num_cpu', type=int, default=1, help='the number of CPU cores to use (using MPI)')
+@click.option('--seed', type=int, default=0, help='the random seed used to seed both the environment and the training code')
+@click.option('--policy_save_interval', type=int, default=5, help='the interval with which policy pickles are saved. If set to 0, only the best and latest policy will be pickled.')
+@click.option('--replay_strategy', type=click.Choice(['future', 'none']), default='future', help='the HER replay strategy to be used. "future" uses HER, "none" disables HER.')
+@click.option('--clip_return', type=int, default=1, help='whether or not returns should be clipped')
 def main(**kwargs):
     launch(**kwargs)
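For reference, here is a hypothetical invocation that combines the command-line options documented in the patch above. The option names come from the patch itself; the specific values (number of cores, save interval, log directory) are purely illustrative:
```bash
# Illustrative example only: train DDPG+HER on FetchReach-v0 with 8 MPI workers,
# save a policy pickle every 10 epochs, and write logs to a chosen directory.
python -m baselines.her.experiment.train \
    --env_name FetchReach-v0 \
    --num_cpu 8 \
    --n_epochs 50 \
    --seed 0 \
    --replay_strategy future \
    --policy_save_interval 10 \
    --logdir /tmp/her-fetchreach
```
Per the help text above, passing `--replay_strategy none` instead would disable HER, which makes it easy to compare against plain DDPG on the same environment.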