From d90638b56505b6d61b4253fc89e5b47da364a2e1 Mon Sep 17 00:00:00 2001
From: Matthias Plappert
Date: Mon, 26 Feb 2018 10:32:11 +0100
Subject: [PATCH] Minor improvements

---
 baselines/her/README.md           | 35 +++++++++++++++++++++++++++++++
 baselines/her/experiment/train.py | 29 +++++++++++++------------
 2 files changed, 51 insertions(+), 13 deletions(-)
 create mode 100644 baselines/her/README.md

diff --git a/baselines/her/README.md b/baselines/her/README.md
new file mode 100644
index 0000000..cffdb12
--- /dev/null
+++ b/baselines/her/README.md
@@ -0,0 +1,35 @@
+# Hindsight Experience Replay
+For details on Hindsight Experience Replay (HER), please read the [paper](https://arxiv.org/pdf/1707.01495.pdf).
+
+## How to use Hindsight Experience Replay
+
+### Getting started
+Training an agent is very simple:
+```bash
+python -m baselines.her.experiment.train
+```
+This will train a DDPG+HER agent on the `FetchReach` environment.
+You should see the success rate go up quickly to `1.0`, which means that the agent achieves the
+desired goal in 100% of the cases.
+The training script logs other diagnostics as well and pickles the best policy so far (w.r.t. its test success rate),
+the latest policy, and, if enabled, a history of policies every K epochs.
+
+To inspect what the agent has learned, use the play script:
+```bash
+python -m baselines.her.experiment.play /path/to/an/experiment/policy_best.pkl
+```
+You can try it right now with the results of the training step (the script prints out the path for you).
+This should visualize the current policy for 10 episodes and also print statistics.
+
+
+### Advanced usage
+The train script comes with advanced features such as MPI support, which allows training to scale across all cores of a single machine.
+To see all available options, simply run this command:
+```bash
+python -m baselines.her.experiment.train --help
+```
+To run on, say, 20 CPU cores, you can use the following command:
+```bash
+python -m baselines.her.experiment.train --num_cpu 20
+```
+That's it; you are now running rollouts using 20 MPI workers and averaging gradients for network updates across all 20 cores.
diff --git a/baselines/her/experiment/train.py b/baselines/her/experiment/train.py
index c8d2405..6ccf17d 100644
--- a/baselines/her/experiment/train.py
+++ b/baselines/her/experiment/train.py
@@ -94,11 +94,14 @@ def launch(
     rank = MPI.COMM_WORLD.Get_rank()
 
     # Configure logging
-    if rank == 0 and (logdir or logger.get_dir() is None):
-        logger.configure(dir=logdir)
-    logdir = logger.get_dir()
-    os.makedirs(logdir, exist_ok=True)
-    assert logger.get_dir() is not None
+    if rank == 0:
+        if logdir or logger.get_dir() is None:
+            logger.configure(dir=logdir)
+        else:
+            logger.configure()
+        logdir = logger.get_dir()
+        assert logdir is not None
+        os.makedirs(logdir, exist_ok=True)
 
     # Seed everything.
     rank_seed = seed + 1000000 * rank
@@ -153,14 +156,14 @@ def launch(
 
 @click.command()
-@click.option('--env_name', type=str, default='FetchReach-v0')
-@click.option('--logdir', type=str, default=None)
-@click.option('--n_epochs', type=int, default=50)
-@click.option('--num_cpu', type=int, default=1)
-@click.option('--seed', type=int, default=0)
-@click.option('--policy_save_interval', type=int, default=5)
-@click.option('--replay_strategy', type=click.Choice(['future', 'none']), default='future')
-@click.option('--clip_return', type=int, default=1)
+@click.option('--env_name', type=str, default='FetchReach-v0', help='the name of the OpenAI Gym environment that you want to train on')
+@click.option('--logdir', type=str, default=None, help='the path to where logs and policy pickles should go. If not specified, creates a folder in /tmp/')
+@click.option('--n_epochs', type=int, default=50, help='the number of training epochs to run')
+@click.option('--num_cpu', type=int, default=1, help='the number of CPU cores to use (using MPI)')
+@click.option('--seed', type=int, default=0, help='the random seed used to seed both the environment and the training code')
+@click.option('--policy_save_interval', type=int, default=5, help='the interval with which policy pickles are saved. If set to 0, only the best and latest policy will be pickled.')
+@click.option('--replay_strategy', type=click.Choice(['future', 'none']), default='future', help='the HER replay strategy to be used. "future" uses HER, "none" disables HER.')
+@click.option('--clip_return', type=int, default=1, help='whether or not returns should be clipped')
 def main(**kwargs):
     launch(**kwargs)
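For reference, here is a hypothetical invocation that combines the command-line options documented in the patch above. The option names come from the patch itself; the specific values (number of cores, save interval, log directory) are purely illustrative:
```bash
# Illustrative example only: train DDPG+HER on FetchReach-v0 with 8 MPI workers,
# save a policy pickle every 10 epochs, and write logs to a chosen directory.
python -m baselines.her.experiment.train \
    --env_name FetchReach-v0 \
    --num_cpu 8 \
    --n_epochs 50 \
    --seed 0 \
    --replay_strategy future \
    --policy_save_interval 10 \
    --logdir /tmp/her-fetchreach
```
Per the help text above, passing `--replay_strategy none` instead would disable HER, which makes it easy to compare against plain DDPG on the same environment.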