Minor improvements
baselines/her/README.md (Normal file, 35 lines added)
@@ -0,0 +1,35 @@
# Hindsight Experience Replay

For details on Hindsight Experience Replay (HER), please read the [paper](https://arxiv.org/pdf/1707.01495.pdf).
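
For intuition, here is a minimal sketch of the core idea behind the "future" replay strategy (the default `--replay_strategy` of the training script): a replayed transition gets its goal swapped for a goal that was actually achieved later in the same episode, and the reward is recomputed for that goal. This is an illustrative sketch only; the transition format, `compute_reward`, and `k` are assumptions, not the baselines implementation.

```python
# Illustrative sketch of HER "future" goal relabeling (not the baselines code).
import random

def her_relabel(episode, compute_reward, k=4):
    """episode: list of dicts with keys 'obs', 'action', 'achieved_goal', 'goal'."""
    out = []
    for t, step in enumerate(episode):
        out.append(dict(step))  # keep the original transition
        for _ in range(k):      # add k hindsight copies of it
            future_t = random.randint(t, len(episode) - 1)
            new_goal = episode[future_t]['achieved_goal']
            out.append(dict(step, goal=new_goal,
                            reward=compute_reward(step['achieved_goal'], new_goal)))
    return out
```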

## How to use Hindsight Experience Replay

### Getting started
Training an agent is very simple:
```bash
python -m baselines.her.experiment.train
```
This will train a DDPG+HER agent on the `FetchReach` environment.
You should see the success rate quickly go up to `1.0`, which means that the agent achieves the
desired goal in 100% of the cases.
The training script also logs other diagnostics and pickles the best policy so far (w.r.t. its test success rate),
the latest policy, and, if enabled, a history of policies every K epochs.
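
If you want to poke at one of those pickles yourself, a minimal sketch is below. Note the assumptions: the file is treated as a plain Python pickle of the policy object, and unpickling it requires the baselines/her code (and TensorFlow) to be importable; the play script described next handles this for you.

```python
# Illustrative only: load a saved policy pickle by hand.
# Assumes the baselines/her code and TensorFlow are importable, since the
# pickle contains a TensorFlow-backed policy object.
import pickle

with open('policy_best.pkl', 'rb') as f:  # path is a placeholder
    policy = pickle.load(f)
print(type(policy))
```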

To inspect what the agent has learned, use the play script:
```bash
python -m baselines.her.experiment.play /path/to/an/experiment/policy_best.pkl
```
You can try it right now with the results of the training step (the script prints out the path for you).
This should visualize the current policy for 10 episodes and will also print statistics.
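
As a rough picture of what such an evaluation loop does (a generic illustration, not the actual play script), each episode is rolled out with the loaded policy, rendered, and scored by whether the goal was reached:

```python
# Generic evaluation/visualization loop (illustrative; not baselines.her.experiment.play).
import gym

env = gym.make('FetchReach-v0')            # the environment used above
successes = []
for _ in range(10):                        # 10 episodes, as described above
    obs = env.reset()
    done = False
    info = {}
    while not done:
        action = env.action_space.sample() # stand-in for the learned policy
        obs, reward, done, info = env.step(action)
        env.render()
    successes.append(float(info.get('is_success', 0.0)))
print('success rate:', sum(successes) / len(successes))
```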

### Advanced usage
The train script comes with advanced features like MPI support, which allows scaling across all cores of a single machine.
To see all available options, simply run this command:
```bash
python -m baselines.her.experiment.train --help
```
To run on, say, 20 CPU cores, you can use the following command:
```bash
python -m baselines.her.experiment.train --num_cpu 20
```
That's it, you are now running rollouts with 20 MPI workers and averaging gradients for network updates across all 20 cores.
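
To make "averaging gradients across workers" concrete, here is a generic mpi4py sketch (not the baselines implementation): each rank computes a gradient on its own rollouts, the gradients are summed with an all-reduce, and every worker applies the same averaged update.

```python
# Generic illustration of MPI gradient averaging (not the baselines code).
import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
local_grad = np.random.randn(4)        # stand-in for this worker's gradient
summed = np.zeros_like(local_grad)
comm.Allreduce(local_grad, summed, op=MPI.SUM)
avg_grad = summed / comm.Get_size()    # identical on every rank
```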
@@ -94,11 +94,14 @@ def launch(
     rank = MPI.COMM_WORLD.Get_rank()
 
     # Configure logging
-    if rank == 0 and (logdir or logger.get_dir() is None):
-        logger.configure(dir=logdir)
-    logdir = logger.get_dir()
-    os.makedirs(logdir, exist_ok=True)
-    assert logger.get_dir() is not None
+    if rank == 0:
+        if logdir or logger.get_dir() is None:
+            logger.configure(dir=logdir)
+    else:
+        logger.configure()
+    logdir = logger.get_dir()
+    assert logdir is not None
+    os.makedirs(logdir, exist_ok=True)
 
     # Seed everything.
     rank_seed = seed + 1000000 * rank
@@ -153,14 +156,14 @@ def launch(
 
 
 @click.command()
-@click.option('--env_name', type=str, default='FetchReach-v0')
-@click.option('--logdir', type=str, default=None)
-@click.option('--n_epochs', type=int, default=50)
-@click.option('--num_cpu', type=int, default=1)
-@click.option('--seed', type=int, default=0)
-@click.option('--policy_save_interval', type=int, default=5)
-@click.option('--replay_strategy', type=click.Choice(['future', 'none']), default='future')
-@click.option('--clip_return', type=int, default=1)
+@click.option('--env_name', type=str, default='FetchReach-v0', help='the name of the OpenAI Gym environment that you want to train on')
+@click.option('--logdir', type=str, default=None, help='the path to where logs and policy pickles should go. If not specified, creates a folder in /tmp/')
+@click.option('--n_epochs', type=int, default=50, help='the number of training epochs to run')
+@click.option('--num_cpu', type=int, default=1, help='the number of CPU cores to use (using MPI)')
+@click.option('--seed', type=int, default=0, help='the random seed used to seed both the environment and the training code')
+@click.option('--policy_save_interval', type=int, default=5, help='the interval with which policy pickles are saved. If set to 0, only the best and latest policy will be pickled.')
+@click.option('--replay_strategy', type=click.Choice(['future', 'none']), default='future', help='the HER replay strategy to be used. "future" uses HER, "none" disables HER.')
+@click.option('--clip_return', type=int, default=1, help='whether or not returns should be clipped')
 def main(**kwargs):
     launch(**kwargs)
 
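
For reference, the CLI options above all flow into the launch function via `main(**kwargs)`, so the same training run can be started programmatically. A minimal sketch follows; the keyword names are taken from the options above, and the import path mirrors the module invoked in the README, but treating `launch` as importable this way is an assumption.

```python
# Sketch: starting the same training run from Python instead of the CLI.
# Keyword names mirror the click options defined above; values are the defaults.
from baselines.her.experiment.train import launch

launch(env_name='FetchReach-v0', logdir=None, n_epochs=50, num_cpu=1, seed=0,
       policy_save_interval=5, replay_strategy='future', clip_return=1)
```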