Minor improvements
baselines/her/README.md (new file, +35 lines)
@@ -0,0 +1,35 @@
# Hindsight Experience Replay
For details on Hindsight Experience Replay (HER), please read the [paper](https://arxiv.org/pdf/1707.01495.pdf).
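
As a rough illustration of the core idea (and not the actual sampler shipped in `baselines.her`), the "future" replay strategy stores, alongside every transition, a few extra copies whose goal is replaced by a goal that was actually achieved later in the same episode:

```python
import numpy as np

def her_future_relabel(episode, reward_fn, k=4):
    """Sketch of HER's "future" strategy; illustration only, not this repo's code.

    episode: list of (obs, action, achieved_goal, desired_goal) tuples.
    reward_fn(achieved_goal, goal): the environment's goal-conditioned reward.
    k: number of additional relabeled copies stored per transition.
    """
    transitions = []
    for t, (obs, action, achieved, goal) in enumerate(episode):
        # Original transition, rewarded against the episode's desired goal.
        transitions.append((obs, action, goal, reward_fn(achieved, goal)))
        # Extra copies, relabeled with goals achieved later in this episode.
        for _ in range(k):
            future_goal = episode[np.random.randint(t, len(episode))][2]
            transitions.append((obs, action, future_goal, reward_fn(achieved, future_goal)))
    return transitions
```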

## How to use Hindsight Experience Replay

### Getting started
Training an agent is very simple:
```bash
python -m baselines.her.experiment.train
```
This will train a DDPG+HER agent on the `FetchReach` environment.
You should see the success rate go up quickly to `1.0`, which means that the agent achieves the
desired goal in 100% of the cases.
The training script logs other diagnostics as well and pickles the best policy so far (w.r.t. its test success rate),
the latest policy, and, if enabled, a history of policies every K epochs.

To inspect what the agent has learned, use the play script:
```bash
python -m baselines.her.experiment.play /path/to/an/experiment/policy_best.pkl
```
You can try it right now with the results of the training step (the script prints out the path for you).
This should visualize the current policy for 10 episodes and will also print statistics.
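
If you want to load a pickled policy from your own code instead of going through the play script, a minimal sketch looks like this (it assumes, as the play script does, that the pickle holds a policy object exposing a `get_actions` method):

```python
import pickle

# Placeholder path from above; the training script prints the real one.
with open('/path/to/an/experiment/policy_best.pkl', 'rb') as f:
    policy = pickle.load(f)

# The play script then queries actions roughly like this (observation,
# achieved goal and desired goal come from the environment, so shapes
# are environment-specific):
# actions = policy.get_actions(obs, achieved_goal, desired_goal)
```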

### Advanced usage
The train script comes with advanced features like MPI support, which allows it to scale across all cores of a single machine.
To see all available options, simply run this command:
```bash
python -m baselines.her.experiment.train --help
```
To run on, say, 20 CPU cores, you can use the following command:
```bash
python -m baselines.her.experiment.train --num_cpu 20
```
That's it, you are now running rollouts using 20 MPI workers and averaging gradients for network updates across all 20 cores.
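
For intuition only (this is not the code used by the DDPG implementation in this repo), averaging a gradient across MPI workers with `mpi4py` can look roughly like this:

```python
import numpy as np
from mpi4py import MPI

def average_gradient(local_grad):
    """Sketch: average a flat gradient vector across all MPI workers."""
    comm = MPI.COMM_WORLD
    summed = np.zeros_like(local_grad)
    # Sum everyone's gradient into `summed` on every worker, then divide
    # by the number of workers to get the average.
    comm.Allreduce(local_grad, summed, op=MPI.SUM)
    return summed / comm.Get_size()
```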
baselines/her/experiment/train.py
@@ -94,11 +94,14 @@ def launch(
     rank = MPI.COMM_WORLD.Get_rank()
 
     # Configure logging
-    if rank == 0 and (logdir or logger.get_dir() is None):
-        logger.configure(dir=logdir)
-    logdir = logger.get_dir()
-    os.makedirs(logdir, exist_ok=True)
-    assert logger.get_dir() is not None
+    if rank == 0:
+        if logdir or logger.get_dir() is None:
+            logger.configure(dir=logdir)
+    else:
+        logger.configure()
+    logdir = logger.get_dir()
+    assert logdir is not None
+    os.makedirs(logdir, exist_ok=True)
 
     # Seed everything.
     rank_seed = seed + 1000000 * rank
@@ -153,14 +156,14 @@ def launch(
 
 
 @click.command()
-@click.option('--env_name', type=str, default='FetchReach-v0')
-@click.option('--logdir', type=str, default=None)
-@click.option('--n_epochs', type=int, default=50)
-@click.option('--num_cpu', type=int, default=1)
-@click.option('--seed', type=int, default=0)
-@click.option('--policy_save_interval', type=int, default=5)
-@click.option('--replay_strategy', type=click.Choice(['future', 'none']), default='future')
-@click.option('--clip_return', type=int, default=1)
+@click.option('--env_name', type=str, default='FetchReach-v0', help='the name of the OpenAI Gym environment that you want to train on')
+@click.option('--logdir', type=str, default=None, help='the path to where logs and policy pickles should go. If not specified, creates a folder in /tmp/')
+@click.option('--n_epochs', type=int, default=50, help='the number of training epochs to run')
+@click.option('--num_cpu', type=int, default=1, help='the number of CPU cores to use (using MPI)')
+@click.option('--seed', type=int, default=0, help='the random seed used to seed both the environment and the training code')
+@click.option('--policy_save_interval', type=int, default=5, help='the interval with which policy pickles are saved. If set to 0, only the best and latest policy will be pickled.')
+@click.option('--replay_strategy', type=click.Choice(['future', 'none']), default='future', help='the HER replay strategy to be used. "future" uses HER, "none" disables HER.')
+@click.option('--clip_return', type=int, default=1, help='whether or not returns should be clipped')
 def main(**kwargs):
     launch(**kwargs)
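
For reference, click collects each of these options into keyword arguments that are forwarded to `launch`. A minimal standalone illustration of the same pattern (hypothetical script, not this file):

```python
import click

@click.command()
@click.option('--num_cpu', type=int, default=1, help='the number of CPU cores to use (using MPI)')
def main(**kwargs):
    # click gathers every option into kwargs, e.g. {'num_cpu': 20},
    # which the real script forwards to launch(**kwargs).
    print(kwargs)

if __name__ == '__main__':
    main()
```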