diff --git a/baselines/gail/README.md b/baselines/gail/README.md index 7a2ab21..36728ab 100644 --- a/baselines/gail/README.md +++ b/baselines/gail/README.md @@ -1,14 +1,20 @@ -# GAIL +# Generative Adversarial Imitation Learning (GAIL) - Original paper: https://arxiv.org/abs/1606.03476 +For results benchmarking on MuJoCo, please navigate to [here](result/gail-result.md) + ## If you want to train an imitation learning agent ### Step 1: Download expert data Download the expert data into `./data` -### Step 2: Imitation learning +Download [Link](https://drive.google.com/drive/folders/1h-bK09Emrteu7vUXllZLRqTx7iNOaxGI?usp=sharing) + +### Step 2: Run GAIL + +Run with single thread: ```bash python -m baselines.gail.run_mujoco @@ -22,4 +28,14 @@ mpirun -np 16 python -m baselines.gail.run_mujoco See help (`-h`) for more options. +#### In case you want to run Behavior Cloning (BC) +```bash +python -m baselines.gail.behavior_clone +``` + +See help (`-h`) for more options. + +## Reference repositories +- @openai/imitation +- @carpedm20/deep-rl-tensorflow diff --git a/baselines/gail/adversary.py b/baselines/gail/adversary.py index 6e96e03..cfc77be 100644 --- a/baselines/gail/adversary.py +++ b/baselines/gail/adversary.py @@ -1,4 +1,7 @@ -# Reference: https://github.com/openai/imitation +''' +Reference: https://github.com/openai/imitation +The architecture follows the official repository +''' import tensorflow as tf import numpy as np diff --git a/baselines/gail/behavior_clone.py b/baselines/gail/behavior_clone.py index e0e5c76..5164d85 100644 --- a/baselines/gail/behavior_clone.py +++ b/baselines/gail/behavior_clone.py @@ -1,3 +1,7 @@ +''' +The code is used to train a BC imitator, or to pretrain a GAIL imitator +''' + import argparse import tempfile import os.path as osp diff --git a/baselines/gail/dataset/mujoco_dset.py b/baselines/gail/dataset/mujoco_dset.py index a5e0f8a..0f5f57d 100644 --- a/baselines/gail/dataset/mujoco_dset.py +++ b/baselines/gail/dataset/mujoco_dset.py 
@@ -1,3 +1,10 @@ +''' +Data structure of the input .npz: +the data is saved in python dictionary format with keys: 'acs', 'ep_rets', 'rews', 'obs' +the value of each item is a list storing the expert trajectory sequentially +a transition can be: (data['obs'][t], data['acs'][t], data['obs'][t+1]) and gets reward data['rews'][t] +''' + from baselines import logger import numpy as np diff --git a/baselines/gail/gail-eval.py b/baselines/gail/gail-eval.py index 072178e..1169bcc 100644 --- a/baselines/gail/gail-eval.py +++ b/baselines/gail/gail-eval.py @@ -1,5 +1,5 @@ ''' -This code is used to evalaute the imitators trained with different trajectories +This code is used to evaluate the imitators trained with different numbers of trajectories and plot the results in the same figure for easy comparison. ''' diff --git a/baselines/gail/mlp_policy.py b/baselines/gail/mlp_policy.py index 7312573..eb2ce08 100644 --- a/baselines/gail/mlp_policy.py +++ b/baselines/gail/mlp_policy.py @@ -1,5 +1,5 @@ ''' -from baselines/ppo1/mlp_policy.py +from baselines/ppo1/mlp_policy.py with simple modifications: (1) add reuse argument (2) cache the `stochastic` placeholder ''' diff --git a/baselines/gail/run_mujoco.py b/baselines/gail/run_mujoco.py index fcd5900..f3cc213 100644 --- a/baselines/gail/run_mujoco.py +++ b/baselines/gail/run_mujoco.py @@ -1,5 +1,5 @@ ''' -Disclaimer: this code is highly based on trpo_mpi at openai/baselines and openai/imitation +Disclaimer: this code is based heavily on trpo_mpi at @openai/baselines and @openai/imitation ''' import argparse diff --git a/baselines/gail/trpo_mpi.py b/baselines/gail/trpo_mpi.py index 192957c..f40aae4 100644 --- a/baselines/gail/trpo_mpi.py +++ b/baselines/gail/trpo_mpi.py @@ -1,3 +1,7 @@ +''' +Disclaimer: The TRPO part relies heavily on trpo_mpi at @openai/baselines +''' + import time import os from contextlib import contextmanager