git subrepo pull (merge) baselines
subrepo: subdir: "baselines" merged: "95a81e86" upstream: origin: "git@github.com:openai/baselines.git" branch: "master" commit: "c6c0f45c" git-subrepo: version: "0.4.0" origin: "git@github.com:ingydotnet/git-subrepo.git" commit: "74339e8"
This commit is contained in:
27
README.md
27
README.md
@@ -45,8 +45,8 @@ cd baselines
|
||||
```
|
||||
If using virtualenv, create a new virtualenv and activate it
|
||||
```bash
|
||||
virtualenv env --python=python3
|
||||
. env/bin/activate
|
||||
virtualenv env --python=python3
|
||||
. env/bin/activate
|
||||
```
|
||||
Install baselines package
|
||||
```bash
|
||||
@@ -62,29 +62,20 @@ pip install pytest
|
||||
pytest
|
||||
```
|
||||
|
||||
## Subpackages
|
||||
|
||||
## Testing the installation
|
||||
All unit tests in baselines can be run using pytest runner:
|
||||
```
|
||||
pip install pytest
|
||||
pytest
|
||||
```
|
||||
|
||||
## Training models
|
||||
Most of the algorithms in baselines repo are used as follows:
|
||||
```bash
|
||||
python -m baselines.run --alg=<name of the algorithm> --env=<environment_id> [additional arguments]
|
||||
python -m baselines.run --alg=<name of the algorithm> --env=<environment_id> [additional arguments]
|
||||
```
|
||||
### Example 1. PPO with MuJoCo Humanoid
|
||||
For instance, to train a fully-connected network controlling MuJoCo humanoid using a2c for 20M timesteps
|
||||
For instance, to train a fully-connected network controlling MuJoCo humanoid using PPO2 for 20M timesteps
|
||||
```bash
|
||||
python -m baselines.run --alg=a2c --env=Humanoid-v2 --network=mlp --num_timesteps=2e7
|
||||
python -m baselines.run --alg=ppo2 --env=Humanoid-v2 --network=mlp --num_timesteps=2e7
|
||||
```
|
||||
Note that for mujoco environments fully-connected network is default, so we can omit `--network=mlp`
|
||||
The hyperparameters for both network and the learning algorithm can be controlled via the command line, for instance:
|
||||
```bash
|
||||
python -m baselines.run --alg=a2c --env=Humanoid-v2 --network=mlp --num_timesteps=2e7 --ent_coef=0.1 --num_hidden=32 --num_layers=3 --value_network=copy
|
||||
python -m baselines.run --alg=ppo2 --env=Humanoid-v2 --network=mlp --num_timesteps=2e7 --ent_coef=0.1 --num_hidden=32 --num_layers=3 --value_network=copy
|
||||
```
|
||||
will set the entropy coefficient to 0.1, construct a fully connected network with 3 layers of 32 hidden units each, and create a separate network for value function estimation (so that its parameters are not shared with the policy network, but the structure is the same)
|
||||
|
||||
@@ -94,7 +85,7 @@ docstring for [baselines/ppo2/ppo2.py/learn()](ppo2/ppo2.py) fir the description
|
||||
### Example 2. DQN on Atari
|
||||
DQN with Atari is at this point a classic benchmark. To run the baselines implementation of DQN on Atari Pong:
|
||||
```
|
||||
python -m baselines.run --alg=deepq --env=PongNoFrameskip-v4 --num_timesteps=1e6
|
||||
python -m baselines.run --alg=deepq --env=PongNoFrameskip-v4 --num_timesteps=1e6
|
||||
```
|
||||
|
||||
## Saving, loading and visualizing models
|
||||
@@ -102,11 +93,11 @@ The algorithms serialization API is not properly unified yet; however, there is
|
||||
The `--save_path` and `--load_path` command-line options save the tensorflow state to a given path after training and load it from a given path before training, respectively.
|
||||
Let's imagine you'd like to train ppo2 on Atari Pong, save the model, and then later visualize what it has learnt.
|
||||
```bash
|
||||
python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num_timesteps=2e7 --save_path=~/models/pong_20M_ppo2
|
||||
python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num_timesteps=2e7 --save_path=~/models/pong_20M_ppo2
|
||||
```
|
||||
This should reach a mean reward per episode of about 5k. To load and visualize the model, we'll do the following - load the model, train it for 0 steps, and then visualize:
|
||||
```bash
|
||||
python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num_timesteps=0 --load_path=~/models/pong_20M_ppo2 --play
|
||||
python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num_timesteps=0 --load_path=~/models/pong_20M_ppo2 --play
|
||||
```
|
||||
|
||||
*NOTE:* At the moment Mujoco training uses VecNormalize wrapper for the environment which is not being saved correctly; so loading the models trained on Mujoco will not work well if the environment is recreated. If necessary, you can work around that by replacing RunningMeanStd by TfRunningMeanStd in [baselines/common/vec_env/vec_normalize.py](baselines/common/vec_env/vec_normalize.py#L12). This way, mean and std of environment normalizing wrapper will be saved in tensorflow variables and included in the model file; however, training is slower that way - hence not including it by default
|
||||
|
@@ -54,7 +54,7 @@ def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps,
|
||||
stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize')
|
||||
inputs, loss, loss_sampled = policy.update_info
|
||||
optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=0.9, kfac_update=2,\
|
||||
epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1,
|
||||
epsilon=1e-2, stats_decay=0.99, async_=1, cold_iter=1,
|
||||
weight_decay_dict=policy.wd_dict, max_grad_norm=None)
|
||||
pi_var_list = []
|
||||
for var in tf.trainable_variables():
|
||||
|
@@ -58,7 +58,7 @@ class Model(object):
|
||||
with tf.device('/gpu:0'):
|
||||
self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\
|
||||
momentum=0.9, kfac_update=1, epsilon=0.01,\
|
||||
stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)
|
||||
stats_decay=0.99, async_=1, cold_iter=10, max_grad_norm=max_grad_norm)
|
||||
|
||||
update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
|
||||
train_op, q_runner = optim.apply_gradients(list(zip(grads,params)))
|
||||
@@ -97,7 +97,7 @@ def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interva
|
||||
kfac_clip=0.001, save_interval=None, lrschedule='linear', load_path=None, **network_kwargs):
|
||||
set_global_seeds(seed)
|
||||
|
||||
|
||||
|
||||
if network == 'cnn':
|
||||
network_kwargs['one_dim_bias'] = True
|
||||
|
||||
@@ -115,7 +115,7 @@ def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interva
|
||||
with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
|
||||
fh.write(cloudpickle.dumps(make_model))
|
||||
model = make_model()
|
||||
|
||||
|
||||
if load_path is not None:
|
||||
model.load(load_path)
|
||||
|
||||
|
@@ -10,14 +10,14 @@ KFAC_DEBUG = False
|
||||
|
||||
class KfacOptimizer():
|
||||
|
||||
def __init__(self, learning_rate=0.01, momentum=0.9, clip_kl=0.01, kfac_update=2, stats_accum_iter=60, full_stats_init=False, cold_iter=100, cold_lr=None, async=False, async_stats=False, epsilon=1e-2, stats_decay=0.95, blockdiag_bias=False, channel_fac=False, factored_damping=False, approxT2=False, use_float64=False, weight_decay_dict={},max_grad_norm=0.5):
|
||||
def __init__(self, learning_rate=0.01, momentum=0.9, clip_kl=0.01, kfac_update=2, stats_accum_iter=60, full_stats_init=False, cold_iter=100, cold_lr=None, async_=False, async_stats=False, epsilon=1e-2, stats_decay=0.95, blockdiag_bias=False, channel_fac=False, factored_damping=False, approxT2=False, use_float64=False, weight_decay_dict={},max_grad_norm=0.5):
|
||||
self.max_grad_norm = max_grad_norm
|
||||
self._lr = learning_rate
|
||||
self._momentum = momentum
|
||||
self._clip_kl = clip_kl
|
||||
self._channel_fac = channel_fac
|
||||
self._kfac_update = kfac_update
|
||||
self._async = async
|
||||
self._async = async_
|
||||
self._async_stats = async_stats
|
||||
self._epsilon = epsilon
|
||||
self._stats_decay = stats_decay
|
||||
|
@@ -21,7 +21,7 @@ class NeuralNetValueFunction(object):
|
||||
self._predict = U.function([X], vpred_n)
|
||||
optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9, \
|
||||
clip_kl=0.3, epsilon=0.1, stats_decay=0.95, \
|
||||
async=1, kfac_update=2, cold_iter=50, \
|
||||
async_=1, kfac_update=2, cold_iter=50, \
|
||||
weight_decay_dict=wd_dict, max_grad_norm=None)
|
||||
vf_var_list = []
|
||||
for var in tf.trainable_variables():
|
||||
|
@@ -72,7 +72,7 @@ class PolicyWithValue(object):
|
||||
|
||||
def step(self, observation, **extra_feed):
|
||||
"""
|
||||
Compute next action(s) given the observaion(s)
|
||||
Compute next action(s) given the observation(s)
|
||||
|
||||
Parameters:
|
||||
----------
|
||||
@@ -93,7 +93,7 @@ class PolicyWithValue(object):
|
||||
|
||||
def value(self, ob, *args, **kwargs):
|
||||
"""
|
||||
Compute value estimate(s) given the observaion(s)
|
||||
Compute value estimate(s) given the observation(s)
|
||||
|
||||
Parameters:
|
||||
----------
|
||||
|
Reference in New Issue
Block a user