git subrepo pull (merge) baselines

subrepo:
  subdir:   "baselines"
  merged:   "95a81e86"
upstream:
  origin:   "git@github.com:openai/baselines.git"
  branch:   "master"
  commit:   "c6c0f45c"
git-subrepo:
  version:  "0.4.0"
  origin:   "git@github.com:ingydotnet/git-subrepo.git"
  commit:   "74339e8"
Peter Zhokhov
2018-08-27 16:39:51 -07:00
6 changed files with 18 additions and 27 deletions
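
The file diffs below all stem from one upstream change: the pulled commit renames the keyword argument `async` to `async_` throughout the ACKTR/KFAC code, plus two docstring typo fixes in `PolicyWithValue`. The rename is needed because `async` became a reserved keyword in Python 3.7; a minimal illustration:

```python
# `async` is a full keyword as of Python 3.7 and can no longer be used
# as an identifier or keyword-argument name.

# async = 1        # SyntaxError on Python 3.7+

# PEP 8 convention for keyword clashes: append a single trailing underscore.
async_ = 1
print(async_)      # prints 1 on any Python 3 version
```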

View File

@@ -62,29 +62,20 @@ pip install pytest
 pytest
 ```
-## Subpackages
-## Testing the installation
-All unit tests in baselines can be run using pytest runner:
-```
-pip install pytest
-pytest
-```
 ## Training models
 Most of the algorithms in baselines repo are used as follows:
 ```bash
 python -m baselines.run --alg=<name of the algorithm> --env=<environment_id> [additional arguments]
 ```
 ### Example 1. PPO with MuJoCo Humanoid
-For instance, to train a fully-connected network controlling MuJoCo humanoid using a2c for 20M timesteps
+For instance, to train a fully-connected network controlling MuJoCo humanoid using PPO2 for 20M timesteps
 ```bash
-python -m baselines.run --alg=a2c --env=Humanoid-v2 --network=mlp --num_timesteps=2e7
+python -m baselines.run --alg=ppo2 --env=Humanoid-v2 --network=mlp --num_timesteps=2e7
 ```
 Note that for mujoco environments fully-connected network is default, so we can omit `--network=mlp`
 The hyperparameters for both network and the learning algorithm can be controlled via the command line, for instance:
 ```bash
-python -m baselines.run --alg=a2c --env=Humanoid-v2 --network=mlp --num_timesteps=2e7 --ent_coef=0.1 --num_hidden=32 --num_layers=3 --value_network=copy
+python -m baselines.run --alg=ppo2 --env=Humanoid-v2 --network=mlp --num_timesteps=2e7 --ent_coef=0.1 --num_hidden=32 --num_layers=3 --value_network=copy
 ```
 will set entropy coefficient to 0.1, and construct fully connected network with 3 layers with 32 hidden units in each, and create a separate network for value function estimation (so that its parameters are not shared with the policy network, but the structure is the same)
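
As a side note on the `--value_network=copy` flag used above, here is a hypothetical sketch (invented names, not the baselines implementation) of the difference between a shared and a copied value network:

```python
import copy

class MLP:
    """Stand-in for a policy network; invented for illustration."""
    def __init__(self, num_layers=3, num_hidden=32):
        self.num_layers = num_layers
        self.num_hidden = num_hidden

def build_networks(policy_net, value_network='shared'):
    if value_network == 'copy':
        # Same architecture, independent parameters: training the value
        # function does not touch the policy network's weights.
        value_net = copy.deepcopy(policy_net)
    else:
        # The value head reuses (shares) the policy network's parameters.
        value_net = policy_net
    return policy_net, value_net

pi, vf = build_networks(MLP(num_layers=3, num_hidden=32), value_network='copy')
assert pi is not vf   # separate objects, same structure
```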

View File

@@ -54,7 +54,7 @@ def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps,
     stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize')
     inputs, loss, loss_sampled = policy.update_info
     optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=0.9, kfac_update=2,\
-                               epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1,
+                               epsilon=1e-2, stats_decay=0.99, async_=1, cold_iter=1,
                                weight_decay_dict=policy.wd_dict, max_grad_norm=None)
     pi_var_list = []
     for var in tf.trainable_variables():

View File

@@ -58,7 +58,7 @@ class Model(object):
         with tf.device('/gpu:0'):
             self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\
                 momentum=0.9, kfac_update=1, epsilon=0.01,\
-                stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)
+                stats_decay=0.99, async_=1, cold_iter=10, max_grad_norm=max_grad_norm)
             update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
             train_op, q_runner = optim.apply_gradients(list(zip(grads,params)))

View File

@@ -10,14 +10,14 @@ KFAC_DEBUG = False
 class KfacOptimizer():
-    def __init__(self, learning_rate=0.01, momentum=0.9, clip_kl=0.01, kfac_update=2, stats_accum_iter=60, full_stats_init=False, cold_iter=100, cold_lr=None, async=False, async_stats=False, epsilon=1e-2, stats_decay=0.95, blockdiag_bias=False, channel_fac=False, factored_damping=False, approxT2=False, use_float64=False, weight_decay_dict={},max_grad_norm=0.5):
+    def __init__(self, learning_rate=0.01, momentum=0.9, clip_kl=0.01, kfac_update=2, stats_accum_iter=60, full_stats_init=False, cold_iter=100, cold_lr=None, async_=False, async_stats=False, epsilon=1e-2, stats_decay=0.95, blockdiag_bias=False, channel_fac=False, factored_damping=False, approxT2=False, use_float64=False, weight_decay_dict={},max_grad_norm=0.5):
         self.max_grad_norm = max_grad_norm
         self._lr = learning_rate
         self._momentum = momentum
         self._clip_kl = clip_kl
         self._channel_fac = channel_fac
         self._kfac_update = kfac_update
-        self._async = async
+        self._async = async_
         self._async_stats = async_stats
         self._epsilon = epsilon
         self._stats_decay = stats_decay
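
The hunk above shows the whole rename pattern: only the public parameter name changes to `async_` (PEP 8 recommends a single trailing underscore to avoid a keyword clash), while the private attribute `self._async` is unaffected, so the remaining diffs only have to update call sites. A condensed sketch of the same pattern:

```python
class Optimizer:
    """Toy stand-in for KfacOptimizer, reduced to the renamed argument."""
    def __init__(self, async_=False):
        # `_async` is an attribute name, not a bare keyword, so it is
        # untouched by the rename.
        self._async = async_

opt = Optimizer(async_=1)   # call sites must pass the new keyword name
```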

View File

@@ -21,7 +21,7 @@ class NeuralNetValueFunction(object):
         self._predict = U.function([X], vpred_n)
         optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9, \
                                    clip_kl=0.3, epsilon=0.1, stats_decay=0.95, \
-                                   async=1, kfac_update=2, cold_iter=50, \
+                                   async_=1, kfac_update=2, cold_iter=50, \
                                    weight_decay_dict=wd_dict, max_grad_norm=None)
         vf_var_list = []
         for var in tf.trainable_variables():

View File

@@ -72,7 +72,7 @@ class PolicyWithValue(object):
     def step(self, observation, **extra_feed):
         """
-        Compute next action(s) given the observaion(s)
+        Compute next action(s) given the observation(s)
         Parameters:
         ----------
@@ -93,7 +93,7 @@ class PolicyWithValue(object):
     def value(self, ob, *args, **kwargs):
         """
-        Compute value estimate(s) given the observaion(s)
+        Compute value estimate(s) given the observation(s)
         Parameters:
         ----------
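
To make the documented interface concrete, a toy stand-in (not the real `PolicyWithValue` class; the return values are placeholders, since this diff elides them):

```python
class PolicyWithValueStub:
    """Toy stand-in mirroring the documented method signatures."""
    def step(self, observation, **extra_feed):
        # Compute next action(s) given the observation(s); placeholder result.
        return 0
    def value(self, ob, *args, **kwargs):
        # Compute value estimate(s) given the observation(s); placeholder result.
        return 0.0

policy = PolicyWithValueStub()
obs = [0.0, 0.0]
action = policy.step(obs)
estimate = policy.value(obs)
```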