git subrepo pull (merge) baselines
subrepo:
  subdir:   "baselines"
  merged:   "f2a9b8f2"
upstream:
  origin:   "git@github.com:openai/baselines.git"
  branch:   "master"
  commit:   "cc4215ef"
git-subrepo:
  version:  "0.4.0"
  origin:   "git@github.com:ingydotnet/git-subrepo.git"
  commit:   "74339e8"
@@ -87,7 +87,7 @@ python -m baselines.run --alg=ppo2 --env=Humanoid-v2 --network=mlp --num_timeste
 will set entropy coefficient to 0.1, and construct a fully connected network with 3 layers of 32 hidden units each, and create a separate network for value function estimation (so that its parameters are not shared with the policy network, but the structure is the same)
 
 See docstrings in [common/models.py](common/models.py) for a description of network parameters for each type of model, and
-docstring for [baselines/ppo2/ppo2.py/learn()](ppo2/ppo2.py) fir the description of the ppo2 hyperparamters.
+docstring for [baselines/ppo2/ppo2.py/learn()](ppo2/ppo2.py) for the description of the ppo2 hyperparamters.
 
 ### Example 2. DQN on Atari
 
 DQN with Atari is at this point a classic benchmark. To run the baselines implementation of DQN on Atari Pong:
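For context (not part of the diff above): the settings described in this excerpt are forwarded by `baselines.run` as keyword arguments to the algorithm and to the network builder. Below is a minimal, hedged sketch of the programmatic equivalent, assuming a MuJoCo-enabled gym install; the entry points used (`DummyVecEnv`, the `ppo2.learn` keyword names) are taken from the baselines APIs of this era, not from this commit.

```python
# Illustrative sketch only -- a rough programmatic equivalent of
#   python -m baselines.run --alg=ppo2 --env=Humanoid-v2 --network=mlp \
#          --ent_coef=0.1 --num_layers=3 --num_hidden=32 --value_network=copy
import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.ppo2 import ppo2

env = DummyVecEnv([lambda: gym.make('Humanoid-v2')])  # single-env vectorized wrapper
model = ppo2.learn(
    network='mlp',            # resolved through the network builders in common/models.py
    env=env,
    total_timesteps=int(2e7),
    ent_coef=0.1,             # entropy coefficient from the text above
    value_network='copy',     # separate value network with the same structure
    num_layers=3,             # forwarded to the mlp builder as network kwargs
    num_hidden=32,
)
```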
@@ -102,7 +102,7 @@ Let's imagine you'd like to train ppo2 on Atari Pong, save the model and then l
 ```bash
 python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num_timesteps=2e7 --save_path=~/models/pong_20M_ppo2
 ```
-This should get to the mean reward per episode about 5k. To load and visualize the model, we'll do the following - load the model, train it for 0 steps, and then visualize:
+This should get to the mean reward per episode about 20. To load and visualize the model, we'll do the following - load the model, train it for 0 steps, and then visualize:
 ```bash
 python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num_timesteps=0 --load_path=~/models/pong_20M_ppo2 --play
 ```
@@ -5,6 +5,13 @@ from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch
 from baselines.common.mpi_running_mean_std import RunningMeanStd
 import tensorflow.contrib.layers as layers
 
+mapping = {}
+
+def register(name):
+    def _thunk(func):
+        mapping[name] = func
+        return func
+    return _thunk
 
 def nature_cnn(unscaled_images, **conv_kwargs):
     """
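A quick illustration, not from the diff: the `register`/`mapping` pair added above is a plain decorator-based registry. Each decorated builder is stored in the module-level dict under its name and returned unchanged, so later lookups can go through the dict. A self-contained toy sketch (the "constant" builder is made up for illustration):

```python
# Toy, self-contained version of the registration pattern added above.
mapping = {}

def register(name):
    def _thunk(func):
        mapping[name] = func   # remember the builder under the given name
        return func            # return it unchanged, so the decorated def still works
    return _thunk

@register("constant")
def constant_builder(value=0):
    def network_fn(x):
        return value
    return network_fn

print(sorted(mapping))                     # ['constant']
print(mapping["constant"](value=3)("x"))   # 3
```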
@@ -20,6 +27,7 @@ def nature_cnn(unscaled_images, **conv_kwargs):
     return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2)))
 
 
+@register("mlp")
 def mlp(num_layers=2, num_hidden=64, activation=tf.tanh):
     """
     Stack of fully-connected layers to be used in a policy / q-function approximator
@@ -28,16 +36,16 @@ def mlp(num_layers=2, num_hidden=64, activation=tf.tanh):
     ----------
 
     num_layers: int number of fully-connected layers (default: 2)
 
     num_hidden: int size of fully-connected layers (default: 64)
 
     activation: activation function (default: tf.tanh)
 
     Returns:
     -------
 
     function that builds fully connected network with a given input tensor / placeholder
     """
     def network_fn(X):
         h = tf.layers.flatten(X)
         for i in range(num_layers):
@@ -45,17 +53,20 @@ def mlp(num_layers=2, num_hidden=64, activation=tf.tanh):
         return h, None
 
     return network_fn
 
 
+@register("cnn")
 def cnn(**conv_kwargs):
     def network_fn(X):
         return nature_cnn(X, **conv_kwargs), None
     return network_fn
 
 
+@register("cnn_small")
 def cnn_small(**conv_kwargs):
     def network_fn(X):
         h = tf.cast(X, tf.float32) / 255.
 
         activ = tf.nn.relu
         h = activ(conv(h, 'c1', nf=8, rf=8, stride=4, init_scale=np.sqrt(2), **conv_kwargs))
         h = activ(conv(h, 'c2', nf=16, rf=4, stride=2, init_scale=np.sqrt(2), **conv_kwargs))
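As a usage note, not part of the diff: `mlp(...)` returns a builder function; applying it to an input tensor gives back the last hidden layer plus `None` in place of the recurrent extras. A hedged TF1-style sketch, in the same graph-mode style as the rest of this file:

```python
# Hedged usage sketch (TF1 graph mode, as used throughout models.py).
import tensorflow as tf
from baselines.common.models import mlp

X = tf.placeholder(tf.float32, shape=[None, 11])  # e.g. a flat observation batch
network_fn = mlp(num_layers=3, num_hidden=32)     # the configuration from the README example
h, extra = network_fn(X)                          # extra is None for feed-forward builders
print(h.shape)                                    # (?, 32) -- output of the last hidden layer
```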
@@ -65,15 +76,15 @@ def cnn_small(**conv_kwargs):
     return network_fn
 
 
+@register("lstm")
 def lstm(nlstm=128, layer_norm=False):
     """
     Builds LSTM (Long-Short Term Memory) network to be used in a policy.
     Note that the resulting function returns not only the output of the LSTM
     (i.e. hidden state of lstm for each step in the sequence), but also a dictionary
     with auxiliary tensors to be set as policy attributes.
 
     Specifically,
     S is a placeholder to feed current state (LSTM state has to be managed outside policy)
     M is a placeholder for the mask (used to mask out observations after the end of the episode, but can be used for other purposes too)
     initial_state is a numpy array containing initial lstm state (usually zeros)
@@ -81,7 +92,7 @@ def lstm(nlstm=128, layer_norm=False):
     An example of usage of lstm-based policy can be found here: common/tests/test_doc_examples.py/test_lstm_example
 
     Parameters:
     ----------
@@ -94,11 +105,11 @@ def lstm(nlstm=128, layer_norm=False):
     function that builds LSTM with a given input tensor / placeholder
     """
 
     def network_fn(X, nenv=1):
         nbatch = X.shape[0]
         nsteps = nbatch // nenv
 
         h = tf.layers.flatten(X)
 
         M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
@@ -111,7 +122,7 @@ def lstm(nlstm=128, layer_norm=False):
             h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm)
         else:
             h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm)
 
         h = seq_to_batch(h5)
         initial_state = np.zeros(S.shape.as_list(), dtype=float)
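To make the shape bookkeeping above concrete, here is a small illustration (plain numpy, not part of the diff): the flat rollout batch fed to `network_fn` has `nbatch = nenv * nsteps` rows, `M` holds one done-mask entry per row, and `S` / `initial_state` carry one concatenated cell-plus-hidden state per environment.

```python
# Shape illustration for the lstm builder above (not part of the diff).
import numpy as np

nenv, nsteps, nlstm = 4, 8, 128
nbatch = nenv * nsteps                             # rows in the flat batch X

M = np.zeros(nbatch, dtype=np.float32)             # done-mask, one entry per row
S = np.zeros((nenv, 2 * nlstm), dtype=np.float32)  # cell + hidden state per environment
initial_state = np.zeros(S.shape, dtype=float)     # what the builder hands back

assert nbatch // nenv == nsteps                    # mirrors `nsteps = nbatch // nenv`
print(M.shape, S.shape, initial_state.shape)       # (32,) (4, 256) (4, 256)
```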
@@ -120,13 +131,14 @@ def lstm(nlstm=128, layer_norm=False):
     return network_fn
 
 
+@register("cnn_lstm")
 def cnn_lstm(nlstm=128, layer_norm=False, **conv_kwargs):
     def network_fn(X, nenv=1):
         nbatch = X.shape[0]
         nsteps = nbatch // nenv
 
         h = nature_cnn(X, **conv_kwargs)
 
         M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
         S = tf.placeholder(tf.float32, [nenv, 2*nlstm]) #states
@@ -137,7 +149,7 @@ def cnn_lstm(nlstm=128, layer_norm=False, **conv_kwargs):
             h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm)
         else:
             h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm)
 
         h = seq_to_batch(h5)
         initial_state = np.zeros(S.shape.as_list(), dtype=float)
@@ -145,23 +157,26 @@ def cnn_lstm(nlstm=128, layer_norm=False, **conv_kwargs):
     return network_fn
 
 
+@register("cnn_lnlstm")
 def cnn_lnlstm(nlstm=128, **conv_kwargs):
     return cnn_lstm(nlstm, layer_norm=True, **conv_kwargs)
 
 
+@register("conv_only")
 def conv_only(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], **conv_kwargs):
     '''
     convolutions-only net
 
     Parameters:
     ----------
 
     conv: list of triples (filter_number, filter_size, stride) specifying parameters for each layer.
 
     Returns:
 
     function that takes tensorflow tensor as input and returns the output of the last convolutional layer
 
     '''
 
     def network_fn(X):
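For intuition about the default `convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)]` documented above, here is a small, purely illustrative calculation (not part of the diff) of the spatial sizes an 84x84 Atari frame would pass through; the padding mode depends on the underlying conv op, so both common conventions are shown.

```python
# Illustration only: what the (filter_number, filter_size, stride) triples mean
# for an 84x84 input, under the two usual padding conventions.
import math

convs = [(32, 8, 4), (64, 4, 2), (64, 3, 1)]   # the documented default

valid, same = 84, 84
for nf, rf, stride in convs:
    valid = (valid - rf) // stride + 1         # 'VALID' padding
    same = math.ceil(same / stride)            # 'SAME' padding
    print(f"{nf} filters, {rf}x{rf} kernel, stride {stride}"
          f" -> {valid}x{valid} (VALID) / {same}x{same} (SAME)")
```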
@@ -182,23 +197,22 @@ def _normalize_clip_observation(x, clip_range=[-5.0, 5.0]):
     rms = RunningMeanStd(shape=x.shape[1:])
     norm_x = tf.clip_by_value((x - rms.mean) / rms.std, min(clip_range), max(clip_range))
     return norm_x, rms
 
 
 def get_network_builder(name):
-    # TODO: replace with reflection?
-    if name == 'cnn':
-        return cnn
-    elif name == 'cnn_small':
-        return cnn_small
-    elif name == 'conv_only':
-        return conv_only
-    elif name == 'mlp':
-        return mlp
-    elif name == 'lstm':
-        return lstm
-    elif name == 'cnn_lstm':
-        return cnn_lstm
-    elif name == 'cnn_lnlstm':
-        return cnn_lnlstm
+    """
+    If you want to register your own network outside models.py, you just need:
+
+    Usage Example:
+    -------------
+    from baselines.common.models import register
+    @register("your_network_name")
+    def your_network_define(**net_kwargs):
+        ...
+        return network_fn
+
+    """
+    if name in mapping:
+        return mapping[name]
     else:
         raise ValueError('Unknown network type: {}'.format(name))
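The docstring added above describes the registration hook end to end; here is a hedged sketch of using it, not part of the commit, with a made-up network name and kwargs:

```python
# Hedged sketch of registering a custom network, following the docstring above.
# "my_small_mlp" and its kwargs are invented for illustration.
import tensorflow as tf
from baselines.common.models import register, get_network_builder

@register("my_small_mlp")
def my_small_mlp(**net_kwargs):
    def network_fn(X):
        h = tf.layers.flatten(X)
        h = tf.tanh(tf.layers.dense(h, net_kwargs.get("num_hidden", 16)))
        return h, None            # same (output, recurrent-extras) convention as above
    return network_fn

builder = get_network_builder("my_small_mlp")   # found via the new `mapping` lookup
network_fn = builder(num_hidden=32)
# get_network_builder("no_such_net") raises ValueError('Unknown network type: ...')
```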
@@ -116,7 +116,7 @@ class RolloutWorker:
                 return self.generate_rollouts()
 
             if np.isnan(o_new).any():
-                self.logger.warning('NaN caught during rollout generation. Trying again...')
+                self.logger.warn('NaN caught during rollout generation. Trying again...')
                 self.reset_all_rollouts()
                 return self.generate_rollouts()