refactor a2c, acer, acktr, ppo2, deepq, and trpo_mpi (#490)

* exported rl-algs * more stuff from rl-algs * run slow tests * re-exported rl_algs * re-exported rl_algs - fixed problems with serialization test and test_cartpole * replaced atari_arg_parser with common_arg_parser * run.py can run algos from both baselines and rl_algs * added approximate humanoid reward with ppo2 into the README for reference * dummy commit to RUN BENCHMARKS * dummy commit to RUN BENCHMARKS * dummy commit to RUN BENCHMARKS * dummy commit to RUN BENCHMARKS * very dummy commit to RUN BENCHMARKS * serialize variables as a dict, not as a list * running_mean_std uses tensorflow variables * fixed import in vec_normalize * dummy commit to RUN BENCHMARKS * dummy commit to RUN BENCHMARKS * flake8 complaints * save all variables to make sure we save the vec_normalize normalization * benchmarks on ppo2 only RUN BENCHMARKS * make_atari_env compatible with mpi * run ppo_mpi benchmarks only RUN BENCHMARKS * hardcode names of retro environments * add defaults * changed default ppo2 lr schedule to linear RUN BENCHMARKS * non-tf normalization benchmark RUN BENCHMARKS * use ncpu=1 for mujoco sessions - gives a bit of a performance speedup * reverted running_mean_std to user property decorators for mean, var, count * reverted VecNormalize to use RunningMeanStd (no tf) * reverted VecNormalize to use RunningMeanStd (no tf) * profiling wip * use VecNormalize with regular RunningMeanStd * added acer runner (missing import) * flake8 complaints * added a note in README about TfRunningMeanStd and serialization of VecNormalize * dummy commit to RUN BENCHMARKS * merged benchmarks branch
2018-08-13 09:56:44 -07:00
parent 366f486e34
commit 8c2aea2add
71 changed files with 2942 additions and 1070 deletions
--- a/baselines/common/models.py
+++ b/baselines/common/models.py
@@ -0,0 +1,177 @@
+import numpy as np
+import tensorflow as tf
+from baselines.a2c import utils
+from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch
+from baselines.common.mpi_running_mean_std import RunningMeanStd
+import tensorflow.contrib.layers as layers
+
+
+def nature_cnn(unscaled_images, **conv_kwargs):
+    """
+    CNN from Nature paper.
+    """
+    scaled_images = tf.cast(unscaled_images, tf.float32) / 255.
+    activ = tf.nn.relu
+    h = activ(conv(scaled_images, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2),
+                   **conv_kwargs))
+    h2 = activ(conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), **conv_kwargs))
+    h3 = activ(conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), **conv_kwargs))
+    h3 = conv_to_fc(h3)
+    return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2)))
+
+
+def mlp(num_layers=2, num_hidden=64, activation=tf.tanh):
+    """
+    Simple fully connected layer policy. Separate stacks of fully-connected layers are used for policy and value function estimation.
+    More customized fully-connected policies can be obtained by using PolicyWithV class directly.
+
+    Parameters:
+    ----------
+
+    num_layers: int                 number of fully-connected layers (default: 2)
+    
+    num_hidden: int                 size of fully-connected layers (default: 64)
+    
+    activation:                     activation function (default: tf.tanh)
+        
+    Returns:
+    -------
+
+    function that builds fully connected network with a given input placeholder
+    """        
+    def network_fn(X):
+        h = tf.layers.flatten(X)
+        for i in range(num_layers):
+            h = activation(fc(h, 'mlp_fc{}'.format(i), nh=num_hidden, init_scale=np.sqrt(2)))
+        return h, None
+
+    return network_fn
+  
+
+def cnn(**conv_kwargs):
+    def network_fn(X):
+        return nature_cnn(X, **conv_kwargs), None
+    return network_fn
+
+def cnn_small(**conv_kwargs):
+    def network_fn(X):
+        h = tf.cast(X, tf.float32) / 255.
+        
+        activ = tf.nn.relu
+        h = activ(conv(h, 'c1', nf=8, rf=8, stride=4, init_scale=np.sqrt(2), **conv_kwargs))
+        h = activ(conv(h, 'c2', nf=16, rf=4, stride=2, init_scale=np.sqrt(2), **conv_kwargs))
+        h = conv_to_fc(h)
+        h = activ(fc(h, 'fc1', nh=128, init_scale=np.sqrt(2)))
+        return h, None
+    return network_fn
+
+
+
+def lstm(nlstm=128, layer_norm=False):
+    def network_fn(X, nenv=1):
+        nbatch = X.shape[0] 
+        nsteps = nbatch // nenv
+         
+        h = tf.layers.flatten(X)
+
+        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
+        S = tf.placeholder(tf.float32, [nenv, 2*nlstm]) #states
+
+        xs = batch_to_seq(h, nenv, nsteps)
+        ms = batch_to_seq(M, nenv, nsteps)
+
+        if layer_norm:
+            h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm)
+        else:
+            h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm)
+            
+        h = seq_to_batch(h5)
+        initial_state = np.zeros(S.shape.as_list(), dtype=float)
+
+        return h, {'S':S, 'M':M, 'state':snew, 'initial_state':initial_state}
+
+    return network_fn
+
+
+def cnn_lstm(nlstm=128, layer_norm=False, **conv_kwargs):
+    def network_fn(X, nenv=1):
+        nbatch = X.shape[0] 
+        nsteps = nbatch // nenv
+         
+        h = nature_cnn(X, **conv_kwargs)
+       
+        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
+        S = tf.placeholder(tf.float32, [nenv, 2*nlstm]) #states
+
+        xs = batch_to_seq(h, nenv, nsteps)
+        ms = batch_to_seq(M, nenv, nsteps)
+
+        if layer_norm:
+            h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm)
+        else:
+            h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm)
+            
+        h = seq_to_batch(h5)
+        initial_state = np.zeros(S.shape.as_list(), dtype=float)
+
+        return h, {'S':S, 'M':M, 'state':snew, 'initial_state':initial_state}
+
+    return network_fn
+
+def cnn_lnlstm(nlstm=128, **conv_kwargs):
+    return cnn_lstm(nlstm, layer_norm=True, **conv_kwargs)
+
+
+def conv_only(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], **conv_kwargs):
+    ''' 
+    convolutions-only net
+
+    Parameters:
+    ----------
+
+    conv:       list of triples (filter_number, filter_size, stride) specifying parameters for each layer. 
+
+    Returns:
+
+    function that takes tensorflow tensor as input and returns the output of the last convolutional layer
+    
+    '''
+
+    def network_fn(X):
+        out = tf.cast(X, tf.float32) / 255.
+        with tf.variable_scope("convnet"):
+            for num_outputs, kernel_size, stride in convs:
+                out = layers.convolution2d(out,
+                                           num_outputs=num_outputs,
+                                           kernel_size=kernel_size,
+                                           stride=stride,
+                                           activation_fn=tf.nn.relu,
+                                           **conv_kwargs)
+
+        return out, None
+    return network_fn
+
+def _normalize_clip_observation(x, clip_range=[-5.0, 5.0]):
+    rms = RunningMeanStd(shape=x.shape[1:])
+    norm_x = tf.clip_by_value((x - rms.mean) / rms.std, min(clip_range), max(clip_range))
+    return norm_x, rms
+    
+
+def get_network_builder(name):
+    # TODO: replace with reflection? 
+    if name == 'cnn':
+        return cnn
+    elif name == 'cnn_small':
+        return cnn_small
+    elif name == 'conv_only':
+        return conv_only
+    elif name == 'mlp':
+        return mlp
+    elif name == 'lstm':
+        return lstm
+    elif name == 'cnn_lstm':
+        return cnn_lstm
+    elif name == 'cnn_lnlstm':
+        return cnn_lnlstm
+    else:
+        raise ValueError('Unknown network type: {}'.format(name))