baselines/baselines/deepq/models.py

import tensorflow as tf
import tensorflow.contrib.layers as layers


def _mlp(hiddens, inpt, num_actions, scope, reuse=False, layer_norm=False):
    """Build a fully connected Q-network inside the variable scope `scope`.

    Every entry of `hiddens` adds one linear layer, optionally followed by
    layer normalization, followed by a ReLU. A final linear layer with
    `num_actions` outputs produces the returned tensor.

    Parameters
    ----------
    hiddens: [int]
        list of sizes of hidden layers
    inpt: tensor
        input fed to the first layer
    num_actions: int
        number of outputs of the final linear layer
    scope: str
        variable scope the layers are created in
    reuse: bool
        forwarded to `tf.variable_scope`
    layer_norm: bool
        if true, apply layer normalization before each hidden ReLU

    Returns
    -------
    q_out: tensor
        output of the final linear layer (no activation)
    """
    with tf.variable_scope(scope, reuse=reuse):
        features = inpt
        for width in hiddens:
            # The activation is applied separately so that layer norm (when
            # enabled) sits between the linear map and the ReLU.
            pre_activation = layers.fully_connected(features, num_outputs=width, activation_fn=None)
            if layer_norm:
                pre_activation = layers.layer_norm(pre_activation, center=True, scale=True)
            features = tf.nn.relu(pre_activation)
        return layers.fully_connected(features, num_outputs=num_actions, activation_fn=None)


def mlp(hiddens=[], layer_norm=False):
    """Create a q_function builder backed by a multi-layer perceptron.

    The returned function maps an observation to the values of all actions.

    Parameters
    ----------
    hiddens: [int]
        list of sizes of hidden layers
    layer_norm: bool
        if true, layer normalization is applied to every hidden layer
        before its ReLU

    Returns
    -------
    q_func: function
        q_function for DQN algorithm. Its positional and keyword arguments
        are forwarded to `_mlp` after `hiddens`
        (i.e. `inpt, num_actions, scope, reuse=False`).
    """
    def q_func(*args, **kwargs):
        # `hiddens` and `layer_norm` are fixed here; everything else comes
        # from the caller.
        return _mlp(hiddens, *args, layer_norm=layer_norm, **kwargs)

    return q_func


def _cnn_to_mlp(convs, hiddens, dueling, inpt, num_actions, scope, reuse=False, layer_norm=False):
    """Build a convolutional Q-network inside the variable scope `scope`.

    The input goes through the convolutions listed in `convs` (sub-scope
    "convnet"), is flattened, and then feeds an "action_value" head made of
    the `hiddens` fully connected layers plus a linear layer with
    `num_actions` outputs. When `dueling` is true, a second "state_value"
    head with a single output is built from the same flattened features and
    added to the mean-centered action scores.

    Parameters
    ----------
    convs: [(int, int, int)]
        convolutional layers as (num_outputs, kernel_size, stride)
    hiddens: [int]
        list of sizes of hidden layers used in each head
    dueling: bool
        whether to add the "state_value" head
    inpt: tensor
        input fed to the first convolution
    num_actions: int
        number of outputs of the action head
    scope: str
        variable scope the network is created in
    reuse: bool
        forwarded to `tf.variable_scope`
    layer_norm: bool
        if true, apply layer normalization before each hidden ReLU

    Returns
    -------
    q_out: tensor
        the action scores, or state score plus centered action scores when
        `dueling` is true
    """
    def dense_stack(tensor):
        # linear -> (optional layer norm) -> ReLU, once per entry of `hiddens`
        for width in hiddens:
            tensor = layers.fully_connected(tensor, num_outputs=width, activation_fn=None)
            if layer_norm:
                tensor = layers.layer_norm(tensor, center=True, scale=True)
            tensor = tf.nn.relu(tensor)
        return tensor

    with tf.variable_scope(scope, reuse=reuse):
        with tf.variable_scope("convnet"):
            feature_map = inpt
            for filters, kernel, step in convs:
                feature_map = layers.convolution2d(
                    feature_map,
                    num_outputs=filters,
                    kernel_size=kernel,
                    stride=step,
                    activation_fn=tf.nn.relu,
                )
        conv_out = layers.flatten(feature_map)

        with tf.variable_scope("action_value"):
            action_scores = layers.fully_connected(
                dense_stack(conv_out), num_outputs=num_actions, activation_fn=None)

        if not dueling:
            return action_scores

        with tf.variable_scope("state_value"):
            state_score = layers.fully_connected(
                dense_stack(conv_out), num_outputs=1, activation_fn=None)

        # Subtract the mean over axis 1 from the action scores before adding
        # the state score.
        mean_action_score = tf.reduce_mean(action_scores, 1)
        centered_action_scores = action_scores - tf.expand_dims(mean_action_score, 1)
        return state_score + centered_action_scores


def cnn_to_mlp(convs, hiddens, dueling=False, layer_norm=False):
    """Create a q_function builder backed by a convnet followed by an MLP.

    The returned function maps an observation to the values of all actions.

    Parameters
    ----------
    convs: [(int, int, int)]
        list of convolutional layers in form of
        (num_outputs, kernel_size, stride)
    hiddens: [int]
        list of sizes of hidden layers
    dueling: bool
        if true double the output MLP to compute a baseline
        for action scores
    layer_norm: bool
        if true, layer normalization is applied to every hidden fully
        connected layer before its ReLU

    Returns
    -------
    q_func: function
        q_function for DQN algorithm. Its positional and keyword arguments
        are forwarded to `_cnn_to_mlp` after `convs, hiddens, dueling`
        (i.e. `inpt, num_actions, scope, reuse=False`).
    """
    def q_func(*args, **kwargs):
        # Architecture settings are fixed here; everything else comes from
        # the caller.
        return _cnn_to_mlp(convs, hiddens, dueling, *args, layer_norm=layer_norm, **kwargs)

    return q_func


def build_q_func(network, hiddens=[256], dueling=True, layer_norm=False, **network_kwargs):
    """Create a q_function builder on top of a feature-extraction network.

    Parameters
    ----------
    network: str or function
        when a string, it is resolved with
        `baselines.common.models.get_network_builder` and instantiated with
        `network_kwargs`; otherwise it is called directly on the input
        placeholder
    hiddens: [int]
        list of sizes of hidden layers placed after the network output
    dueling: bool
        if true, add a "state_value" head whose single output is summed with
        the mean-centered action scores
    layer_norm: bool
        if true, apply layer normalization before each hidden ReLU
    network_kwargs:
        keyword arguments for the network builder (used only when `network`
        is a string)

    Returns
    -------
    q_func_builder: function
        function `(input_placeholder, num_actions, scope, reuse=False)`
        returning the q-value tensor

    Raises
    ------
    NotImplementedError
        raised by the returned builder when the network returns a tuple whose
        second element is not None
    """
    if isinstance(network, str):
        from baselines.common.models import get_network_builder
        network = get_network_builder(network)(**network_kwargs)

    def head(tensor, num_outputs):
        # `hiddens` x (linear -> optional layer norm -> ReLU), then a final
        # linear layer without activation.
        for width in hiddens:
            tensor = layers.fully_connected(tensor, num_outputs=width, activation_fn=None)
            if layer_norm:
                tensor = layers.layer_norm(tensor, center=True, scale=True)
            tensor = tf.nn.relu(tensor)
        return layers.fully_connected(tensor, num_outputs=num_outputs, activation_fn=None)

    def q_func_builder(input_placeholder, num_actions, scope, reuse=False):
        with tf.variable_scope(scope, reuse=reuse):
            net_output = network(input_placeholder)
            # A network may return either a tensor or a tuple; for a tuple,
            # only a None second element is accepted.
            if not isinstance(net_output, tuple):
                latent = net_output
            elif net_output[1] is None:
                latent = net_output[0]
            else:
                raise NotImplementedError("DQN is not compatible with recurrent policies yet")

            latent = layers.flatten(latent)

            with tf.variable_scope("action_value"):
                action_scores = head(latent, num_actions)

            if not dueling:
                return action_scores

            with tf.variable_scope("state_value"):
                state_score = head(latent, 1)

            # Subtract the mean over axis 1 from the action scores before
            # adding the state score.
            mean_action_score = tf.reduce_mean(action_scores, 1)
            centered_action_scores = action_scores - tf.expand_dims(mean_action_score, 1)
            return state_score + centered_action_scores

    return q_func_builder
Initial commit 2017-05-17 14:41:46 -07:00			`import tensorflow as tf`
			`import tensorflow.contrib.layers as layers`


Parameter space noise for DQN and DDPG (#75) * Export param noise * Update documentation * Final finishing touches 2017-07-27 08:10:59 -07:00			`def _mlp(hiddens, inpt, num_actions, scope, reuse=False, layer_norm=False):`
Initial commit 2017-05-17 14:41:46 -07:00			`with tf.variable_scope(scope, reuse=reuse):`
			`out = inpt`
			`for hidden in hiddens:`
Parameter space noise for DQN and DDPG (#75) * Export param noise * Update documentation * Final finishing touches 2017-07-27 08:10:59 -07:00			`out = layers.fully_connected(out, num_outputs=hidden, activation_fn=None)`
			`if layer_norm:`
			`out = layers.layer_norm(out, center=True, scale=True)`
			`out = tf.nn.relu(out)`
			`q_out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None)`
			`return q_out`
Initial commit 2017-05-17 14:41:46 -07:00

Parameter space noise for DQN and DDPG (#75) * Export param noise * Update documentation * Final finishing touches 2017-07-27 08:10:59 -07:00			`def mlp(hiddens=[], layer_norm=False):`
Initial commit 2017-05-17 14:41:46 -07:00			`"""This model takes as input an observation and returns values of all actions.`

			`Parameters`
			`----------`
			`hiddens: [int]`
			`list of sizes of hidden layers`

			`Returns`
			`-------`
			`q_func: function`
			`q_function for DQN algorithm.`
			`"""`
Parameter space noise for DQN and DDPG (#75) * Export param noise * Update documentation * Final finishing touches 2017-07-27 08:10:59 -07:00			`return lambda args, kwargs: _mlp(hiddens, layer_norm=layer_norm, args, **kwargs)`
Initial commit 2017-05-17 14:41:46 -07:00

Parameter space noise for DQN and DDPG (#75) * Export param noise * Update documentation * Final finishing touches 2017-07-27 08:10:59 -07:00			`def _cnn_to_mlp(convs, hiddens, dueling, inpt, num_actions, scope, reuse=False, layer_norm=False):`
Initial commit 2017-05-17 14:41:46 -07:00			`with tf.variable_scope(scope, reuse=reuse):`
			`out = inpt`
			`with tf.variable_scope("convnet"):`
			`for num_outputs, kernel_size, stride in convs:`
			`out = layers.convolution2d(out,`
			`num_outputs=num_outputs,`
			`kernel_size=kernel_size,`
			`stride=stride,`
			`activation_fn=tf.nn.relu)`
Parameter space noise for DQN and DDPG (#75) * Export param noise * Update documentation * Final finishing touches 2017-07-27 08:10:59 -07:00			`conv_out = layers.flatten(out)`
Initial commit 2017-05-17 14:41:46 -07:00			`with tf.variable_scope("action_value"):`
Parameter space noise for DQN and DDPG (#75) * Export param noise * Update documentation * Final finishing touches 2017-07-27 08:10:59 -07:00			`action_out = conv_out`
Initial commit 2017-05-17 14:41:46 -07:00			`for hidden in hiddens:`
Parameter space noise for DQN and DDPG (#75) * Export param noise * Update documentation * Final finishing touches 2017-07-27 08:10:59 -07:00			`action_out = layers.fully_connected(action_out, num_outputs=hidden, activation_fn=None)`
			`if layer_norm:`
			`action_out = layers.layer_norm(action_out, center=True, scale=True)`
			`action_out = tf.nn.relu(action_out)`
Initial commit 2017-05-17 14:41:46 -07:00			`action_scores = layers.fully_connected(action_out, num_outputs=num_actions, activation_fn=None)`

			`if dueling:`
			`with tf.variable_scope("state_value"):`
Parameter space noise for DQN and DDPG (#75) * Export param noise * Update documentation * Final finishing touches 2017-07-27 08:10:59 -07:00			`state_out = conv_out`
Initial commit 2017-05-17 14:41:46 -07:00			`for hidden in hiddens:`
Parameter space noise for DQN and DDPG (#75) * Export param noise * Update documentation * Final finishing touches 2017-07-27 08:10:59 -07:00			`state_out = layers.fully_connected(state_out, num_outputs=hidden, activation_fn=None)`
			`if layer_norm:`
			`state_out = layers.layer_norm(state_out, center=True, scale=True)`
			`state_out = tf.nn.relu(state_out)`
Initial commit 2017-05-17 14:41:46 -07:00			`state_score = layers.fully_connected(state_out, num_outputs=1, activation_fn=None)`
			`action_scores_mean = tf.reduce_mean(action_scores, 1)`
			`action_scores_centered = action_scores - tf.expand_dims(action_scores_mean, 1)`
Parameter space noise for DQN and DDPG (#75) * Export param noise * Update documentation * Final finishing touches 2017-07-27 08:10:59 -07:00			`q_out = state_score + action_scores_centered`
Initial commit 2017-05-17 14:41:46 -07:00			`else:`
Parameter space noise for DQN and DDPG (#75) * Export param noise * Update documentation * Final finishing touches 2017-07-27 08:10:59 -07:00			`q_out = action_scores`
			`return q_out`
Initial commit 2017-05-17 14:41:46 -07:00

Parameter space noise for DQN and DDPG (#75) * Export param noise * Update documentation * Final finishing touches 2017-07-27 08:10:59 -07:00			`def cnn_to_mlp(convs, hiddens, dueling=False, layer_norm=False):`
Initial commit 2017-05-17 14:41:46 -07:00			`"""This model takes as input an observation and returns values of all actions.`

			`Parameters`
			`----------`
			`convs: [(int, int int)]`
			`list of convolutional layers in form of`
			`(num_outputs, kernel_size, stride)`
			`hiddens: [int]`
			`list of sizes of hidden layers`
			`dueling: bool`
			`if true double the output MLP to compute a baseline`
			`for action scores`

			`Returns`
			`-------`
			`q_func: function`
			`q_function for DQN algorithm.`
			`"""`

Parameter space noise for DQN and DDPG (#75) * Export param noise * Update documentation * Final finishing touches 2017-07-27 08:10:59 -07:00			`return lambda args, kwargs: _cnn_to_mlp(convs, hiddens, dueling, layer_norm=layer_norm, args, **kwargs)`
Initial commit 2017-05-17 14:41:46 -07:00
refactor a2c, acer, acktr, ppo2, deepq, and trpo_mpi (#490) * exported rl-algs * more stuff from rl-algs * run slow tests * re-exported rl_algs * re-exported rl_algs - fixed problems with serialization test and test_cartpole * replaced atari_arg_parser with common_arg_parser * run.py can run algos from both baselines and rl_algs * added approximate humanoid reward with ppo2 into the README for reference * dummy commit to RUN BENCHMARKS * dummy commit to RUN BENCHMARKS * dummy commit to RUN BENCHMARKS * dummy commit to RUN BENCHMARKS * very dummy commit to RUN BENCHMARKS * serialize variables as a dict, not as a list * running_mean_std uses tensorflow variables * fixed import in vec_normalize * dummy commit to RUN BENCHMARKS * dummy commit to RUN BENCHMARKS * flake8 complaints * save all variables to make sure we save the vec_normalize normalization * benchmarks on ppo2 only RUN BENCHMARKS * make_atari_env compatible with mpi * run ppo_mpi benchmarks only RUN BENCHMARKS * hardcode names of retro environments * add defaults * changed default ppo2 lr schedule to linear RUN BENCHMARKS * non-tf normalization benchmark RUN BENCHMARKS * use ncpu=1 for mujoco sessions - gives a bit of a performance speedup * reverted running_mean_std to user property decorators for mean, var, count * reverted VecNormalize to use RunningMeanStd (no tf) * reverted VecNormalize to use RunningMeanStd (no tf) * profiling wip * use VecNormalize with regular RunningMeanStd * added acer runner (missing import) * flake8 complaints * added a note in README about TfRunningMeanStd and serialization of VecNormalize * dummy commit to RUN BENCHMARKS * merged benchmarks branch 2018-08-13 09:56:44 -07:00

			`def build_q_func(network, hiddens=[256], dueling=True, layer_norm=False, **network_kwargs):`
			`if isinstance(network, str):`
			`from baselines.common.models import get_network_builder`
tighten flake8, autopep8 to fix trailing whitespaces and blank lines with whitespaces (#87) 2018-09-11 11:01:51 -07:00			`network = get_network_builder(network)(**network_kwargs)`

refactor a2c, acer, acktr, ppo2, deepq, and trpo_mpi (#490) * exported rl-algs * more stuff from rl-algs * run slow tests * re-exported rl_algs * re-exported rl_algs - fixed problems with serialization test and test_cartpole * replaced atari_arg_parser with common_arg_parser * run.py can run algos from both baselines and rl_algs * added approximate humanoid reward with ppo2 into the README for reference * dummy commit to RUN BENCHMARKS * dummy commit to RUN BENCHMARKS * dummy commit to RUN BENCHMARKS * dummy commit to RUN BENCHMARKS * very dummy commit to RUN BENCHMARKS * serialize variables as a dict, not as a list * running_mean_std uses tensorflow variables * fixed import in vec_normalize * dummy commit to RUN BENCHMARKS * dummy commit to RUN BENCHMARKS * flake8 complaints * save all variables to make sure we save the vec_normalize normalization * benchmarks on ppo2 only RUN BENCHMARKS * make_atari_env compatible with mpi * run ppo_mpi benchmarks only RUN BENCHMARKS * hardcode names of retro environments * add defaults * changed default ppo2 lr schedule to linear RUN BENCHMARKS * non-tf normalization benchmark RUN BENCHMARKS * use ncpu=1 for mujoco sessions - gives a bit of a performance speedup * reverted running_mean_std to user property decorators for mean, var, count * reverted VecNormalize to use RunningMeanStd (no tf) * reverted VecNormalize to use RunningMeanStd (no tf) * profiling wip * use VecNormalize with regular RunningMeanStd * added acer runner (missing import) * flake8 complaints * added a note in README about TfRunningMeanStd and serialization of VecNormalize * dummy commit to RUN BENCHMARKS * merged benchmarks branch 2018-08-13 09:56:44 -07:00			`def q_func_builder(input_placeholder, num_actions, scope, reuse=False):`
			`with tf.variable_scope(scope, reuse=reuse):`
Refactor DDPG (#111) * run ddpg on Mujoco benchmark RUN BENCHMARKS * autopep8 * fixed all syntax in refactored ddpg * a little bit more refactoring * autopep8 * identity test with ddpg WIP * enable test_identity with ddpg * refactored ddpg RUN BENCHMARKS * autopep8 * include ddpg into style check * fixing tests RUN BENCHMARKS * set default seed to None RUN BENCHMARKS * run tests and benchmarks in separate buildkite steps RUN BENCHMARKS * cleanup pdb usage * flake8 and cleanups * re-enabled all benchmarks in run-benchmarks-new.py * flake8 complaints * deepq model builder compatible with network functions returning single tensor * remove ddpg test with test_discrete_identity * make ppo_metal use make_vec_env instead of make_atari_env * make ppo_metal use make_vec_env instead of make_atari_env * fixed syntax in ppo_metal.run_atari 2018-09-26 15:28:52 -07:00			`latent = network(input_placeholder)`
			`if isinstance(latent, tuple):`
			`if latent[1] is not None:`
			`raise NotImplementedError("DQN is not compatible with recurrent policies yet")`
			`latent = latent[0]`

refactor a2c, acer, acktr, ppo2, deepq, and trpo_mpi (#490) * exported rl-algs * more stuff from rl-algs * run slow tests * re-exported rl_algs * re-exported rl_algs - fixed problems with serialization test and test_cartpole * replaced atari_arg_parser with common_arg_parser * run.py can run algos from both baselines and rl_algs * added approximate humanoid reward with ppo2 into the README for reference * dummy commit to RUN BENCHMARKS * dummy commit to RUN BENCHMARKS * dummy commit to RUN BENCHMARKS * dummy commit to RUN BENCHMARKS * very dummy commit to RUN BENCHMARKS * serialize variables as a dict, not as a list * running_mean_std uses tensorflow variables * fixed import in vec_normalize * dummy commit to RUN BENCHMARKS * dummy commit to RUN BENCHMARKS * flake8 complaints * save all variables to make sure we save the vec_normalize normalization * benchmarks on ppo2 only RUN BENCHMARKS * make_atari_env compatible with mpi * run ppo_mpi benchmarks only RUN BENCHMARKS * hardcode names of retro environments * add defaults * changed default ppo2 lr schedule to linear RUN BENCHMARKS * non-tf normalization benchmark RUN BENCHMARKS * use ncpu=1 for mujoco sessions - gives a bit of a performance speedup * reverted running_mean_std to user property decorators for mean, var, count * reverted VecNormalize to use RunningMeanStd (no tf) * reverted VecNormalize to use RunningMeanStd (no tf) * profiling wip * use VecNormalize with regular RunningMeanStd * added acer runner (missing import) * flake8 complaints * added a note in README about TfRunningMeanStd and serialization of VecNormalize * dummy commit to RUN BENCHMARKS * merged benchmarks branch 2018-08-13 09:56:44 -07:00			`latent = layers.flatten(latent)`

			`with tf.variable_scope("action_value"):`
			`action_out = latent`
			`for hidden in hiddens:`
			`action_out = layers.fully_connected(action_out, num_outputs=hidden, activation_fn=None)`
			`if layer_norm:`
			`action_out = layers.layer_norm(action_out, center=True, scale=True)`
			`action_out = tf.nn.relu(action_out)`
			`action_scores = layers.fully_connected(action_out, num_outputs=num_actions, activation_fn=None)`

			`if dueling:`
			`with tf.variable_scope("state_value"):`
			`state_out = latent`
			`for hidden in hiddens:`
			`state_out = layers.fully_connected(state_out, num_outputs=hidden, activation_fn=None)`
			`if layer_norm:`
			`state_out = layers.layer_norm(state_out, center=True, scale=True)`
			`state_out = tf.nn.relu(state_out)`
			`state_score = layers.fully_connected(state_out, num_outputs=1, activation_fn=None)`
			`action_scores_mean = tf.reduce_mean(action_scores, 1)`
			`action_scores_centered = action_scores - tf.expand_dims(action_scores_mean, 1)`
			`q_out = state_score + action_scores_centered`
			`else:`
			`q_out = action_scores`
			`return q_out`
tighten flake8, autopep8 to fix trailing whitespaces and blank lines with whitespaces (#87) 2018-09-11 11:01:51 -07:00
refactor a2c, acer, acktr, ppo2, deepq, and trpo_mpi (#490) * exported rl-algs * more stuff from rl-algs * run slow tests * re-exported rl_algs * re-exported rl_algs - fixed problems with serialization test and test_cartpole * replaced atari_arg_parser with common_arg_parser * run.py can run algos from both baselines and rl_algs * added approximate humanoid reward with ppo2 into the README for reference * dummy commit to RUN BENCHMARKS * dummy commit to RUN BENCHMARKS * dummy commit to RUN BENCHMARKS * dummy commit to RUN BENCHMARKS * very dummy commit to RUN BENCHMARKS * serialize variables as a dict, not as a list * running_mean_std uses tensorflow variables * fixed import in vec_normalize * dummy commit to RUN BENCHMARKS * dummy commit to RUN BENCHMARKS * flake8 complaints * save all variables to make sure we save the vec_normalize normalization * benchmarks on ppo2 only RUN BENCHMARKS * make_atari_env compatible with mpi * run ppo_mpi benchmarks only RUN BENCHMARKS * hardcode names of retro environments * add defaults * changed default ppo2 lr schedule to linear RUN BENCHMARKS * non-tf normalization benchmark RUN BENCHMARKS * use ncpu=1 for mujoco sessions - gives a bit of a performance speedup * reverted running_mean_std to user property decorators for mean, var, count * reverted VecNormalize to use RunningMeanStd (no tf) * reverted VecNormalize to use RunningMeanStd (no tf) * profiling wip * use VecNormalize with regular RunningMeanStd * added acer runner (missing import) * flake8 complaints * added a note in README about TfRunningMeanStd and serialization of VecNormalize * dummy commit to RUN BENCHMARKS * merged benchmarks branch 2018-08-13 09:56:44 -07:00			`return q_func_builder`