tighten flake8, autopep8 to fix trailing whitespaces and blank lines with whitespaces (#87)

This commit is contained in:
pzhokhov
2018-09-11 11:01:51 -07:00
committed by Peter Zhokhov
parent e56803491f
commit 9070ee7ef3
43 changed files with 176 additions and 176 deletions

View File

@@ -97,21 +97,21 @@ def learn(
load_path=None,
**network_kwargs):
'''
Main entrypoint for A2C algorithm. Train a policy with given network architecture on a given environment using a2c algorithm.
Parameters:
-----------
network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
See baselines.common/policies.py/lstm for more details on using recurrent nets in policies
env: RL environment. Should implement interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py)
seed: seed to make random number sequence in the algorithm reproducible. By default is None which means seed from system noise generator (not reproducible)
@@ -128,7 +128,7 @@ def learn(
lr: float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4)
lrschedule: schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and
returns fraction of the learning rate (specified as lr) as output
epsilon: float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5)
@@ -140,17 +140,17 @@ def learn(
log_interval: int, specifies how frequently the logs are printed out (default: 100)
**network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
For instance, 'mlp' network architecture has arguments num_hidden and num_layers.
'''
set_global_seeds(seed)
nenvs = env.num_envs
policy = build_policy(env, network, **network_kwargs)
model = Model(policy=policy, env=env, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule)
if load_path is not None:
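As an aside, the learn entry point documented above can be exercised directly. A minimal usage sketch, assuming gym and a single CartPole environment wrapped in DummyVecEnv; the hyperparameter values are illustrative and not taken from this commit:

    # a2c.learn usage sketch (illustrative; not part of this diff)
    import gym
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
    from baselines.a2c import a2c

    env = DummyVecEnv([lambda: gym.make('CartPole-v0')])   # vectorized env, as the docstring requires
    model = a2c.learn(network='mlp', env=env, seed=0,
                      total_timesteps=20000, lr=7e-4, lrschedule='linear')
    obs = env.reset()
    actions, values, states, neglogps = model.step(obs)    # trained model exposes the policy step() interface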

View File

@@ -9,7 +9,7 @@ class Runner(AbstractEnvRunner):
self.gamma = gamma
self.batch_action_shape = [x if x is not None else -1 for x in model.train_model.action.shape.as_list()]
self.ob_dtype = model.train_model.X.dtype.as_numpy_dtype
def run(self):
mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
mb_states = self.states
@@ -51,7 +51,7 @@ class Runner(AbstractEnvRunner):
rewards = discount_with_dones(rewards, dones, self.gamma)
mb_rewards[n] = rewards
mb_actions = mb_actions.reshape(self.batch_action_shape)
mb_rewards = mb_rewards.flatten()

View File

@@ -70,7 +70,7 @@ class Model(object):
MU = tf.placeholder(tf.float32, [nbatch, nact]) # mu's
LR = tf.placeholder(tf.float32, [])
eps = 1e-6
step_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs,) + ob_space.shape[:-1] + (ob_space.shape[-1] * nstack,))
train_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs*(nsteps+1),) + ob_space.shape[:-1] + (ob_space.shape[-1] * nstack,))
with tf.variable_scope('acer_model', reuse=tf.AUTO_REUSE):
@@ -78,7 +78,7 @@ class Model(object):
step_model = policy(observ_placeholder=step_ob_placeholder, sess=sess)
train_model = policy(observ_placeholder=train_ob_placeholder, sess=sess)
params = find_trainable_variables("acer_model")
print("Params {}".format(len(params)))
for var in params:
@@ -97,10 +97,10 @@ class Model(object):
polyak_model = policy(observ_placeholder=train_ob_placeholder, sess=sess)
# Notation: (var) = batch variable, (var)s = sequence variable, (var)_i = variable index by action at step i
# action probability distributions according to train_model, polyak_model and step_model
# policy.pi is probability distribution parameters; to obtain distribution that sums to 1 need to take softmax
train_model_p = tf.nn.softmax(train_model.pi)
polyak_model_p = tf.nn.softmax(polyak_model.pi)
step_model_p = tf.nn.softmax(step_model.pi)
v = tf.reduce_sum(train_model_p * train_model.q, axis = -1) # shape is [nenvs * (nsteps + 1)]
@@ -119,7 +119,7 @@ class Model(object):
qret = q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma)
# Calculate losses
# Entropy
# entropy = tf.reduce_mean(strip(train_model.pd.entropy(), nenvs, nsteps))
entropy = tf.reduce_mean(cat_entropy_softmax(f))
@@ -212,8 +212,8 @@ class Model(object):
def _step(observation, **kwargs):
return step_model._evaluate([step_model.action, step_model_p, step_model.state], observation, **kwargs)
self.train = train
self.save = functools.partial(save_variables, sess=sess, variables=params)
@@ -283,18 +283,18 @@ def learn(network, env, seed=None, nsteps=20, nstack=4, total_timesteps=int(80e6
----------
network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
See baselines.common/policies.py/lstm for more details on using recurrent nets in policies
env: environment. Needs to be vectorized for parallel environment simulation.
The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.
nsteps: int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
nenv is number of environment copies simulated in parallel) (default: 20)
nstack: int, size of the frame stack, i.e. number of the frames passed to the step model. Frames are stacked along channel dimension
(last image dimension) (default: 4)
total_timesteps: int, number of timesteps (i.e. number of actions taken in the environment) (default: 80M)
@@ -303,11 +303,11 @@ def learn(network, env, seed=None, nsteps=20, nstack=4, total_timesteps=int(80e6
ent_coef: float, policy entropy coefficient in the optimization objective (default: 0.01)
max_grad_norm: float, gradient norm clipping coefficient. If set to None, no clipping. (default: 10),
lr: float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4)
lrschedule: schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and
returns fraction of the learning rate (specified as lr) as output
rprop_epsilon: float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5)
@@ -325,17 +325,17 @@ def learn(network, env, seed=None, nsteps=20, nstack=4, total_timesteps=int(80e6
replay_start: int, the sampling from the replay buffer does not start until replay buffer has at least that many samples (default: 10k)
c: float, importance weight clipping factor (default: 10)
trust_region bool, whether or not algorithms estimates the gradient KL divergence between the old and updated policy and uses it to determine step size (default: True)
delta: float, max KL divergence between the old policy and updated policy (default: 1)
alpha: float, momentum factor in the Polyak (exponential moving average) averaging of the model parameters (default: 0.99)
load_path: str, path to load the model from (default: None)
**network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
For instance, 'mlp' network architecture has arguments num_hidden and num_layers.
'''
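A comparable usage sketch for the ACER entry point documented above, assuming an Atari environment vectorized with make_vec_env from baselines.common.cmd_util; the environment id and hyperparameter values are illustrative:

    # acer.learn usage sketch (illustrative; not part of this diff)
    from baselines.common.cmd_util import make_vec_env
    from baselines.acer import acer

    env = make_vec_env('PongNoFrameskip-v4', 'atari', num_env=4, seed=0)
    model = acer.learn(network='cnn', env=env, seed=0, nsteps=20, nstack=4,
                       total_timesteps=int(1e6), replay_ratio=4, trust_region=True)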

View File

@@ -1,2 +1,2 @@
from baselines.bench.benchmarks import *
from baselines.bench.monitor import *

View File

@@ -102,7 +102,7 @@ def get_monitor_files(dir):
def load_results(dir):
import pandas
monitor_files = (
glob(osp.join(dir, "*monitor.json")) +
glob(osp.join(dir, "*monitor.csv"))) # get both csv and (old) json files
if not monitor_files:
raise LoadMonitorResultsError("no monitor files of the form *%s found in %s" % (Monitor.EXT, dir))

View File

@@ -31,4 +31,4 @@ def cg(f_Ax, b, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10):
if callback is not None:
callback(x)
if verbose: print(fmtstr % (i+1, rdotr, np.linalg.norm(x))) # pylint: disable=W0631
return x

View File

@@ -29,7 +29,7 @@ def make_vec_env(env_id, env_type, num_env, seed, wrapper_kwargs=None, start_ind
def _thunk():
env = make_atari(env_id) if env_type == 'atari' else gym.make(env_id)
env.seed(seed + 10000*mpi_rank + rank if seed is not None else None)
env = Monitor(env,
logger.get_dir() and os.path.join(logger.get_dir(), str(mpi_rank) + '.' + str(rank)),
allow_early_resets=True)

View File

@@ -2,7 +2,7 @@ from __future__ import print_function
from contextlib import contextmanager
import numpy as np
import time
import shlex
import subprocess
# ================================================================

View File

@@ -2,15 +2,15 @@ import tensorflow as tf
from gym.spaces import Discrete, Box
def observation_placeholder(ob_space, batch_size=None, name='Ob'):
'''
Create placeholder to feed observations into of the size appropriate to the observation space
Parameters:
----------
ob_space: gym.Space observation space
batch_size: int size of the batch to be fed into input. Can be left None in most cases.
name: str name of the placeholder
@@ -27,9 +27,9 @@ def observation_placeholder(ob_space, batch_size=None, name='Ob'):
def observation_input(ob_space, batch_size=None, name='Ob'):
'''
Create placeholder to feed observations into of the size appropriate to the observation space, and add input
encoder of the appropriate type.
'''
placeholder = observation_placeholder(ob_space, batch_size, name)
@@ -41,9 +41,9 @@ def encode_observation(ob_space, placeholder):
Parameters:
----------
ob_space: gym.Space observation space
placeholder: tf.placeholder observation input placeholder
'''
if isinstance(ob_space, Discrete):
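A small illustrative sketch of the input helpers documented above; the Box space and the names ob_ph, ph, encoded are examples, not part of this diff:

    # observation_placeholder / observation_input usage sketch (illustrative)
    import numpy as np
    from gym.spaces import Box
    from baselines.common.input import observation_placeholder, observation_input

    ob_space = Box(low=-np.inf, high=np.inf, shape=(4,), dtype=np.float32)
    ob_ph = observation_placeholder(ob_space, batch_size=None, name='Ob')    # raw placeholder for observations
    ph, encoded = observation_input(ob_space, batch_size=None, name='Ob2')   # placeholder plus float encoding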

View File

@@ -82,4 +82,4 @@ def test_discount_with_boundaries():
2 + gamma * 3,
3,
4
])

View File

@@ -76,4 +76,4 @@ def test_MpiAdam():
for i in range(10):
l,g = lossandgrad()
adam.update(g, stepsize)
print(i,l)

View File

@@ -4,7 +4,7 @@ def mpi_fork(n, bind_to_core=False):
"""Re-launches the current script with workers
Returns "parent" for original parent, "child" for MPI children
"""
if n<=1:
return "child"
if os.getenv("IN_MPI") is None:
env = os.environ.copy()
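The docstring above describes the fork protocol; a hedged usage sketch follows. The module path baselines.common.mpi_fork is assumed from the repository layout, and the parent-exit convention is the usual one:

    # mpi_fork usage sketch (illustrative; not part of this diff)
    import sys
    from baselines.common.mpi_fork import mpi_fork

    whoami = mpi_fork(4)      # re-launches this script under mpirun with 4 workers
    if whoami == "parent":
        sys.exit(0)           # the original process is done once the children finish
    # ... MPI children continue with the actual work here ...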

View File

@@ -33,8 +33,8 @@ def mpi_moments(x, axis=0, comm=None, keepdims=False):
def test_runningmeanstd():
import subprocess
subprocess.check_call(['mpirun', '-np', '3',
'python','-c',
'from baselines.common.mpi_moments import _helper_runningmeanstd; _helper_runningmeanstd()'])
def _helper_runningmeanstd():

View File

@@ -32,7 +32,7 @@ class PolicyWithValue(object):
**tensors tensorflow tensors for additional attributes such as state or mask
"""
self.X = observations
self.state = tf.constant([])
self.initial_state = None
@@ -85,7 +85,7 @@ class PolicyWithValue(object):
-------
(action, value estimate, next state, negative log likelihood of the action under current policy parameters) tuple
"""
a, v, state, neglogp = self._evaluate([self.action, self.vf, self.state, self.neglogp], observation, **extra_feed)
if state.size == 0:
state = None
@@ -106,14 +106,14 @@ class PolicyWithValue(object):
-------
value estimate
"""
return self._evaluate(self.vf, ob, *args, **kwargs)
def save(self, save_path):
tf_util.save_state(save_path, sess=self.sess)
def load(self, load_path):
tf_util.load_state(load_path, sess=self.sess)
def build_policy(env, policy_network, value_network=None, normalize_observations=False, estimate_q=False, **policy_kwargs):
if isinstance(policy_network, str):
network_type = policy_network
@@ -123,7 +123,7 @@ def build_policy(env, policy_network, value_network=None, normalize_observation
ob_space = env.observation_space
X = observ_placeholder if observ_placeholder is not None else observation_placeholder(ob_space, batch_size=nbatch)
extra_tensors = {}
if normalize_observations and X.dtype == tf.float32:
@@ -144,7 +144,7 @@ def build_policy(env, policy_network, value_network=None, normalize_observation
policy_latent, recurrent_tensors = policy_network(encoded_x, nenv)
extra_tensors.update(recurrent_tensors)
_v_net = value_network
if _v_net is None or _v_net == 'shared':
@@ -154,10 +154,10 @@ def build_policy(env, policy_network, value_network=None, normalize_observation
_v_net = policy_network
else:
assert callable(_v_net)
with tf.variable_scope('vf', reuse=tf.AUTO_REUSE):
vf_latent, _ = _v_net(encoded_x)
policy = PolicyWithValue(
env=env,
observations=X,
@@ -176,4 +176,4 @@ def _normalize_clip_observation(x, clip_range=[-5.0, 5.0]):
rms = RunningMeanStd(shape=x.shape[1:])
norm_x = tf.clip_by_value((x - rms.mean) / rms.std, min(clip_range), max(clip_range))
return norm_x, rms
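Continuing the a2c sketch inserted after the a2c/a2c.py hunks above, a trained model exposes the step/value interface documented in these hunks; the names env and model come from that sketch and the save path is illustrative:

    # step/value/save on a trained policy (names from the a2c sketch above; not part of this diff)
    obs = env.reset()
    actions, values, states, neglogps = model.step(obs)   # (action, value estimate, next state, neglogp) tuple
    values_only = model.value(obs)                        # value estimates alone
    model.save('/tmp/a2c_cartpole_model')                 # persist trained variables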

View File

@@ -23,15 +23,15 @@ def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var,
delta = batch_mean - mean
tot_count = count + batch_count
new_mean = mean + delta * batch_count / tot_count
m_a = var * count
m_b = batch_var * batch_count
M2 = m_a + m_b + np.square(delta) * count * batch_count / (count + batch_count)
new_var = M2 / (count + batch_count)
new_count = batch_count + count
return new_mean, new_var, new_count
class TfRunningMeanStd(object):
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
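The function above implements the parallel mean/variance merge referenced by the Wikipedia link; a quick numpy sanity check (illustrative, not part of this diff):

    # Sanity check for update_mean_var_count_from_moments (illustrative)
    import numpy as np
    from baselines.common.running_mean_std import update_mean_var_count_from_moments

    a, b = np.random.randn(100), np.random.randn(50) + 3.0
    mean, var, count = update_mean_var_count_from_moments(
        a.mean(), a.var(), len(a), b.mean(), b.var(), len(b))
    both = np.concatenate([a, b])
    # merged moments match the moments of the concatenated data
    assert np.allclose([mean, var, count], [both.mean(), both.var(), len(both)])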
@@ -46,10 +46,10 @@ class TfRunningMeanStd(object):
self._new_var = tf.placeholder(shape=shape, dtype=tf.float64)
self._new_count = tf.placeholder(shape=(), dtype=tf.float64)
with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
self._mean = tf.get_variable('mean', initializer=np.zeros(shape, 'float64'), dtype=tf.float64)
self._var = tf.get_variable('std', initializer=np.ones(shape, 'float64'), dtype=tf.float64)
self._count = tf.get_variable('count', initializer=np.full((), epsilon, 'float64'), dtype=tf.float64)
self.update_ops = tf.group([
@@ -61,10 +61,10 @@ class TfRunningMeanStd(object):
sess.run(tf.variables_initializer([self._mean, self._var, self._count]))
self.sess = sess
self._set_mean_var_count()
def _set_mean_var_count(self):
self.mean, self.var, self.count = self.sess.run([self._mean, self._var, self._count])
def update(self, x):
batch_mean = np.mean(x, axis=0)
batch_var = np.var(x, axis=0)
@@ -74,13 +74,13 @@ class TfRunningMeanStd(object):
self.sess.run(self.update_ops, feed_dict={
self._new_mean: new_mean,
self._new_var: new_var,
self._new_count: new_count
})
self._set_mean_var_count()
def test_runningmeanstd():
for (x1, x2, x3) in [
@@ -145,7 +145,7 @@ def profile_tf_runningmeanstd():
print('rms update time ({} trials): {} s'.format(n_trials, tic2 - tic1))
print('tfrms update time ({} trials): {} s'.format(n_trials, tic3 - tic2))
tic1 = time.time()
for _ in range(n_trials):
@@ -161,21 +161,21 @@ def profile_tf_runningmeanstd():
print('rms get mean time ({} trials): {} s'.format(n_trials, tic2 - tic1))
print('tfrms get mean time ({} trials): {} s'.format(n_trials, tic3 - tic2))
'''
options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) #pylint: disable=E1101
run_metadata = tf.RunMetadata()
profile_opts = dict(options=options, run_metadata=run_metadata)
from tensorflow.python.client import timeline
fetched_timeline = timeline.Timeline(run_metadata.step_stats) #pylint: disable=E1101
chrome_trace = fetched_timeline.generate_chrome_trace_format()
outfile = '/tmp/timeline.json'
with open(outfile, 'wt') as f:
f.write(chrome_trace)
print(f'Successfully saved profile to {outfile}. Exiting.')
exit(0)
@@ -184,4 +184,4 @@ def profile_tf_runningmeanstd():
if __name__ == '__main__':
profile_tf_runningmeanstd()

View File

@@ -40,5 +40,5 @@ class FixedSequenceEnv(Env):
def _get_reward(self, actions):
return 1 if actions == self.sequence[self.time] else 0

View File

@@ -15,7 +15,7 @@ class MnistEnv(Env):
no_images=None
):
from tensorflow.examples.tutorials.mnist import input_data
# we could use temporary directory for this with a context manager and
# TemporaryDirectory, but then each test that uses mnist would re-download the data
# this way the data is not cleaned up, but we only download it once per machine
mnist_path = osp.join(tempfile.gettempdir(), 'MNIST_data')
@@ -33,7 +33,7 @@ class MnistEnv(Env):
self.train_mode()
self.reset()
def reset(self):
self._choose_next_state()
self.time = 0

View File

@@ -10,7 +10,7 @@ common_kwargs = dict(
gamma=1.0,
seed=0,
)
learn_kwargs = {
'a2c' : dict(nsteps=32, value_network='copy', lr=0.05),
'acktr': dict(nsteps=32, value_network='copy'),
@@ -31,8 +31,8 @@ def test_cartpole(alg):
kwargs.update(learn_kwargs[alg])
learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs)
def env_fn():
env = gym.make('CartPole-v0')
env.seed(0)
return env

View File

@@ -8,7 +8,7 @@ except BaseException:
@pytest.mark.skipif(
not _mujoco_present,
reason='error loading mujoco - either mujoco / mujoco key not present, or LD_LIBRARY_PATH is not pointing to mujoco library'
)
def test_lstm_example():
@@ -37,12 +37,12 @@ def test_lstm_example():
action, _, state, _ = policy.step(ob, S=state, M=done)
ob, reward, done, _ = venv.step(action)
step_counter += 1
if done:
break
assert step_counter > 5

View File

@@ -8,7 +8,7 @@ common_kwargs = dict(
seed=0,
total_timesteps=50000,
)
learn_kwargs = {
'a2c': {},
'ppo2': dict(nsteps=10, ent_coef=0.0, nminibatches=1),
@@ -36,7 +36,7 @@ def test_fixed_sequence(alg, rnn):
episode_len = 5
env_fn = lambda: FixedSequenceEnv(10, episode_len=episode_len)
learn = lambda e: get_learn_function(alg)(
env=e,
network=rnn,
**kwargs
)
@@ -47,5 +47,5 @@ def test_fixed_sequence(alg, rnn):
if __name__ == '__main__':
test_fixed_sequence('ppo2', 'lstm')

View File

@@ -9,7 +9,7 @@ common_kwargs = dict(
gamma=0.9,
seed=0,
)
learn_kwargs = {
'a2c' : {},
'acktr': {},
@@ -51,5 +51,5 @@ def test_continuous_identity(alg):
simple_test(env_fn, learn_fn, -0.1)
if __name__ == '__main__':
test_continuous_identity('a2c')

View File

@@ -6,7 +6,7 @@ from baselines.common.tests.util import simple_test
from baselines.run import get_learn_function
# TODO investigate a2c and ppo2 failures - is it due to bad hyperparameters for this problem?
# GitHub issue https://github.com/openai/baselines/issues/189
common_kwargs = {
'seed': 0,
@@ -25,21 +25,21 @@ learn_args = {
'trpo_mpi': dict(total_timesteps=80000, timesteps_per_batch=100, cg_iters=10, lam=1.0, max_kl=0.001)
}
#tests pass, but are too slow on travis. Same algorithms are covered
# by other tests with less compute-hungry nn's and by benchmarks
@pytest.mark.skip
@pytest.mark.slow
@pytest.mark.parametrize("alg", learn_args.keys())
def test_mnist(alg):
'''
Test if the algorithm can learn to classify MNIST digits.
Uses CNN policy.
'''
learn_kwargs = learn_args[alg]
learn_kwargs.update(common_kwargs)
learn = get_learn_function(alg)
learn_fn = lambda e: learn(env=e, **learn_kwargs)
env_fn = lambda: MnistEnv(seed=0, episode_len=100)

View File

@@ -14,15 +14,15 @@ from functools import partial
learn_kwargs = {
'deepq': {},
'a2c': {},
'acktr': {},
'ppo2': {'nminibatches': 1, 'nsteps': 10},
'trpo_mpi': {},
}
network_kwargs = {
'mlp': {},
'cnn': {'pad': 'SAME'},
'lstm': {},
'cnn_lnlstm': {'pad': 'SAME'}
}
@@ -32,15 +32,15 @@ network_kwargs = {
@pytest.mark.parametrize("network_fn", network_kwargs.keys())
def test_serialization(learn_fn, network_fn):
'''
Test if the trained model can be serialized
'''
if network_fn.endswith('lstm') and learn_fn in ['acktr', 'trpo_mpi', 'deepq']:
# TODO make acktr work with recurrent policies
# and test
# github issue: https://github.com/openai/baselines/issues/194
return
env = DummyVecEnv([lambda: MnistEnv(10, episode_len=100)])
ob = env.reset().copy()
@@ -74,14 +74,14 @@ def test_serialization(learn_fn, network_fn):
np.testing.assert_allclose(mean1, mean2, atol=0.5)
np.testing.assert_allclose(std1, std2, atol=0.5)
def _serialize_variables():
sess = get_session()
variables = tf.trainable_variables()
values = sess.run(variables)
return {var.name: value for var, value in zip(variables, values)}
def _get_action_stats(model, ob):
ntrials = 1000

View File

@@ -30,7 +30,7 @@ def simple_test(env_fn, learn_fn, min_reward_fraction, n_trials=N_TRIALS):
a, v, state, _ = model.step(obs, S=state, M=[False])
else:
a, v, _, _ = model.step(obs)
obs, rew, done, _ = env.step(a)
sum_rew += float(rew)
@@ -46,7 +46,7 @@ def reward_per_episode_test(env_fn, learn_fn, min_avg_reward, n_trials=N_EPISODE
with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default():
model = learn_fn(env)
N_TRIALS = 100
observations, actions, rewards = rollout(env, model, N_TRIALS)
rewards = [sum(r) for r in rewards]

View File

@@ -347,7 +347,7 @@ def load_variables(load_path, variables=None, sess=None):
variables = variables or tf.trainable_variables()
loaded_params = joblib.load(os.path.expanduser(load_path))
restores = []
if isinstance(loaded_params, list):
assert len(loaded_params) == len(variables), 'number of variables loaded mismatches len(variables)'
for d, v in zip(loaded_params, variables):

View File

@@ -9,8 +9,8 @@ class DummyVecEnv(VecEnv):
env = self.envs[0]
VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space)
obs_space = env.observation_space
self.keys, shapes, dtypes = obs_space_info(obs_space)
self.buf_obs = { k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys }
self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool)
self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32)
@@ -62,7 +62,7 @@ class DummyVecEnv(VecEnv):
def get_images(self):
return [env.render(mode='rgb_array') for env in self.envs]
def render(self, mode='human'):
if self.num_envs == 1:
self.envs[0].render(mode=mode)

View File

@@ -71,7 +71,7 @@ class Memory(object):
def append(self, obs0, action, reward, obs1, terminal1, training=True):
if not training:
return
self.observations0.append(obs0)
self.actions.append(action)
self.rewards.append(reward)

View File

@@ -35,12 +35,12 @@ class Actor(Model):
if self.layer_norm:
x = tc.layers.layer_norm(x, center=True, scale=True)
x = tf.nn.relu(x)
x = tf.layers.dense(x, 64)
if self.layer_norm:
x = tc.layers.layer_norm(x, center=True, scale=True)
x = tf.nn.relu(x)
x = tf.layers.dense(x, self.nb_actions, kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3))
x = tf.nn.tanh(x)
return x

View File

@@ -176,7 +176,7 @@ def learn(env,
load_path: str
path to load the model from. (default: None)
**network_kwargs
additional keyword arguments to pass to the network builder.
Returns
-------
@@ -215,7 +215,7 @@ def learn(env,
}
act = ActWrapper(act, act_params)
# Create the replay buffer
if prioritized_replay:
replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
@@ -246,7 +246,7 @@ def learn(env,
model_file = os.path.join(td, "model")
model_saved = False
if tf.train.latest_checkpoint(td) is not None:
load_variables(model_file)
logger.log('Loaded model from {}'.format(model_file))
@@ -254,7 +254,7 @@ def learn(env,
elif load_path is not None:
load_variables(load_path)
logger.log('Loaded model from {}'.format(load_path))
for t in range(total_timesteps):
if callback is not None:

View File

@@ -7,7 +7,7 @@ from baselines.common import models
def main():
env = gym.make("MountainCar-v0")
act = deepq.learn(
env,
network=models.mlp(num_layers=1, num_hidden=64),
total_timesteps=0,
load_path='mountaincar_model.pkl'

View File

@@ -29,7 +29,7 @@ def main():
model.save('pong_model.pkl')
env.close()
if __name__ == '__main__':

View File

@@ -94,8 +94,8 @@ def cnn_to_mlp(convs, hiddens, dueling=False, layer_norm=False):
def build_q_func(network, hiddens=[256], dueling=True, layer_norm=False, **network_kwargs):
if isinstance(network, str):
from baselines.common.models import get_network_builder
network = get_network_builder(network)(**network_kwargs)
def q_func_builder(input_placeholder, num_actions, scope, reuse=False):
with tf.variable_scope(scope, reuse=reuse):
latent, _ = network(input_placeholder)
@@ -125,5 +125,5 @@ def build_q_func(network, hiddens=[256], dueling=True, layer_norm=False, **netwo
else:
q_out = action_scores
return q_out
return q_func_builder

View File

@@ -66,13 +66,13 @@ class Uint8Input(PlaceholderTfInput):
class ObservationInput(PlaceholderTfInput):
def __init__(self, observation_space, name=None):
"""Creates an input placeholder tailored to a specific observation space
Parameters
----------
observation_space:
observation space of the environment. Should be one of the gym.spaces types
name: str
tensorflow name of the underlying placeholder
"""
inpt, self.processed_inpt = observation_input(observation_space, name=name)
@@ -80,5 +80,5 @@ class ObservationInput(PlaceholderTfInput):
def get(self):
return self.processed_inpt

View File

@@ -41,7 +41,7 @@ def main(policy_file, seed, n_test_rollouts, render):
for name in ['T', 'gamma', 'noise_eps', 'random_eps']:
eval_params[name] = params[name]
evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
evaluator.seed(seed)

View File

@@ -37,12 +37,12 @@ def load_results(file):
def pad(xs, value=np.nan):
maxlen = np.max([len(x) for x in xs])
padded_xs = []
for x in xs:
if x.shape[0] >= maxlen:
padded_xs.append(x)
padding = np.ones((maxlen - x.shape[0],) + x.shape[1:]) * value
x_padded = np.concatenate([x, padding], axis=0)
assert x_padded.shape[1:] == x.shape[1:]

View File

@@ -23,17 +23,17 @@ def train(num_timesteps, seed, model_path=None):
max_timesteps=num_timesteps,
timesteps_per_actorbatch=2048,
clip_param=0.2, entcoeff=0.0,
optim_epochs=10,
optim_stepsize=3e-4,
optim_batchsize=64,
gamma=0.99,
lam=0.95,
schedule='linear',
)
env.close()
if model_path:
U.save_state(model_path)
return pi
class RewScale(gym.RewardWrapper):
@@ -48,28 +48,28 @@ def main():
parser = mujoco_arg_parser()
parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'humanoid_policy'))
parser.set_defaults(num_timesteps=int(2e7))
args = parser.parse_args()
if not args.play:
# train the model
train(num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path)
else:
# construct the model object, load pre-trained model and render
pi = train(num_timesteps=1, seed=args.seed)
U.load_state(args.model_path)
env = make_mujoco_env('Humanoid-v2', seed=0)
ob = env.reset()
while True:
action = pi.act(stochastic=False, ob=ob)[0]
ob, _, done, _ = env.step(action)
env.render()
if done:
ob = env.reset()
if __name__ == '__main__':
main()

View File

@@ -155,20 +155,20 @@ def learn(*, network, env, total_timesteps, seed=None, nsteps=2048, ent_coef=0.0
save_interval=0, load_path=None, **network_kwargs):
'''
Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)
Parameters:
----------
network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
See common/models.py/lstm for more details on using recurrent nets in policies
env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation.
The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.
nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
nenv is number of environment copies simulated in parallel)
@@ -176,38 +176,38 @@ def learn(*, network, env, total_timesteps, seed=None, nsteps=2048, ent_coef=0.0
ent_coef: float policy entropy coefficient in the optimization objective
lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
training and 0 is the end of the training.
vf_coef: float value function loss coefficient in the optimization objective
max_grad_norm: float or None gradient norm clipping coefficient
gamma: float discounting factor
lam: float advantage estimation discounting factor (lambda in the paper)
log_interval: int number of timesteps between logging events
nminibatches: int number of training minibatches per update. For recurrent policies,
should be smaller or equal than number of environments run in parallel.
noptepochs: int number of training epochs per update
cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
and 0 is the end of the training
save_interval: int number of timesteps between saving events
load_path: str path to load the model from
**network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
For instance, 'mlp' network architecture has arguments num_hidden and num_layers.
'''
set_global_seeds(seed)
if isinstance(lr, float): lr = constfn(lr)
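The lr and cliprange parameters documented above accept either a constant or a schedule over the remaining training fraction; a minimal sketch, with CartPole and the numeric values chosen only for illustration:

    # ppo2.learn with schedule functions for lr / cliprange (illustrative; not part of this diff)
    import gym
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
    from baselines.ppo2 import ppo2

    env = DummyVecEnv([lambda: gym.make('CartPole-v0')])
    model = ppo2.learn(network='mlp', env=env, total_timesteps=30000,
                       nsteps=128, nminibatches=4,
                       lr=lambda f: 3e-4 * f,          # f goes from 1 (start of training) to 0 (end), per the docstring
                       cliprange=lambda f: 0.2 * f)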

View File

@@ -84,4 +84,4 @@ def main():
plt.show()
if __name__ == '__main__':
main()

View File

@@ -120,7 +120,7 @@ def build_env(args):
env = bench.Monitor(env, logger.get_dir())
env = retro_wrappers.wrap_deepmind_retro(env)
else:
get_session(tf.ConfigProto(allow_soft_placement=True,
intra_op_parallelism_threads=1,
inter_op_parallelism_threads=1))
@@ -128,7 +128,7 @@ def build_env(args):
env = make_vec_env(env_id, env_type, args.num_env or 1, seed, reward_scale=args.reward_scale)
if env_type == 'mujoco':
env = VecNormalize(env)
return env

View File

@@ -4,7 +4,7 @@ from baselines.common.models import mlp, cnn_small
def atari():
return dict(
network = cnn_small(),
timesteps_per_batch=512,
max_kl=0.001,
cg_iters=10,
cg_damping=1e-3,
@@ -26,5 +26,5 @@ def mujoco():
lam=0.98,
vf_iters=5,
vf_stepsize=1e-3,
normalize_observations=True,
)

View File

@@ -83,13 +83,13 @@ def add_vtarg_and_adv(seg, gamma, lam):
seg["tdlamret"] = seg["adv"] + seg["vpred"]
def learn(*,
network,
env,
total_timesteps,
timesteps_per_batch=1024, # what to train on
max_kl=0.001,
cg_iters=10,
gamma=0.99,
lam=1.0, # advantage estimation
seed=None,
entcoeff=0.0,
@@ -103,7 +103,7 @@ def learn(*,
):
'''
learn a policy function with TRPO algorithm
Parameters:
----------
@@ -121,7 +121,7 @@ def learn(*,
cg_iters number of iterations of conjugate gradient algorithm
cg_damping conjugate gradient damping
vf_stepsize learning rate for adam optimizer used to optimize value function loss
@@ -130,11 +130,11 @@ def learn(*,
total_timesteps max number of timesteps
max_episodes max number of episodes
max_iters maximum number of policy optimization iterations
callback function to be called with (locals(), globals()) each policy optimization step
load_path str, path to load the model from (default: None, i.e. no model is loaded)
**network_kwargs keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
@@ -145,18 +145,18 @@ def learn(*,
learnt model
'''
nworkers = MPI.COMM_WORLD.Get_size()
rank = MPI.COMM_WORLD.Get_rank()
cpus_per_worker = 1
U.get_session(config=tf.ConfigProto(
allow_soft_placement=True,
inter_op_parallelism_threads=cpus_per_worker,
intra_op_parallelism_threads=cpus_per_worker
))
policy = build_policy(env, network, value_network='copy', **network_kwargs)
set_global_seeds(seed)
@@ -245,7 +245,7 @@ def learn(*,
U.initialize()
if load_path is not None:
pi.load(load_path)
th_init = get_flat()
MPI.COMM_WORLD.Bcast(th_init, root=0)
set_from_flat(th_init)
@@ -384,8 +384,8 @@ def get_trainable_variables(scope):
return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
def get_vf_trainable_variables(scope):
return [v for v in get_trainable_variables(scope) if 'vf' in v.name[len(scope):].split('/')]
def get_pi_trainable_variables(scope):
return [v for v in get_trainable_variables(scope) if 'pi' in v.name[len(scope):].split('/')]

View File

@@ -1,5 +1,5 @@
[flake8]
-select = F,E999
+select = F,E999,W291,W293
exclude =
.git,
__pycache__,
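The setup.cfg change above is the substance of this commit: W291 (trailing whitespace) and W293 (whitespace on blank line) are added to the flake8 select list so they are flagged from now on, and autopep8 was used to clean the existing violations, per the commit title. A hedged sketch of applying the same fix programmatically follows; the options dict mirrors the autopep8 CLI flags (on the command line, something like autopep8 --in-place --recursive --select=W291,W293 . should have a similar effect), and exact behavior may vary by autopep8 version:

    # Strip trailing whitespace and whitespace-only blank lines with autopep8 (illustrative)
    import autopep8

    dirty = "def f():   \n    \n    return 1  \n"
    clean = autopep8.fix_code(dirty, options={'select': ['W291', 'W293']})
    # expected: "def f():\n\n    return 1\n"  (trailing spaces and the whitespace-only line are cleaned)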

View File

@@ -8,7 +8,7 @@ if sys.version_info.major != 3:
extras = {
'test': [
'filelock',
'pytest'
]
}