tighten flake8, autopep8 to fix trailing whitespaces and blank lines with whitespaces (#87)
@@ -97,21 +97,21 @@ def learn(
    load_path=None,
    **network_kwargs):

    '''
    Main entrypoint for A2C algorithm. Train a policy with given network architecture on a given environment using a2c algorithm.

    Parameters:
    -----------

    network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
        specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
        tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
        neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
        See baselines.common/policies.py/lstm for more details on using recurrent nets in policies

    env: RL environment. Should implement interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py)

    seed: seed to make random number sequence in the algorithm reproducible. By default is None which means seed from system noise generator (not reproducible)

@@ -128,7 +128,7 @@ def learn(
    lr: float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4)

    lrschedule: schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and
        returns fraction of the learning rate (specified as lr) as output

    epsilon: float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5)

@@ -140,17 +140,17 @@ def learn(
    log_interval: int, specifies how frequently the logs are printed out (default: 100)

    **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
        For instance, 'mlp' network architecture has arguments num_hidden and num_layers.

    '''

    set_global_seeds(seed)

    nenvs = env.num_envs
    policy = build_policy(env, network, **network_kwargs)

    model = Model(policy=policy, env=env, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
        max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule)
    if load_path is not None:
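As an illustrative aside (not part of the diff itself), a minimal sketch of calling this documented entrypoint on a vectorized CartPole environment follows; the environment id, timestep budget, and hyperparameter values are placeholders chosen only for illustration.

    # Hedged sketch: train A2C on a single CartPole copy wrapped in DummyVecEnv.
    import gym
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
    from baselines.a2c import a2c

    venv = DummyVecEnv([lambda: gym.make('CartPole-v0')])   # vectorized env, as the docstring requires
    model = a2c.learn(
        network='mlp',            # one of the documented string architectures
        env=venv,
        seed=0,
        nsteps=5,
        total_timesteps=20000,    # illustrative, far below the 80M default
        lr=7e-4,
        lrschedule='linear',      # documented schedule name
        log_interval=100,
    )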
@@ -9,7 +9,7 @@ class Runner(AbstractEnvRunner):
        self.gamma = gamma
        self.batch_action_shape = [x if x is not None else -1 for x in model.train_model.action.shape.as_list()]
        self.ob_dtype = model.train_model.X.dtype.as_numpy_dtype

    def run(self):
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
        mb_states = self.states

@@ -51,7 +51,7 @@ class Runner(AbstractEnvRunner):
                rewards = discount_with_dones(rewards, dones, self.gamma)

            mb_rewards[n] = rewards

        mb_actions = mb_actions.reshape(self.batch_action_shape)

        mb_rewards = mb_rewards.flatten()
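For context on the return computation used in this runner, the sketch below mirrors what a discount-with-dones helper computes (a NumPy-free re-statement for illustration, not the library source): rewards are discounted backwards in time and the running return is reset whenever an episode terminates.

    # Hedged sketch of discounting rewards with done flags.
    def discount_with_dones(rewards, dones, gamma):
        discounted, r = [], 0.0
        for reward, done in zip(rewards[::-1], dones[::-1]):
            r = reward + gamma * r * (1.0 - done)   # done=True cuts the bootstrap
            discounted.append(r)
        return discounted[::-1]

    print(discount_with_dones([1, 1, 1], [False, True, False], gamma=0.9))
    # -> [1.9, 1.0, 1.0]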
@@ -70,7 +70,7 @@ class Model(object):
        MU = tf.placeholder(tf.float32, [nbatch, nact]) # mu's
        LR = tf.placeholder(tf.float32, [])
        eps = 1e-6

        step_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs,) + ob_space.shape[:-1] + (ob_space.shape[-1] * nstack,))
        train_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs*(nsteps+1),) + ob_space.shape[:-1] + (ob_space.shape[-1] * nstack,))
        with tf.variable_scope('acer_model', reuse=tf.AUTO_REUSE):

@@ -78,7 +78,7 @@ class Model(object):
            step_model = policy(observ_placeholder=step_ob_placeholder, sess=sess)
            train_model = policy(observ_placeholder=train_ob_placeholder, sess=sess)

        params = find_trainable_variables("acer_model")
        print("Params {}".format(len(params)))
        for var in params:

@@ -97,10 +97,10 @@ class Model(object):
                polyak_model = policy(observ_placeholder=train_ob_placeholder, sess=sess)

        # Notation: (var) = batch variable, (var)s = sequence variable, (var)_i = variable indexed by action at step i

        # action probability distributions according to train_model, polyak_model and step_model
        # policy.pi is probability distribution parameters; to obtain distribution that sums to 1 need to take softmax
        train_model_p = tf.nn.softmax(train_model.pi)
        polyak_model_p = tf.nn.softmax(polyak_model.pi)
        step_model_p = tf.nn.softmax(step_model.pi)
        v = tf.reduce_sum(train_model_p * train_model.q, axis = -1) # shape is [nenvs * (nsteps + 1)]

@@ -119,7 +119,7 @@ class Model(object):
        qret = q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma)

        # Calculate losses
        # Entropy
        # entropy = tf.reduce_mean(strip(train_model.pd.entropy(), nenvs, nsteps))
        entropy = tf.reduce_mean(cat_entropy_softmax(f))

@@ -212,8 +212,8 @@ class Model(object):
        def _step(observation, **kwargs):
            return step_model._evaluate([step_model.action, step_model_p, step_model.state], observation, **kwargs)

        self.train = train
        self.save = functools.partial(save_variables, sess=sess, variables=params)
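As an illustrative aside, the probability and entropy terms above amount to a softmax over the policy logits followed by the categorical entropy -sum(p * log p); the NumPy sketch below is a stand-in for the TensorFlow ops (the logit values are arbitrary, and the 1e-6 constant mirrors the eps used in this hunk).

    # Hedged sketch: softmax of logits and per-row categorical entropy.
    import numpy as np

    def softmax(logits):
        z = logits - logits.max(axis=-1, keepdims=True)   # subtract max for numerical stability
        e = np.exp(z)
        return e / e.sum(axis=-1, keepdims=True)

    logits = np.array([[2.0, 1.0, 0.1]])
    p = softmax(logits)
    entropy = -(p * np.log(p + 1e-6)).sum(axis=-1)
    print(p.round(3), entropy.round(3))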
@@ -283,18 +283,18 @@ def learn(network, env, seed=None, nsteps=20, nstack=4, total_timesteps=int(80e6
    ----------

    network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
        specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
        tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
        neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
        See baselines.common/policies.py/lstm for more details on using recurrent nets in policies

    env: environment. Needs to be vectorized for parallel environment simulation.
        The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.

    nsteps: int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
        nenv is number of environment copies simulated in parallel) (default: 20)

    nstack: int, size of the frame stack, i.e. number of the frames passed to the step model. Frames are stacked along channel dimension
        (last image dimension) (default: 4)

    total_timesteps: int, number of timesteps (i.e. number of actions taken in the environment) (default: 80M)

@@ -303,11 +303,11 @@ def learn(network, env, seed=None, nsteps=20, nstack=4, total_timesteps=int(80e6
    ent_coef: float, policy entropy coefficient in the optimization objective (default: 0.01)

    max_grad_norm: float, gradient norm clipping coefficient. If set to None, no clipping. (default: 10)

    lr: float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4)

    lrschedule: schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and
        returns fraction of the learning rate (specified as lr) as output

    rprop_epsilon: float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5)

@@ -325,17 +325,17 @@ def learn(network, env, seed=None, nsteps=20, nstack=4, total_timesteps=int(80e6
    replay_start: int, the sampling from the replay buffer does not start until replay buffer has at least that many samples (default: 10k)

    c: float, importance weight clipping factor (default: 10)

    trust_region: bool, whether or not the algorithm estimates the gradient KL divergence between the old and updated policy and uses it to determine step size (default: True)

    delta: float, max KL divergence between the old policy and updated policy (default: 1)

    alpha: float, momentum factor in the Polyak (exponential moving average) averaging of the model parameters (default: 0.99)

    load_path: str, path to load the model from (default: None)

    **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
        For instance, 'mlp' network architecture has arguments num_hidden and num_layers.

    '''
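For the alpha parameter documented above, the sketch below spells out the Polyak (exponential moving average) update it controls; this is a NumPy illustration of the general technique, not the library's TensorFlow implementation.

    # Hedged sketch: exponential moving average of model parameters.
    import numpy as np

    def polyak_update(avg_params, new_params, alpha=0.99):
        # avg <- alpha * avg + (1 - alpha) * new, applied parameter-by-parameter
        return [alpha * a + (1.0 - alpha) * p for a, p in zip(avg_params, new_params)]

    avg = [np.zeros(3)]
    for step in range(5):
        current = [np.ones(3) * (step + 1)]
        avg = polyak_update(avg, current, alpha=0.9)
    print(avg[0])   # the average slowly tracks the current parameters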
@@ -1,2 +1,2 @@
from baselines.bench.benchmarks import *
from baselines.bench.monitor import *

@@ -102,7 +102,7 @@ def get_monitor_files(dir):
def load_results(dir):
    import pandas
    monitor_files = (
        glob(osp.join(dir, "*monitor.json")) +
        glob(osp.join(dir, "*monitor.csv"))) # get both csv and (old) json files
    if not monitor_files:
        raise LoadMonitorResultsError("no monitor files of the form *%s found in %s" % (Monitor.EXT, dir))
@@ -31,4 +31,4 @@ def cg(f_Ax, b, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10):
        if callback is not None:
            callback(x)
    if verbose: print(fmtstr % (i+1, rdotr, np.linalg.norm(x))) # pylint: disable=W0631
    return x
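As an aside, the signature in this hunk header takes a matrix-vector product callback rather than an explicit matrix; a minimal usage sketch follows, assuming the helper lives at baselines.common.cg and using a small symmetric positive-definite system chosen for illustration.

    # Hedged sketch: solve A x = b with the conjugate gradient helper.
    import numpy as np
    from baselines.common.cg import cg   # assumed module path

    A = np.array([[4.0, 1.0], [1.0, 3.0]])
    b = np.array([1.0, 2.0])
    x = cg(lambda v: A.dot(v), b, cg_iters=10, verbose=True)
    print(x, np.linalg.solve(A, b))   # the two solutions should agree closely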
@@ -29,7 +29,7 @@ def make_vec_env(env_id, env_type, num_env, seed, wrapper_kwargs=None, start_ind
    def _thunk():
        env = make_atari(env_id) if env_type == 'atari' else gym.make(env_id)
        env.seed(seed + 10000*mpi_rank + rank if seed is not None else None)
        env = Monitor(env,
                      logger.get_dir() and os.path.join(logger.get_dir(), str(mpi_rank) + '.' + str(rank)),
                      allow_early_resets=True)

@@ -2,7 +2,7 @@ from __future__ import print_function
from contextlib import contextmanager
import numpy as np
import time
import shlex
import subprocess

# ================================================================
@@ -2,15 +2,15 @@ import tensorflow as tf
from gym.spaces import Discrete, Box

def observation_placeholder(ob_space, batch_size=None, name='Ob'):
    '''
    Create placeholder to feed observations into of the size appropriate to the observation space

    Parameters:
    ----------

    ob_space: gym.Space observation space

    batch_size: int size of the batch to be fed into input. Can be left None in most cases.

    name: str name of the placeholder

@@ -27,9 +27,9 @@ def observation_placeholder(ob_space, batch_size=None, name='Ob'):

def observation_input(ob_space, batch_size=None, name='Ob'):
    '''
    Create placeholder to feed observations into of the size appropriate to the observation space, and add input
    encoder of the appropriate type.
    '''

    placeholder = observation_placeholder(ob_space, batch_size, name)

@@ -41,9 +41,9 @@ def encode_observation(ob_space, placeholder):

    Parameters:
    ----------

    ob_space: gym.Space observation space

    placeholder: tf.placeholder observation input placeholder
    '''
    if isinstance(ob_space, Discrete):
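A brief usage sketch of the two helpers documented above follows (TF1-style graph code, matching the rest of this file); the module path baselines.common.input and the CartPole environment are assumptions made for illustration.

    # Hedged sketch: build observation placeholders for a gym Box space.
    import gym
    import tensorflow as tf
    from baselines.common.input import observation_placeholder, observation_input  # assumed path

    env = gym.make('CartPole-v0')
    ob_ph = observation_placeholder(env.observation_space)        # placeholder of shape (None, 4)
    inpt, processed = observation_input(env.observation_space)    # placeholder plus encoded float tensor
    print(ob_ph.shape, processed.dtype)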
@@ -82,4 +82,4 @@ def test_discount_with_boundaries():
        2 + gamma * 3,
        3,
        4
    ])

@@ -76,4 +76,4 @@ def test_MpiAdam():
    for i in range(10):
        l,g = lossandgrad()
        adam.update(g, stepsize)
        print(i,l)
@@ -4,7 +4,7 @@ def mpi_fork(n, bind_to_core=False):
    """Re-launches the current script with workers
    Returns "parent" for original parent, "child" for MPI children
    """
    if n<=1:
        return "child"
    if os.getenv("IN_MPI") is None:
        env = os.environ.copy()

@@ -33,8 +33,8 @@ def mpi_moments(x, axis=0, comm=None, keepdims=False):

def test_runningmeanstd():
    import subprocess
    subprocess.check_call(['mpirun', '-np', '3',
        'python','-c',
        'from baselines.common.mpi_moments import _helper_runningmeanstd; _helper_runningmeanstd()'])

def _helper_runningmeanstd():
@@ -32,7 +32,7 @@ class PolicyWithValue(object):
        **tensors tensorflow tensors for additional attributes such as state or mask
        """

        self.X = observations
        self.state = tf.constant([])
        self.initial_state = None

@@ -85,7 +85,7 @@ class PolicyWithValue(object):
        -------
        (action, value estimate, next state, negative log likelihood of the action under current policy parameters) tuple
        """

        a, v, state, neglogp = self._evaluate([self.action, self.vf, self.state, self.neglogp], observation, **extra_feed)
        if state.size == 0:
            state = None

@@ -106,14 +106,14 @@ class PolicyWithValue(object):
        -------
        value estimate
        """
        return self._evaluate(self.vf, ob, *args, **kwargs)

    def save(self, save_path):
        tf_util.save_state(save_path, sess=self.sess)

    def load(self, load_path):
        tf_util.load_state(load_path, sess=self.sess)

def build_policy(env, policy_network, value_network=None, normalize_observations=False, estimate_q=False, **policy_kwargs):
    if isinstance(policy_network, str):
        network_type = policy_network

@@ -123,7 +123,7 @@ def build_policy(env, policy_network, value_network=None, normalize_observation
        ob_space = env.observation_space

        X = observ_placeholder if observ_placeholder is not None else observation_placeholder(ob_space, batch_size=nbatch)

        extra_tensors = {}

        if normalize_observations and X.dtype == tf.float32:

@@ -144,7 +144,7 @@ def build_policy(env, policy_network, value_network=None, normalize_observation
                    policy_latent, recurrent_tensors = policy_network(encoded_x, nenv)
                    extra_tensors.update(recurrent_tensors)

        _v_net = value_network

        if _v_net is None or _v_net == 'shared':

@@ -154,10 +154,10 @@ def build_policy(env, policy_network, value_network=None, normalize_observation
                _v_net = policy_network
            else:
                assert callable(_v_net)

            with tf.variable_scope('vf', reuse=tf.AUTO_REUSE):
                vf_latent, _ = _v_net(encoded_x)

        policy = PolicyWithValue(
            env=env,
            observations=X,

@@ -176,4 +176,4 @@ def _normalize_clip_observation(x, clip_range=[-5.0, 5.0]):
    rms = RunningMeanStd(shape=x.shape[1:])
    norm_x = tf.clip_by_value((x - rms.mean) / rms.std, min(clip_range), max(clip_range))
    return norm_x, rms
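As an illustrative aside, the sketch below builds a policy with build_policy and queries it via step(), which returns the (action, value, next state, neglogp) tuple documented above; the signature assumed for the returned policy_fn (nbatch, nsteps, sess) and the initialization call are assumptions for this sketch.

    # Hedged sketch: construct and query a PolicyWithValue for a vectorized env.
    import gym
    import tensorflow as tf
    from baselines.common.policies import build_policy
    from baselines.common.tf_util import get_session
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    venv = DummyVecEnv([lambda: gym.make('CartPole-v0')])
    sess = get_session()

    policy_fn = build_policy(venv, 'mlp', num_hidden=64, num_layers=2)
    pi = policy_fn(nbatch=1, nsteps=1, sess=sess)       # assumed policy_fn signature
    sess.run(tf.global_variables_initializer())

    obs = venv.reset()
    a, v, state, neglogp = pi.step(obs)                 # tuple per the docstring above
    print(a, v, neglogp)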
@@ -23,15 +23,15 @@ def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var,
    delta = batch_mean - mean
    tot_count = count + batch_count

    new_mean = mean + delta * batch_count / tot_count
    m_a = var * count
    m_b = batch_var * batch_count
    M2 = m_a + m_b + np.square(delta) * count * batch_count / (count + batch_count)
    new_var = M2 / (count + batch_count)
    new_count = batch_count + count

    return new_mean, new_var, new_count

class TfRunningMeanStd(object):
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm

@@ -46,10 +46,10 @@ class TfRunningMeanStd(object):
        self._new_var = tf.placeholder(shape=shape, dtype=tf.float64)
        self._new_count = tf.placeholder(shape=(), dtype=tf.float64)

        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
            self._mean = tf.get_variable('mean', initializer=np.zeros(shape, 'float64'), dtype=tf.float64)
            self._var = tf.get_variable('std', initializer=np.ones(shape, 'float64'), dtype=tf.float64)
            self._count = tf.get_variable('count', initializer=np.full((), epsilon, 'float64'), dtype=tf.float64)

        self.update_ops = tf.group([

@@ -61,10 +61,10 @@ class TfRunningMeanStd(object):
        sess.run(tf.variables_initializer([self._mean, self._var, self._count]))
        self.sess = sess
        self._set_mean_var_count()

    def _set_mean_var_count(self):
        self.mean, self.var, self.count = self.sess.run([self._mean, self._var, self._count])

    def update(self, x):
        batch_mean = np.mean(x, axis=0)
        batch_var = np.var(x, axis=0)

@@ -74,13 +74,13 @@ class TfRunningMeanStd(object):
        self.sess.run(self.update_ops, feed_dict={
            self._new_mean: new_mean,
            self._new_var: new_var,
            self._new_count: new_count
        })

        self._set_mean_var_count()

def test_runningmeanstd():
    for (x1, x2, x3) in [

@@ -145,7 +145,7 @@ def profile_tf_runningmeanstd():
    print('rms update time ({} trials): {} s'.format(n_trials, tic2 - tic1))
    print('tfrms update time ({} trials): {} s'.format(n_trials, tic3 - tic2))

    tic1 = time.time()
    for _ in range(n_trials):

@@ -161,21 +161,21 @@ def profile_tf_runningmeanstd():
    print('rms get mean time ({} trials): {} s'.format(n_trials, tic2 - tic1))
    print('tfrms get mean time ({} trials): {} s'.format(n_trials, tic3 - tic2))

    '''
    options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) #pylint: disable=E1101
    run_metadata = tf.RunMetadata()
    profile_opts = dict(options=options, run_metadata=run_metadata)

    from tensorflow.python.client import timeline
    fetched_timeline = timeline.Timeline(run_metadata.step_stats) #pylint: disable=E1101
    chrome_trace = fetched_timeline.generate_chrome_trace_format()
    outfile = '/tmp/timeline.json'
    with open(outfile, 'wt') as f:
        f.write(chrome_trace)
    print(f'Successfully saved profile to {outfile}. Exiting.')
    exit(0)

@@ -184,4 +184,4 @@ def profile_tf_runningmeanstd():

if __name__ == '__main__':
    profile_tf_runningmeanstd()
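The merge formula at the top of this file is the parallel-variance update referenced by the Wikipedia link in the hunk; the sketch below reproduces it from the lines shown here and checks it against direct NumPy statistics on the concatenated data (random data chosen only for illustration).

    # Hedged sketch: verify the running mean/var merge against numpy.
    import numpy as np

    def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var, batch_count):
        delta = batch_mean - mean
        tot_count = count + batch_count
        new_mean = mean + delta * batch_count / tot_count
        m_a = var * count
        m_b = batch_var * batch_count
        M2 = m_a + m_b + np.square(delta) * count * batch_count / tot_count
        new_var = M2 / tot_count
        return new_mean, new_var, tot_count

    x1, x2 = np.random.randn(100, 3), np.random.randn(50, 3) + 1.0
    mean, var, count = update_mean_var_count_from_moments(
        x1.mean(axis=0), x1.var(axis=0), len(x1),
        x2.mean(axis=0), x2.var(axis=0), len(x2))
    x = np.concatenate([x1, x2])
    assert np.allclose(mean, x.mean(axis=0)) and np.allclose(var, x.var(axis=0))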
@@ -40,5 +40,5 @@ class FixedSequenceEnv(Env):

    def _get_reward(self, actions):
        return 1 if actions == self.sequence[self.time] else 0

@@ -15,7 +15,7 @@ class MnistEnv(Env):
            no_images=None
    ):
        from tensorflow.examples.tutorials.mnist import input_data
        # we could use temporary directory for this with a context manager and
        # TemporaryDirectory, but then each test that uses mnist would re-download the data
        # this way the data is not cleaned up, but we only download it once per machine
        mnist_path = osp.join(tempfile.gettempdir(), 'MNIST_data')

@@ -33,7 +33,7 @@ class MnistEnv(Env):
        self.train_mode()
        self.reset()

    def reset(self):
        self._choose_next_state()
        self.time = 0
@@ -10,7 +10,7 @@ common_kwargs = dict(
    gamma=1.0,
    seed=0,
)

learn_kwargs = {
    'a2c' : dict(nsteps=32, value_network='copy', lr=0.05),
    'acktr': dict(nsteps=32, value_network='copy'),

@@ -31,8 +31,8 @@ def test_cartpole(alg):
    kwargs.update(learn_kwargs[alg])

    learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs)
    def env_fn():

        env = gym.make('CartPole-v0')
        env.seed(0)
        return env
@@ -8,7 +8,7 @@ except BaseException:

@pytest.mark.skipif(
    not _mujoco_present,
    reason='error loading mujoco - either mujoco / mujoco key not present, or LD_LIBRARY_PATH is not pointing to mujoco library'
)
def test_lstm_example():

@@ -37,12 +37,12 @@ def test_lstm_example():
        action, _, state, _ = policy.step(ob, S=state, M=done)
        ob, reward, done, _ = venv.step(action)
        step_counter += 1
        if done:
            break

    assert step_counter > 5
@@ -8,7 +8,7 @@ common_kwargs = dict(
    seed=0,
    total_timesteps=50000,
)

learn_kwargs = {
    'a2c': {},
    'ppo2': dict(nsteps=10, ent_coef=0.0, nminibatches=1),

@@ -36,7 +36,7 @@ def test_fixed_sequence(alg, rnn):
    episode_len = 5
    env_fn = lambda: FixedSequenceEnv(10, episode_len=episode_len)
    learn = lambda e: get_learn_function(alg)(
        env=e,
        network=rnn,
        **kwargs
    )

@@ -47,5 +47,5 @@ def test_fixed_sequence(alg, rnn):
if __name__ == '__main__':
    test_fixed_sequence('ppo2', 'lstm')
@@ -9,7 +9,7 @@ common_kwargs = dict(
    gamma=0.9,
    seed=0,
)

learn_kwargs = {
    'a2c' : {},
    'acktr': {},

@@ -51,5 +51,5 @@ def test_continuous_identity(alg):
    simple_test(env_fn, learn_fn, -0.1)

if __name__ == '__main__':
    test_continuous_identity('a2c')
@@ -6,7 +6,7 @@ from baselines.common.tests.util import simple_test
from baselines.run import get_learn_function

# TODO investigate a2c and ppo2 failures - is it due to bad hyperparameters for this problem?
# GitHub issue https://github.com/openai/baselines/issues/189
common_kwargs = {
    'seed': 0,

@@ -25,21 +25,21 @@ learn_args = {
    'trpo_mpi': dict(total_timesteps=80000, timesteps_per_batch=100, cg_iters=10, lam=1.0, max_kl=0.001)
}

#tests pass, but are too slow on travis. Same algorithms are covered
# by other tests with less compute-hungry nn's and by benchmarks
@pytest.mark.skip
@pytest.mark.slow
@pytest.mark.parametrize("alg", learn_args.keys())
def test_mnist(alg):
    '''
    Test if the algorithm can learn to classify MNIST digits.
    Uses CNN policy.
    '''

    learn_kwargs = learn_args[alg]
    learn_kwargs.update(common_kwargs)

    learn = get_learn_function(alg)
    learn_fn = lambda e: learn(env=e, **learn_kwargs)
    env_fn = lambda: MnistEnv(seed=0, episode_len=100)
@@ -14,15 +14,15 @@ from functools import partial

learn_kwargs = {
    'deepq': {},
    'a2c': {},
    'acktr': {},
    'ppo2': {'nminibatches': 1, 'nsteps': 10},
    'trpo_mpi': {},
}

network_kwargs = {
    'mlp': {},
    'cnn': {'pad': 'SAME'},
    'lstm': {},
    'cnn_lnlstm': {'pad': 'SAME'}
}

@@ -32,15 +32,15 @@ network_kwargs = {
@pytest.mark.parametrize("network_fn", network_kwargs.keys())
def test_serialization(learn_fn, network_fn):
    '''
    Test if the trained model can be serialized
    '''

    if network_fn.endswith('lstm') and learn_fn in ['acktr', 'trpo_mpi', 'deepq']:
        # TODO make acktr work with recurrent policies
        # and test
        # github issue: https://github.com/openai/baselines/issues/194
        return

    env = DummyVecEnv([lambda: MnistEnv(10, episode_len=100)])
    ob = env.reset().copy()

@@ -74,14 +74,14 @@ def test_serialization(learn_fn, network_fn):
    np.testing.assert_allclose(mean1, mean2, atol=0.5)
    np.testing.assert_allclose(std1, std2, atol=0.5)

def _serialize_variables():
    sess = get_session()
    variables = tf.trainable_variables()
    values = sess.run(variables)
    return {var.name: value for var, value in zip(variables, values)}

def _get_action_stats(model, ob):
    ntrials = 1000
@@ -30,7 +30,7 @@ def simple_test(env_fn, learn_fn, min_reward_fraction, n_trials=N_TRIALS):
            a, v, state, _ = model.step(obs, S=state, M=[False])
        else:
            a, v, _, _ = model.step(obs)

        obs, rew, done, _ = env.step(a)
        sum_rew += float(rew)

@@ -46,7 +46,7 @@ def reward_per_episode_test(env_fn, learn_fn, min_avg_reward, n_trials=N_EPISODE
    with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default():
        model = learn_fn(env)

        N_TRIALS = 100

        observations, actions, rewards = rollout(env, model, N_TRIALS)
        rewards = [sum(r) for r in rewards]
@@ -347,7 +347,7 @@ def load_variables(load_path, variables=None, sess=None):
    variables = variables or tf.trainable_variables()

    loaded_params = joblib.load(os.path.expanduser(load_path))
    restores = []
    if isinstance(loaded_params, list):
        assert len(loaded_params) == len(variables), 'number of variables loaded mismatches len(variables)'
        for d, v in zip(loaded_params, variables):
@@ -9,8 +9,8 @@ class DummyVecEnv(VecEnv):
        env = self.envs[0]
        VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space)
        obs_space = env.observation_space

        self.keys, shapes, dtypes = obs_space_info(obs_space)
        self.buf_obs = { k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys }
        self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool)
        self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32)

@@ -62,7 +62,7 @@ class DummyVecEnv(VecEnv):
    def get_images(self):
        return [env.render(mode='rgb_array') for env in self.envs]

    def render(self, mode='human'):
        if self.num_envs == 1:
            self.envs[0].render(mode=mode)
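For context, the docstrings elsewhere in this commit recommend wrapping single gym environments in DummyVecEnv; a minimal usage sketch follows, with CartPole chosen only for illustration.

    # Hedged sketch: vectorize one environment and step it.
    import gym
    import numpy as np
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    venv = DummyVecEnv([lambda: gym.make('CartPole-v0')])
    obs = venv.reset()                                  # shape (num_envs, ...) = (1, 4)
    actions = np.array([venv.action_space.sample()])    # one action per env
    obs, rews, dones, infos = venv.step(actions)
    print(obs.shape, rews, dones)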
@@ -71,7 +71,7 @@ class Memory(object):
    def append(self, obs0, action, reward, obs1, terminal1, training=True):
        if not training:
            return

        self.observations0.append(obs0)
        self.actions.append(action)
        self.rewards.append(reward)

@@ -35,12 +35,12 @@ class Actor(Model):
            if self.layer_norm:
                x = tc.layers.layer_norm(x, center=True, scale=True)
            x = tf.nn.relu(x)

            x = tf.layers.dense(x, 64)
            if self.layer_norm:
                x = tc.layers.layer_norm(x, center=True, scale=True)
            x = tf.nn.relu(x)

            x = tf.layers.dense(x, self.nb_actions, kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3))
            x = tf.nn.tanh(x)
        return x
@@ -176,7 +176,7 @@ def learn(env,
    load_path: str
        path to load the model from. (default: None)
    **network_kwargs
        additional keyword arguments to pass to the network builder.

    Returns
    -------

@@ -215,7 +215,7 @@ def learn(env,
    }

    act = ActWrapper(act, act_params)

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)

@@ -246,7 +246,7 @@ def learn(env,
        model_file = os.path.join(td, "model")
        model_saved = False

        if tf.train.latest_checkpoint(td) is not None:
            load_variables(model_file)
            logger.log('Loaded model from {}'.format(model_file))

@@ -254,7 +254,7 @@ def learn(env,
        elif load_path is not None:
            load_variables(load_path)
            logger.log('Loaded model from {}'.format(load_path))

        for t in range(total_timesteps):
            if callback is not None:
@@ -7,7 +7,7 @@ from baselines.common import models
def main():
    env = gym.make("MountainCar-v0")
    act = deepq.learn(
        env,
        network=models.mlp(num_layers=1, num_hidden=64),
        total_timesteps=0,
        load_path='mountaincar_model.pkl'

@@ -29,7 +29,7 @@ def main():
    model.save('pong_model.pkl')
    env.close()

if __name__ == '__main__':
@@ -94,8 +94,8 @@ def cnn_to_mlp(convs, hiddens, dueling=False, layer_norm=False):
def build_q_func(network, hiddens=[256], dueling=True, layer_norm=False, **network_kwargs):
    if isinstance(network, str):
        from baselines.common.models import get_network_builder
        network = get_network_builder(network)(**network_kwargs)

    def q_func_builder(input_placeholder, num_actions, scope, reuse=False):
        with tf.variable_scope(scope, reuse=reuse):
            latent, _ = network(input_placeholder)

@@ -125,5 +125,5 @@ def build_q_func(network, hiddens=[256], dueling=True, layer_norm=False, **netwo
        else:
            q_out = action_scores
        return q_out

    return q_func_builder
@@ -66,13 +66,13 @@ class Uint8Input(PlaceholderTfInput):
class ObservationInput(PlaceholderTfInput):
    def __init__(self, observation_space, name=None):
        """Creates an input placeholder tailored to a specific observation space

        Parameters
        ----------

        observation_space:
            observation space of the environment. Should be one of the gym.spaces types
        name: str
            tensorflow name of the underlying placeholder
        """
        inpt, self.processed_inpt = observation_input(observation_space, name=name)

@@ -80,5 +80,5 @@ class ObservationInput(PlaceholderTfInput):

    def get(self):
        return self.processed_inpt
@@ -41,7 +41,7 @@ def main(policy_file, seed, n_test_rollouts, render):
    for name in ['T', 'gamma', 'noise_eps', 'random_eps']:
        eval_params[name] = params[name]

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(seed)

@@ -37,12 +37,12 @@ def load_results(file):

def pad(xs, value=np.nan):
    maxlen = np.max([len(x) for x in xs])

    padded_xs = []
    for x in xs:
        if x.shape[0] >= maxlen:
            padded_xs.append(x)

        padding = np.ones((maxlen - x.shape[0],) + x.shape[1:]) * value
        x_padded = np.concatenate([x, padding], axis=0)
        assert x_padded.shape[1:] == x.shape[1:]
@@ -23,17 +23,17 @@ def train(num_timesteps, seed, model_path=None):
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=2048,
        clip_param=0.2, entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
    )
    env.close()
    if model_path:
        U.save_state(model_path)

    return pi

class RewScale(gym.RewardWrapper):

@@ -48,28 +48,28 @@ def main():
    parser = mujoco_arg_parser()
    parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'humanoid_policy'))
    parser.set_defaults(num_timesteps=int(2e7))

    args = parser.parse_args()

    if not args.play:
        # train the model
        train(num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path)
    else:
        # construct the model object, load pre-trained model and render
        pi = train(num_timesteps=1, seed=args.seed)
        U.load_state(args.model_path)
        env = make_mujoco_env('Humanoid-v2', seed=0)

        ob = env.reset()
        while True:
            action = pi.act(stochastic=False, ob=ob)[0]
            ob, _, done, _ = env.step(action)
            env.render()
            if done:
                ob = env.reset()

if __name__ == '__main__':
    main()
@@ -155,20 +155,20 @@ def learn(*, network, env, total_timesteps, seed=None, nsteps=2048, ent_coef=0.0
          save_interval=0, load_path=None, **network_kwargs):
    '''
    Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)

    Parameters:
    ----------

    network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
        specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
        tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
        neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
        See common/models.py/lstm for more details on using recurrent nets in policies

    env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation.
        The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.

    nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
        nenv is number of environment copies simulated in parallel)

@@ -176,38 +176,38 @@ def learn(*, network, env, total_timesteps, seed=None, nsteps=2048, ent_coef=0.0
    ent_coef: float policy entropy coefficient in the optimization objective

    lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
        training and 0 is the end of the training.

    vf_coef: float value function loss coefficient in the optimization objective

    max_grad_norm: float or None gradient norm clipping coefficient

    gamma: float discounting factor

    lam: float advantage estimation discounting factor (lambda in the paper)

    log_interval: int number of timesteps between logging events

    nminibatches: int number of training minibatches per update. For recurrent policies,
        should be smaller or equal than number of environments run in parallel.

    noptepochs: int number of training epochs per update

    cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
        and 0 is the end of the training

    save_interval: int number of timesteps between saving events

    load_path: str path to load the model from

    **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
        For instance, 'mlp' network architecture has arguments num_hidden and num_layers.

    '''

    set_global_seeds(seed)

    if isinstance(lr, float): lr = constfn(lr)
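As an illustrative aside, a minimal call to this keyword-only entrypoint might look like the sketch below; the environment, timestep budget, and hyperparameter values are placeholders for illustration only.

    # Hedged sketch: run PPO on a single vectorized CartPole environment.
    import gym
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
    from baselines.ppo2 import ppo2

    venv = DummyVecEnv([lambda: gym.make('CartPole-v0')])
    model = ppo2.learn(
        network='mlp',
        env=venv,
        total_timesteps=20000,   # illustrative budget
        nsteps=128,
        nminibatches=4,          # 128 steps * 1 env divides evenly into 4 minibatches
        lr=3e-4,
        cliprange=0.2,
        log_interval=1,
    )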
@@ -84,4 +84,4 @@ def main():
    plt.show()

if __name__ == '__main__':
    main()
@@ -120,7 +120,7 @@ def build_env(args):
            env = bench.Monitor(env, logger.get_dir())
            env = retro_wrappers.wrap_deepmind_retro(env)

    else:
        get_session(tf.ConfigProto(allow_soft_placement=True,
                                   intra_op_parallelism_threads=1,
                                   inter_op_parallelism_threads=1))

@@ -128,7 +128,7 @@ def build_env(args):
        env = make_vec_env(env_id, env_type, args.num_env or 1, seed, reward_scale=args.reward_scale)

        if env_type == 'mujoco':
            env = VecNormalize(env)

    return env
@@ -4,7 +4,7 @@ from baselines.common.models import mlp, cnn_small
def atari():
    return dict(
        network = cnn_small(),
        timesteps_per_batch=512,
        max_kl=0.001,
        cg_iters=10,
        cg_damping=1e-3,

@@ -26,5 +26,5 @@ def mujoco():
        lam=0.98,
        vf_iters=5,
        vf_stepsize=1e-3,
        normalize_observations=True,
    )
@@ -83,13 +83,13 @@ def add_vtarg_and_adv(seg, gamma, lam):
    seg["tdlamret"] = seg["adv"] + seg["vpred"]

def learn(*,
        network,
        env,
        total_timesteps,
        timesteps_per_batch=1024, # what to train on
        max_kl=0.001,
        cg_iters=10,
        gamma=0.99,
        lam=1.0, # advantage estimation
        seed=None,
        entcoeff=0.0,

@@ -103,7 +103,7 @@ def learn(*,
        ):
    '''
    learn a policy function with TRPO algorithm

    Parameters:
    ----------

@@ -121,7 +121,7 @@ def learn(*,
    cg_iters number of iterations of conjugate gradient algorithm

    cg_damping conjugate gradient damping

    vf_stepsize learning rate for adam optimizer used to optimize value function loss

@@ -130,11 +130,11 @@ def learn(*,
    total_timesteps max number of timesteps

    max_episodes max number of episodes

    max_iters maximum number of policy optimization iterations

    callback function to be called with (locals(), globals()) each policy optimization step

    load_path str, path to load the model from (default: None, i.e. no model is loaded)

    **network_kwargs keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network

@@ -145,18 +145,18 @@ def learn(*,
    learnt model

    '''

    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()

    cpus_per_worker = 1
    U.get_session(config=tf.ConfigProto(
        allow_soft_placement=True,
        inter_op_parallelism_threads=cpus_per_worker,
        intra_op_parallelism_threads=cpus_per_worker
    ))

    policy = build_policy(env, network, value_network='copy', **network_kwargs)
    set_global_seeds(seed)

@@ -245,7 +245,7 @@ def learn(*,
    U.initialize()
    if load_path is not None:
        pi.load(load_path)

    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)

@@ -384,8 +384,8 @@ def get_trainable_variables(scope):
    return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)

def get_vf_trainable_variables(scope):
    return [v for v in get_trainable_variables(scope) if 'vf' in v.name[len(scope):].split('/')]

def get_pi_trainable_variables(scope):
    return [v for v in get_trainable_variables(scope) if 'pi' in v.name[len(scope):].split('/')]
@@ -1,5 +1,5 @@
[flake8]
-select = F,E999
+select = F,E999,W291,W293
exclude =
    .git,
    __pycache__,