tighten flake8, autopep8 to fix trailing whitespaces and blank lines with whitespaces (#87)
@@ -97,21 +97,21 @@ def learn(
load_path=None,
**network_kwargs):

'''
Main entrypoint for A2C algorithm. Train a policy with given network architecture on a given environment using a2c algorithm.

Parameters:
-----------

network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
See baselines.common/policies.py/lstm for more details on using recurrent nets in policies

env: RL environment. Should implement interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py)

seed: seed to make random number sequence in the algorithm reproducible. By default is None which means seed from system noise generator (not reproducible)
@@ -128,7 +128,7 @@ def learn(
lr: float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4)

lrschedule: schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and
returns fraction of the learning rate (specified as lr) as output

epsilon: float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5)

@@ -140,17 +140,17 @@ def learn(
log_interval: int, specifies how frequently the logs are printed out (default: 100)

**network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
For instance, 'mlp' network architecture has arguments num_hidden and num_layers.

'''

set_global_seeds(seed)

nenvs = env.num_envs
policy = build_policy(env, network, **network_kwargs)

model = Model(policy=policy, env=env, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule)
if load_path is not None:
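The docstring above documents the refactored learn() entrypoint. As a quick illustration (not part of this commit, and assuming the post-refactor baselines API), it can be driven roughly like this:

    # hypothetical usage sketch -- illustrative only, not from the diff
    import gym
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
    from baselines.a2c import a2c

    # wrap a single gym environment so it satisfies the VecEnv interface
    env = DummyVecEnv([lambda: gym.make('CartPole-v0')])
    model = a2c.learn(network='mlp', env=env, total_timesteps=10000, seed=0)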
@@ -9,7 +9,7 @@ class Runner(AbstractEnvRunner):
self.gamma = gamma
self.batch_action_shape = [x if x is not None else -1 for x in model.train_model.action.shape.as_list()]
self.ob_dtype = model.train_model.X.dtype.as_numpy_dtype

def run(self):
mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
mb_states = self.states

@@ -51,7 +51,7 @@ class Runner(AbstractEnvRunner):
rewards = discount_with_dones(rewards, dones, self.gamma)

mb_rewards[n] = rewards

mb_actions = mb_actions.reshape(self.batch_action_shape)

mb_rewards = mb_rewards.flatten()
@@ -70,7 +70,7 @@ class Model(object):
MU = tf.placeholder(tf.float32, [nbatch, nact]) # mu's
LR = tf.placeholder(tf.float32, [])
eps = 1e-6

step_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs,) + ob_space.shape[:-1] + (ob_space.shape[-1] * nstack,))
train_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs*(nsteps+1),) + ob_space.shape[:-1] + (ob_space.shape[-1] * nstack,))
with tf.variable_scope('acer_model', reuse=tf.AUTO_REUSE):

@@ -78,7 +78,7 @@ class Model(object):
step_model = policy(observ_placeholder=step_ob_placeholder, sess=sess)
train_model = policy(observ_placeholder=train_ob_placeholder, sess=sess)

params = find_trainable_variables("acer_model")
print("Params {}".format(len(params)))
for var in params:

@@ -97,10 +97,10 @@ class Model(object):
polyak_model = policy(observ_placeholder=train_ob_placeholder, sess=sess)

# Notation: (var) = batch variable, (var)s = sequence variable, (var)_i = variable indexed by action at step i

# action probability distributions according to train_model, polyak_model and step_model
# policy.pi is probability distribution parameters; to obtain distribution that sums to 1 need to take softmax
train_model_p = tf.nn.softmax(train_model.pi)
polyak_model_p = tf.nn.softmax(polyak_model.pi)
step_model_p = tf.nn.softmax(step_model.pi)
v = tf.reduce_sum(train_model_p * train_model.q, axis = -1) # shape is [nenvs * (nsteps + 1)]

@@ -119,7 +119,7 @@ class Model(object):
qret = q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma)

# Calculate losses
# Entropy
# entropy = tf.reduce_mean(strip(train_model.pd.entropy(), nenvs, nsteps))
entropy = tf.reduce_mean(cat_entropy_softmax(f))

@@ -212,8 +212,8 @@ class Model(object):

def _step(observation, **kwargs):
return step_model._evaluate([step_model.action, step_model_p, step_model.state], observation, **kwargs)

self.train = train
self.save = functools.partial(save_variables, sess=sess, variables=params)
@@ -283,18 +283,18 @@ def learn(network, env, seed=None, nsteps=20, nstack=4, total_timesteps=int(80e6
----------

network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
See baselines.common/policies.py/lstm for more details on using recurrent nets in policies

env: environment. Needs to be vectorized for parallel environment simulation.
The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.

nsteps: int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
nenv is number of environment copies simulated in parallel) (default: 20)

nstack: int, size of the frame stack, i.e. number of the frames passed to the step model. Frames are stacked along channel dimension
(last image dimension) (default: 4)

total_timesteps: int, number of timesteps (i.e. number of actions taken in the environment) (default: 80M)

@@ -303,11 +303,11 @@ def learn(network, env, seed=None, nsteps=20, nstack=4, total_timesteps=int(80e6

ent_coef: float, policy entropy coefficient in the optimization objective (default: 0.01)

max_grad_norm: float, gradient norm clipping coefficient. If set to None, no clipping. (default: 10)

lr: float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4)

lrschedule: schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and
returns fraction of the learning rate (specified as lr) as output

rprop_epsilon: float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5)

@@ -325,17 +325,17 @@ def learn(network, env, seed=None, nsteps=20, nstack=4, total_timesteps=int(80e6
replay_start: int, the sampling from the replay buffer does not start until replay buffer has at least that many samples (default: 10k)

c: float, importance weight clipping factor (default: 10)

trust_region: bool, whether or not the algorithm estimates the gradient KL divergence between the old and updated policy and uses it to determine step size (default: True)

delta: float, max KL divergence between the old policy and updated policy (default: 1)

alpha: float, momentum factor in the Polyak (exponential moving average) averaging of the model parameters (default: 0.99)

load_path: str, path to load the model from (default: None)

**network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
For instance, 'mlp' network architecture has arguments num_hidden and num_layers.

'''
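The alpha parameter above controls Polyak averaging of the model parameters. A minimal numpy sketch of that kind of update (illustrative only, not code from this commit):

    # hypothetical sketch of the exponential-moving-average update driven by alpha
    import numpy as np

    def polyak_update(avg_params, train_params, alpha=0.99):
        # each averaged parameter tracks its training counterpart with momentum alpha
        return [alpha * a + (1.0 - alpha) * t for a, t in zip(avg_params, train_params)]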
@@ -1,2 +1,2 @@
from baselines.bench.benchmarks import *
from baselines.bench.monitor import *
@@ -102,7 +102,7 @@ def get_monitor_files(dir):
def load_results(dir):
import pandas
monitor_files = (
glob(osp.join(dir, "*monitor.json")) +
glob(osp.join(dir, "*monitor.csv"))) # get both csv and (old) json files
if not monitor_files:
raise LoadMonitorResultsError("no monitor files of the form *%s found in %s" % (Monitor.EXT, dir))
@@ -31,4 +31,4 @@ def cg(f_Ax, b, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10):
if callback is not None:
callback(x)
if verbose: print(fmtstr % (i+1, rdotr, np.linalg.norm(x))) # pylint: disable=W0631
return x
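For context, cg() solves A x = b for a symmetric positive-definite A supplied only through the matrix-vector product f_Ax. A hypothetical call (illustrative only, not part of the diff):

    # assumes the cg signature shown in the hunk header above
    import numpy as np
    from baselines.common.cg import cg

    A = np.array([[4.0, 1.0], [1.0, 3.0]])
    b = np.array([1.0, 2.0])
    x = cg(lambda v: A.dot(v), b, cg_iters=10)
    assert np.allclose(A.dot(x), b, atol=1e-6)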
@@ -29,7 +29,7 @@ def make_vec_env(env_id, env_type, num_env, seed, wrapper_kwargs=None, start_ind
def _thunk():
env = make_atari(env_id) if env_type == 'atari' else gym.make(env_id)
env.seed(seed + 10000*mpi_rank + rank if seed is not None else None)
env = Monitor(env,
logger.get_dir() and os.path.join(logger.get_dir(), str(mpi_rank) + '.' + str(rank)),
allow_early_resets=True)
@@ -2,7 +2,7 @@ from __future__ import print_function
from contextlib import contextmanager
import numpy as np
import time
import shlex
import subprocess

# ================================================================
@@ -2,15 +2,15 @@ import tensorflow as tf
from gym.spaces import Discrete, Box

def observation_placeholder(ob_space, batch_size=None, name='Ob'):
'''
Create placeholder to feed observations into of the size appropriate to the observation space

Parameters:
----------

ob_space: gym.Space observation space

batch_size: int size of the batch to be fed into input. Can be left None in most cases.

name: str name of the placeholder

@@ -27,9 +27,9 @@ def observation_placeholder(ob_space, batch_size=None, name='Ob'):

def observation_input(ob_space, batch_size=None, name='Ob'):
'''
Create placeholder to feed observations into of the size appropriate to the observation space, and add input
encoder of the appropriate type.
'''

placeholder = observation_placeholder(ob_space, batch_size, name)

@@ -41,9 +41,9 @@ def encode_observation(ob_space, placeholder):

Parameters:
----------

ob_space: gym.Space observation space

placeholder: tf.placeholder observation input placeholder
'''
if isinstance(ob_space, Discrete):
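A short usage sketch for the helper documented above (illustrative only; assumes a Box observation space and the signature shown in the hunk header):

    import numpy as np
    from gym.spaces import Box
    from baselines.common.input import observation_placeholder

    ob_space = Box(low=-1.0, high=1.0, shape=(4,), dtype=np.float32)
    ph = observation_placeholder(ob_space)  # tf placeholder with shape (None, 4)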
@@ -82,4 +82,4 @@ def test_discount_with_boundaries():
2 + gamma * 3,
3,
4
])
@@ -76,4 +76,4 @@ def test_MpiAdam():
for i in range(10):
l,g = lossandgrad()
adam.update(g, stepsize)
print(i,l)
@@ -4,7 +4,7 @@ def mpi_fork(n, bind_to_core=False):
"""Re-launches the current script with workers
Returns "parent" for original parent, "child" for MPI children
"""
if n<=1:
return "child"
if os.getenv("IN_MPI") is None:
env = os.environ.copy()
@@ -33,8 +33,8 @@ def mpi_moments(x, axis=0, comm=None, keepdims=False):

def test_runningmeanstd():
import subprocess
subprocess.check_call(['mpirun', '-np', '3',
'python','-c',
'from baselines.common.mpi_moments import _helper_runningmeanstd; _helper_runningmeanstd()'])

def _helper_runningmeanstd():
@@ -32,7 +32,7 @@ class PolicyWithValue(object):
**tensors tensorflow tensors for additional attributes such as state or mask

"""

self.X = observations
self.state = tf.constant([])
self.initial_state = None

@@ -85,7 +85,7 @@ class PolicyWithValue(object):
-------
(action, value estimate, next state, negative log likelihood of the action under current policy parameters) tuple
"""

a, v, state, neglogp = self._evaluate([self.action, self.vf, self.state, self.neglogp], observation, **extra_feed)
if state.size == 0:
state = None

@@ -106,14 +106,14 @@ class PolicyWithValue(object):
-------
value estimate
"""
return self._evaluate(self.vf, ob, *args, **kwargs)

def save(self, save_path):
tf_util.save_state(save_path, sess=self.sess)

def load(self, load_path):
tf_util.load_state(load_path, sess=self.sess)

def build_policy(env, policy_network, value_network=None, normalize_observations=False, estimate_q=False, **policy_kwargs):
if isinstance(policy_network, str):
network_type = policy_network

@@ -123,7 +123,7 @@ def build_policy(env, policy_network, value_network=None, normalize_observation
ob_space = env.observation_space

X = observ_placeholder if observ_placeholder is not None else observation_placeholder(ob_space, batch_size=nbatch)

extra_tensors = {}

if normalize_observations and X.dtype == tf.float32:

@@ -144,7 +144,7 @@ def build_policy(env, policy_network, value_network=None, normalize_observation
policy_latent, recurrent_tensors = policy_network(encoded_x, nenv)
extra_tensors.update(recurrent_tensors)

_v_net = value_network

if _v_net is None or _v_net == 'shared':

@@ -154,10 +154,10 @@ def build_policy(env, policy_network, value_network=None, normalize_observation
_v_net = policy_network
else:
assert callable(_v_net)

with tf.variable_scope('vf', reuse=tf.AUTO_REUSE):
vf_latent, _ = _v_net(encoded_x)

policy = PolicyWithValue(
env=env,
observations=X,

@@ -176,4 +176,4 @@ def _normalize_clip_observation(x, clip_range=[-5.0, 5.0]):
rms = RunningMeanStd(shape=x.shape[1:])
norm_x = tf.clip_by_value((x - rms.mean) / rms.std, min(clip_range), max(clip_range))
return norm_x, rms
@@ -23,15 +23,15 @@ def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var,
delta = batch_mean - mean
tot_count = count + batch_count

new_mean = mean + delta * batch_count / tot_count
m_a = var * count
m_b = batch_var * batch_count
M2 = m_a + m_b + np.square(delta) * count * batch_count / (count + batch_count)
new_var = M2 / (count + batch_count)
new_count = batch_count + count

return new_mean, new_var, new_count

class TfRunningMeanStd(object):
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
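The function above merges two sets of moments with the parallel variance algorithm linked in the comment. A quick sanity-check sketch (illustrative only, not part of this commit):

    # verifies the merged moments against a direct computation on the concatenated data
    import numpy as np
    from baselines.common.running_mean_std import update_mean_var_count_from_moments

    a, b = np.random.randn(100), np.random.randn(50) + 3.0
    mean, var, count = update_mean_var_count_from_moments(
        a.mean(), a.var(), len(a), b.mean(), b.var(), len(b))
    both = np.concatenate([a, b])
    assert np.allclose([mean, var], [both.mean(), both.var()]) and count == len(both)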
@@ -46,10 +46,10 @@ class TfRunningMeanStd(object):
self._new_var = tf.placeholder(shape=shape, dtype=tf.float64)
self._new_count = tf.placeholder(shape=(), dtype=tf.float64)

with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
self._mean = tf.get_variable('mean', initializer=np.zeros(shape, 'float64'), dtype=tf.float64)
self._var = tf.get_variable('std', initializer=np.ones(shape, 'float64'), dtype=tf.float64)
self._count = tf.get_variable('count', initializer=np.full((), epsilon, 'float64'), dtype=tf.float64)

self.update_ops = tf.group([

@@ -61,10 +61,10 @@ class TfRunningMeanStd(object):
sess.run(tf.variables_initializer([self._mean, self._var, self._count]))
self.sess = sess
self._set_mean_var_count()

def _set_mean_var_count(self):
self.mean, self.var, self.count = self.sess.run([self._mean, self._var, self._count])

def update(self, x):
batch_mean = np.mean(x, axis=0)
batch_var = np.var(x, axis=0)

@@ -74,13 +74,13 @@ class TfRunningMeanStd(object):

self.sess.run(self.update_ops, feed_dict={
self._new_mean: new_mean,
self._new_var: new_var,
self._new_count: new_count
})

self._set_mean_var_count()

def test_runningmeanstd():
for (x1, x2, x3) in [

@@ -145,7 +145,7 @@ def profile_tf_runningmeanstd():

print('rms update time ({} trials): {} s'.format(n_trials, tic2 - tic1))
print('tfrms update time ({} trials): {} s'.format(n_trials, tic3 - tic2))

tic1 = time.time()
for _ in range(n_trials):

@@ -161,21 +161,21 @@ def profile_tf_runningmeanstd():

print('rms get mean time ({} trials): {} s'.format(n_trials, tic2 - tic1))
print('tfrms get mean time ({} trials): {} s'.format(n_trials, tic3 - tic2))

'''
options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) #pylint: disable=E1101
run_metadata = tf.RunMetadata()
profile_opts = dict(options=options, run_metadata=run_metadata)

from tensorflow.python.client import timeline
fetched_timeline = timeline.Timeline(run_metadata.step_stats) #pylint: disable=E1101
chrome_trace = fetched_timeline.generate_chrome_trace_format()
outfile = '/tmp/timeline.json'
with open(outfile, 'wt') as f:
f.write(chrome_trace)
print(f'Successfully saved profile to {outfile}. Exiting.')
exit(0)

@@ -184,4 +184,4 @@ def profile_tf_runningmeanstd():

if __name__ == '__main__':
profile_tf_runningmeanstd()
@@ -40,5 +40,5 @@ class FixedSequenceEnv(Env):

def _get_reward(self, actions):
return 1 if actions == self.sequence[self.time] else 0

@@ -15,7 +15,7 @@ class MnistEnv(Env):
no_images=None
):
from tensorflow.examples.tutorials.mnist import input_data
# we could use temporary directory for this with a context manager and
# TemporaryDirectory, but then each test that uses mnist would re-download the data
# this way the data is not cleaned up, but we only download it once per machine
mnist_path = osp.join(tempfile.gettempdir(), 'MNIST_data')

@@ -33,7 +33,7 @@ class MnistEnv(Env):

self.train_mode()
self.reset()

def reset(self):
self._choose_next_state()
self.time = 0
@@ -10,7 +10,7 @@ common_kwargs = dict(
gamma=1.0,
seed=0,
)

learn_kwargs = {
'a2c' : dict(nsteps=32, value_network='copy', lr=0.05),
'acktr': dict(nsteps=32, value_network='copy'),

@@ -31,8 +31,8 @@ def test_cartpole(alg):
kwargs.update(learn_kwargs[alg])

learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs)
def env_fn():

env = gym.make('CartPole-v0')
env.seed(0)
return env
@@ -8,7 +8,7 @@ except BaseException:

@pytest.mark.skipif(
not _mujoco_present,
reason='error loading mujoco - either mujoco / mujoco key not present, or LD_LIBRARY_PATH is not pointing to mujoco library'
)
def test_lstm_example():

@@ -37,12 +37,12 @@ def test_lstm_example():
action, _, state, _ = policy.step(ob, S=state, M=done)
ob, reward, done, _ = venv.step(action)
step_counter += 1
if done:
break

assert step_counter > 5
@@ -8,7 +8,7 @@ common_kwargs = dict(
seed=0,
total_timesteps=50000,
)

learn_kwargs = {
'a2c': {},
'ppo2': dict(nsteps=10, ent_coef=0.0, nminibatches=1),

@@ -36,7 +36,7 @@ def test_fixed_sequence(alg, rnn):
episode_len = 5
env_fn = lambda: FixedSequenceEnv(10, episode_len=episode_len)
learn = lambda e: get_learn_function(alg)(
env=e,
network=rnn,
**kwargs
)

@@ -47,5 +47,5 @@ def test_fixed_sequence(alg, rnn):
if __name__ == '__main__':
test_fixed_sequence('ppo2', 'lstm')
@@ -9,7 +9,7 @@ common_kwargs = dict(
gamma=0.9,
seed=0,
)

learn_kwargs = {
'a2c' : {},
'acktr': {},

@@ -51,5 +51,5 @@ def test_continuous_identity(alg):
simple_test(env_fn, learn_fn, -0.1)

if __name__ == '__main__':
test_continuous_identity('a2c')
@@ -6,7 +6,7 @@ from baselines.common.tests.util import simple_test
from baselines.run import get_learn_function

# TODO investigate a2c and ppo2 failures - is it due to bad hyperparameters for this problem?
# GitHub issue https://github.com/openai/baselines/issues/189
common_kwargs = {
'seed': 0,

@@ -25,21 +25,21 @@ learn_args = {
'trpo_mpi': dict(total_timesteps=80000, timesteps_per_batch=100, cg_iters=10, lam=1.0, max_kl=0.001)
}

#tests pass, but are too slow on travis. Same algorithms are covered
# by other tests with less compute-hungry nn's and by benchmarks
@pytest.mark.skip
@pytest.mark.slow
@pytest.mark.parametrize("alg", learn_args.keys())
def test_mnist(alg):
'''
Test if the algorithm can learn to classify MNIST digits.
Uses CNN policy.
'''

learn_kwargs = learn_args[alg]
learn_kwargs.update(common_kwargs)

learn = get_learn_function(alg)
learn_fn = lambda e: learn(env=e, **learn_kwargs)
env_fn = lambda: MnistEnv(seed=0, episode_len=100)
@@ -14,15 +14,15 @@ from functools import partial

learn_kwargs = {
'deepq': {},
'a2c': {},
'acktr': {},
'ppo2': {'nminibatches': 1, 'nsteps': 10},
'trpo_mpi': {},
}

network_kwargs = {
'mlp': {},
'cnn': {'pad': 'SAME'},
'lstm': {},
'cnn_lnlstm': {'pad': 'SAME'}
}

@@ -32,15 +32,15 @@ network_kwargs = {
@pytest.mark.parametrize("network_fn", network_kwargs.keys())
def test_serialization(learn_fn, network_fn):
'''
Test if the trained model can be serialized
'''

if network_fn.endswith('lstm') and learn_fn in ['acktr', 'trpo_mpi', 'deepq']:
# TODO make acktr work with recurrent policies
# and test
# github issue: https://github.com/openai/baselines/issues/194
return

env = DummyVecEnv([lambda: MnistEnv(10, episode_len=100)])
ob = env.reset().copy()
@@ -74,14 +74,14 @@ def test_serialization(learn_fn, network_fn):
np.testing.assert_allclose(mean1, mean2, atol=0.5)
np.testing.assert_allclose(std1, std2, atol=0.5)

def _serialize_variables():
sess = get_session()
variables = tf.trainable_variables()
values = sess.run(variables)
return {var.name: value for var, value in zip(variables, values)}

def _get_action_stats(model, ob):
ntrials = 1000
@@ -30,7 +30,7 @@ def simple_test(env_fn, learn_fn, min_reward_fraction, n_trials=N_TRIALS):
a, v, state, _ = model.step(obs, S=state, M=[False])
else:
a, v, _, _ = model.step(obs)

obs, rew, done, _ = env.step(a)
sum_rew += float(rew)

@@ -46,7 +46,7 @@ def reward_per_episode_test(env_fn, learn_fn, min_avg_reward, n_trials=N_EPISODE
with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default():
model = learn_fn(env)

N_TRIALS = 100

observations, actions, rewards = rollout(env, model, N_TRIALS)
rewards = [sum(r) for r in rewards]
@@ -347,7 +347,7 @@ def load_variables(load_path, variables=None, sess=None):
variables = variables or tf.trainable_variables()

loaded_params = joblib.load(os.path.expanduser(load_path))
restores = []
if isinstance(loaded_params, list):
assert len(loaded_params) == len(variables), 'number of variables loaded mismatches len(variables)'
for d, v in zip(loaded_params, variables):
@@ -9,8 +9,8 @@ class DummyVecEnv(VecEnv):
env = self.envs[0]
VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space)
obs_space = env.observation_space

self.keys, shapes, dtypes = obs_space_info(obs_space)
self.buf_obs = { k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys }
self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool)
self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32)

@@ -62,7 +62,7 @@ class DummyVecEnv(VecEnv):

def get_images(self):
return [env.render(mode='rgb_array') for env in self.envs]

def render(self, mode='human'):
if self.num_envs == 1:
self.envs[0].render(mode=mode)
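For orientation, DummyVecEnv runs a list of environments sequentially behind the VecEnv interface; observations, rewards and dones come back batched with a leading num_envs dimension. A hypothetical usage sketch (not part of this commit):

    import gym
    import numpy as np
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    venv = DummyVecEnv([lambda: gym.make('CartPole-v0') for _ in range(2)])
    obs = venv.reset()                                      # batched observations, one row per env
    obs, rews, dones, infos = venv.step(np.array([0, 1]))   # one action per env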
@@ -71,7 +71,7 @@ class Memory(object):
def append(self, obs0, action, reward, obs1, terminal1, training=True):
if not training:
return

self.observations0.append(obs0)
self.actions.append(action)
self.rewards.append(reward)
@@ -35,12 +35,12 @@ class Actor(Model):
if self.layer_norm:
x = tc.layers.layer_norm(x, center=True, scale=True)
x = tf.nn.relu(x)

x = tf.layers.dense(x, 64)
if self.layer_norm:
x = tc.layers.layer_norm(x, center=True, scale=True)
x = tf.nn.relu(x)

x = tf.layers.dense(x, self.nb_actions, kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3))
x = tf.nn.tanh(x)
return x
@@ -176,7 +176,7 @@ def learn(env,
load_path: str
path to load the model from. (default: None)
**network_kwargs
additional keyword arguments to pass to the network builder.

Returns
-------

@@ -215,7 +215,7 @@ def learn(env,
}

act = ActWrapper(act, act_params)

# Create the replay buffer
if prioritized_replay:
replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)

@@ -246,7 +246,7 @@ def learn(env,

model_file = os.path.join(td, "model")
model_saved = False

if tf.train.latest_checkpoint(td) is not None:
load_variables(model_file)
logger.log('Loaded model from {}'.format(model_file))

@@ -254,7 +254,7 @@ def learn(env,
elif load_path is not None:
load_variables(load_path)
logger.log('Loaded model from {}'.format(load_path))

for t in range(total_timesteps):
if callback is not None:
@@ -7,7 +7,7 @@ from baselines.common import models
def main():
env = gym.make("MountainCar-v0")
act = deepq.learn(
env,
network=models.mlp(num_layers=1, num_hidden=64),
total_timesteps=0,
load_path='mountaincar_model.pkl'
@@ -29,7 +29,7 @@ def main():

model.save('pong_model.pkl')
env.close()

if __name__ == '__main__':
@@ -94,8 +94,8 @@ def cnn_to_mlp(convs, hiddens, dueling=False, layer_norm=False):
def build_q_func(network, hiddens=[256], dueling=True, layer_norm=False, **network_kwargs):
if isinstance(network, str):
from baselines.common.models import get_network_builder
network = get_network_builder(network)(**network_kwargs)

def q_func_builder(input_placeholder, num_actions, scope, reuse=False):
with tf.variable_scope(scope, reuse=reuse):
latent, _ = network(input_placeholder)

@@ -125,5 +125,5 @@ def build_q_func(network, hiddens=[256], dueling=True, layer_norm=False, **netwo
else:
q_out = action_scores
return q_out

return q_func_builder
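build_q_func above returns a builder that is later called with an input placeholder, the number of actions and a variable scope. A hypothetical sketch of that flow (illustrative only, not part of the diff):

    # assumes the build_q_func signature shown in the hunk header above
    from baselines.deepq.models import build_q_func

    q_func = build_q_func('mlp', hiddens=[256], dueling=True, num_layers=2)
    # later, inside the training graph, something along the lines of:
    # q_values = q_func(observation_placeholder, num_actions, scope='q_func')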
@@ -66,13 +66,13 @@ class Uint8Input(PlaceholderTfInput):
class ObservationInput(PlaceholderTfInput):
def __init__(self, observation_space, name=None):
"""Creates an input placeholder tailored to a specific observation space

Parameters
----------

observation_space:
observation space of the environment. Should be one of the gym.spaces types
name: str
tensorflow name of the underlying placeholder
"""
inpt, self.processed_inpt = observation_input(observation_space, name=name)

@@ -80,5 +80,5 @@ class ObservationInput(PlaceholderTfInput):

def get(self):
return self.processed_inpt
@@ -41,7 +41,7 @@ def main(policy_file, seed, n_test_rollouts, render):

for name in ['T', 'gamma', 'noise_eps', 'random_eps']:
eval_params[name] = params[name]

evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
evaluator.seed(seed)
@@ -37,12 +37,12 @@ def load_results(file):

def pad(xs, value=np.nan):
maxlen = np.max([len(x) for x in xs])

padded_xs = []
for x in xs:
if x.shape[0] >= maxlen:
padded_xs.append(x)

padding = np.ones((maxlen - x.shape[0],) + x.shape[1:]) * value
x_padded = np.concatenate([x, padding], axis=0)
assert x_padded.shape[1:] == x.shape[1:]
@@ -23,17 +23,17 @@ def train(num_timesteps, seed, model_path=None):
max_timesteps=num_timesteps,
timesteps_per_actorbatch=2048,
clip_param=0.2, entcoeff=0.0,
optim_epochs=10,
optim_stepsize=3e-4,
optim_batchsize=64,
gamma=0.99,
lam=0.95,
schedule='linear',
)
env.close()
if model_path:
U.save_state(model_path)

return pi

class RewScale(gym.RewardWrapper):

@@ -48,28 +48,28 @@ def main():
parser = mujoco_arg_parser()
parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'humanoid_policy'))
parser.set_defaults(num_timesteps=int(2e7))

args = parser.parse_args()

if not args.play:
# train the model
train(num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path)
else:
# construct the model object, load pre-trained model and render
pi = train(num_timesteps=1, seed=args.seed)
U.load_state(args.model_path)
env = make_mujoco_env('Humanoid-v2', seed=0)

ob = env.reset()
while True:
action = pi.act(stochastic=False, ob=ob)[0]
ob, _, done, _ = env.step(action)
env.render()
if done:
ob = env.reset()

if __name__ == '__main__':
main()
@@ -155,20 +155,20 @@ def learn(*, network, env, total_timesteps, seed=None, nsteps=2048, ent_coef=0.0
          save_interval=0, load_path=None, **network_kwargs):
    '''
    Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)

    Parameters:
    ----------

    network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
        specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
        tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
        neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
        See common/models.py/lstm for more details on using recurrent nets in policies

    env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation.
        The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.

    nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
        nenv is number of environment copies simulated in parallel)

@@ -176,38 +176,38 @@ def learn(*, network, env, total_timesteps, seed=None, nsteps=2048, ent_coef=0.0

    ent_coef: float policy entropy coefficient in the optimization objective

    lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
        training and 0 is the end of the training.

    vf_coef: float value function loss coefficient in the optimization objective

    max_grad_norm: float or None gradient norm clipping coefficient

    gamma: float discounting factor

    lam: float advantage estimation discounting factor (lambda in the paper)

    log_interval: int number of timesteps between logging events

    nminibatches: int number of training minibatches per update. For recurrent policies,
        should be smaller than or equal to the number of environments run in parallel.

    noptepochs: int number of training epochs per update

    cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
        and 0 is the end of the training

    save_interval: int number of timesteps between saving events

    load_path: str path to load the model from

    **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
        For instance, 'mlp' network architecture has arguments num_hidden and num_layers.

    '''

    set_global_seeds(seed)

    if isinstance(lr, float): lr = constfn(lr)
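Note: since the docstring above documents the full ppo2.learn signature, here is a minimal, hedged usage sketch. The environment id and hyperparameter values are arbitrary illustrations, and the env is wrapped in DummyVecEnv exactly as the docstring recommends for gym.make environments.

import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.ppo2 import ppo2

venv = DummyVecEnv([lambda: gym.make('CartPole-v1')])  # vectorize a single env
model = ppo2.learn(
    network='mlp',           # one of the standard architectures listed above
    env=venv,
    total_timesteps=10000,   # illustrative, far too short for a real run
    nsteps=128,
    lr=3e-4,
    cliprange=0.2,
)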
@@ -84,4 +84,4 @@ def main():
    plt.show()

if __name__ == '__main__':
    main()

@@ -120,7 +120,7 @@ def build_env(args):
        env = bench.Monitor(env, logger.get_dir())
        env = retro_wrappers.wrap_deepmind_retro(env)

    else:
        get_session(tf.ConfigProto(allow_soft_placement=True,
                                   intra_op_parallelism_threads=1,
                                   inter_op_parallelism_threads=1))
@@ -128,7 +128,7 @@ def build_env(args):
        env = make_vec_env(env_id, env_type, args.num_env or 1, seed, reward_scale=args.reward_scale)

        if env_type == 'mujoco':
            env = VecNormalize(env)

    return env

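Note: the hunk above shows run.py wrapping MuJoCo environments in VecNormalize after building the vectorized env. A hedged sketch of the same pattern done by hand (requires mujoco_py for this particular environment id; the import paths are the usual baselines ones but may differ across versions):

import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env.vec_normalize import VecNormalize

venv = DummyVecEnv([lambda: gym.make('HalfCheetah-v2')])  # single env, vectorized
venv = VecNormalize(venv)  # running normalization of observations and returns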
@@ -4,7 +4,7 @@ from baselines.common.models import mlp, cnn_small

def atari():
    return dict(
        network = cnn_small(),
        timesteps_per_batch=512,
        max_kl=0.001,
        cg_iters=10,
        cg_damping=1e-3,
@@ -26,5 +26,5 @@ def mujoco():
        lam=0.98,
        vf_iters=5,
        vf_stepsize=1e-3,
        normalize_observations=True,
    )

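Note: assuming these atari()/mujoco() functions are a per-environment defaults module for TRPO (the file name is not shown in this excerpt; in baselines it would be baselines/trpo_mpi/defaults.py), the returned dict is typically merged into the keyword arguments of learn(). A simplified, hedged stand-in for what run.py does:

from baselines.trpo_mpi.defaults import mujoco

cli_kwargs = dict(total_timesteps=int(1e6))   # values coming from the command line
learn_kwargs = {**mujoco(), **cli_kwargs}     # CLI values override the defaults
# trpo_mpi.learn(env=venv, **learn_kwargs)    # venv: a VecEnv built elsewhere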
@@ -83,13 +83,13 @@ def add_vtarg_and_adv(seg, gamma, lam):
    seg["tdlamret"] = seg["adv"] + seg["vpred"]

def learn(*,
        network,
        env,
        total_timesteps,
        timesteps_per_batch=1024, # what to train on
        max_kl=0.001,
        cg_iters=10,
        gamma=0.99,
        lam=1.0, # advantage estimation
        seed=None,
        entcoeff=0.0,
@@ -103,7 +103,7 @@ def learn(*,
        ):
    '''
    learn a policy function with TRPO algorithm

    Parameters:
    ----------

@@ -121,7 +121,7 @@ def learn(*,

    cg_iters                number of iterations of conjugate gradient algorithm

    cg_damping              conjugate gradient damping

    vf_stepsize             learning rate for adam optimizer used to optimize value function loss

@@ -130,11 +130,11 @@ def learn(*,
    total_timesteps         max number of timesteps

    max_episodes            max number of episodes

    max_iters               maximum number of policy optimization iterations

    callback                function to be called with (locals(), globals()) each policy optimization step

    load_path               str, path to load the model from (default: None, i.e. no model is loaded)

    **network_kwargs        keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
@@ -145,18 +145,18 @@ def learn(*,
    learnt model

    '''

    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()

    cpus_per_worker = 1
    U.get_session(config=tf.ConfigProto(
        allow_soft_placement=True,
        inter_op_parallelism_threads=cpus_per_worker,
        intra_op_parallelism_threads=cpus_per_worker
    ))

    policy = build_policy(env, network, value_network='copy', **network_kwargs)
    set_global_seeds(seed)
@@ -245,7 +245,7 @@ def learn(*,
    U.initialize()
    if load_path is not None:
        pi.load(load_path)

    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
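Note: the Bcast/set_from_flat pair above is how TRPO keeps every MPI worker starting from identical parameters: rank 0's flattened parameter vector is broadcast in place and then loaded into each worker's policy. A minimal, standalone sketch of that synchronization step using mpi4py directly (not baselines' helpers):

import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
params = np.random.randn(10)   # each rank starts with different values
comm.Bcast(params, root=0)     # in place: every rank now holds rank 0's vector
print(comm.Get_rank(), params[:3])
# run with e.g.: mpirun -np 4 python sync_params_sketch.py  (hypothetical file name)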
@@ -384,8 +384,8 @@ def get_trainable_variables(scope):
    return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)

def get_vf_trainable_variables(scope):
    return [v for v in get_trainable_variables(scope) if 'vf' in v.name[len(scope):].split('/')]

def get_pi_trainable_variables(scope):
    return [v for v in get_trainable_variables(scope) if 'pi' in v.name[len(scope):].split('/')]

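Note: the two helpers above split a scope's trainable variables into value-function and policy subsets purely by looking at the '/'-separated components of each variable name after the scope prefix. A quick illustration of that string test on made-up variable names (no TensorFlow needed):

scope = 'model'
names = ['model/vf/fc0/kernel:0', 'model/pi/fc0/kernel:0', 'model/pivot/w:0']

vf_names = [n for n in names if 'vf' in n[len(scope):].split('/')]
pi_names = [n for n in names if 'pi' in n[len(scope):].split('/')]
print(vf_names)  # ['model/vf/fc0/kernel:0']
print(pi_names)  # ['model/pi/fc0/kernel:0'] -- 'model/pivot/...' is excluded because 'pivot' != 'pi'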
@@ -1,5 +1,5 @@
[flake8]
-select = F,E999
+select = F,E999,W291,W293
exclude =
    .git,
    __pycache__,
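Note: the two codes added to select are pycodestyle warnings: W291 (trailing whitespace after code) and W293 (whitespace on an otherwise blank line), which is exactly the whitespace autopep8 strips throughout this commit. A small hedged sketch of the same cleanup done programmatically (assuming autopep8 is installed; the source string is made up):

import autopep8

dirty = "def f():\n    x = 1  \n    \n    return x\n"   # trailing spaces + whitespace-only line
clean = autopep8.fix_code(dirty, options={'select': ['W291', 'W293']})
print(clean == "def f():\n    x = 1\n\n    return x\n")  # expected: True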