Lots of cleanups
Fixes for new gym version. Add @olegklimov and @unixpickle to authors list.
@@ -26,7 +26,7 @@ pip install -e .
 To cite this repository in publications:
 
     @misc{baselines,
-      author = {Dhariwal, Prafulla and Hesse, Christopher and Plappert, Matthias and Radford, Alec and Schulman, John and Sidor, Szymon and Wu, Yuhuai},
+      author = {Dhariwal, Prafulla and Hesse, Christopher and Klimov, Oleg and Nichol, Alex and Plappert, Matthias and Radford, Alec and Schulman, John and Sidor, Szymon and Wu, Yuhuai},
      title = {OpenAI Baselines},
      year = {2017},
      publisher = {GitHub},
@@ -1,3 +1,4 @@
+import os
 import os.path as osp
 import gym
 import time
@@ -10,22 +11,19 @@ from baselines import logger
 from baselines.common import set_global_seeds, explained_variance
 from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
 from baselines.common.atari_wrappers import wrap_deepmind
+from baselines.common import tf_util
 
 from baselines.a2c.utils import discount_with_dones
 from baselines.a2c.utils import Scheduler, make_path, find_trainable_variables
-from baselines.a2c.policies import CnnPolicy
 from baselines.a2c.utils import cat_entropy, mse
 
 class Model(object):
 
-    def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs,
+    def __init__(self, policy, ob_space, ac_space, nenvs, nsteps,
             ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
             alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'):
-        config = tf.ConfigProto(allow_soft_placement=True,
-                                intra_op_parallelism_threads=num_procs,
-                                inter_op_parallelism_threads=num_procs)
-        config.gpu_options.allow_growth = True
-        sess = tf.Session(config=config)
+        sess = tf_util.make_session()
         nact = ac_space.n
         nbatch = nenvs*nsteps
 
@@ -34,8 +32,8 @@ class Model(object):
         R = tf.placeholder(tf.float32, [nbatch])
         LR = tf.placeholder(tf.float32, [])
 
-        step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
-        train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)
+        step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
+        train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True)
 
         neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
         pg_loss = tf.reduce_mean(ADV * neglogpac)
@@ -58,7 +56,7 @@ class Model(object):
             for step in range(len(obs)):
                 cur_lr = lr.value()
             td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr}
-            if states != []:
+            if states is not None:
                 td_map[train_model.S] = states
                 td_map[train_model.M] = masks
             policy_loss, value_loss, policy_entropy, _ = sess.run(
@@ -91,32 +89,25 @@ class Model(object):
 
 class Runner(object):
 
-    def __init__(self, env, model, nsteps=5, nstack=4, gamma=0.99):
+    def __init__(self, env, model, nsteps=5, gamma=0.99):
         self.env = env
         self.model = model
         nh, nw, nc = env.observation_space.shape
         nenv = env.num_envs
-        self.batch_ob_shape = (nenv*nsteps, nh, nw, nc*nstack)
-        self.obs = np.zeros((nenv, nh, nw, nc*nstack), dtype=np.uint8)
+        self.batch_ob_shape = (nenv*nsteps, nh, nw, nc)
+        self.obs = np.zeros((nenv, nh, nw, nc), dtype=np.uint8)
         self.nc = nc
         obs = env.reset()
-        self.update_obs(obs)
         self.gamma = gamma
         self.nsteps = nsteps
         self.states = model.initial_state
         self.dones = [False for _ in range(nenv)]
 
-    def update_obs(self, obs):
-        # Do frame-stacking here instead of the FrameStack wrapper to reduce
-        # IPC overhead
-        self.obs = np.roll(self.obs, shift=-self.nc, axis=3)
-        self.obs[:, :, :, -self.nc:] = obs
 
     def run(self):
         mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
         mb_states = self.states
         for n in range(self.nsteps):
-            actions, values, states = self.model.step(self.obs, self.states, self.dones)
+            actions, values, states, _ = self.model.step(self.obs, self.states, self.dones)
             mb_obs.append(np.copy(self.obs))
             mb_actions.append(actions)
             mb_values.append(values)
@@ -127,7 +118,7 @@ class Runner(object):
             for n, done in enumerate(dones):
                 if done:
                     self.obs[n] = self.obs[n]*0
-            self.update_obs(obs)
+            self.obs = obs
             mb_rewards.append(rewards)
             mb_dones.append(self.dones)
         #batch of steps to batch of rollouts
@@ -154,17 +145,16 @@ class Runner(object):
         mb_masks = mb_masks.flatten()
         return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
 
-def learn(policy, env, seed, nsteps=5, nstack=4, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100):
+def learn(policy, env, seed, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100):
     tf.reset_default_graph()
     set_global_seeds(seed)
 
     nenvs = env.num_envs
     ob_space = env.observation_space
     ac_space = env.action_space
-    num_procs = len(env.remotes) # HACK
-    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, nstack=nstack, num_procs=num_procs, ent_coef=ent_coef, vf_coef=vf_coef,
+    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
         max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule)
-    runner = Runner(env, model, nsteps=nsteps, nstack=nstack, gamma=gamma)
+    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
 
     nbatch = nenvs*nsteps
     tstart = time.time()
@@ -183,6 +173,3 @@ def learn(policy, env, seed, nsteps=5, nstack=4, total_timesteps=int(80e6), vf_c
         logger.record_tabular("explained_variance", float(ev))
         logger.dump_tabular()
     env.close()
-
-if __name__ == '__main__':
-    main()
@@ -1,36 +1,48 @@
 import numpy as np
 import tensorflow as tf
-from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm, sample
+from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm
+from baselines.common.distributions import make_pdtype
 
+def nature_cnn(unscaled_images):
+    """
+    CNN from Nature paper.
+    """
+    scaled_images = tf.cast(unscaled_images, tf.float32) / 255.
+    activ = tf.nn.relu
+    h = activ(conv(scaled_images, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2)))
+    h2 = activ(conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2)))
+    h3 = activ(conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2)))
+    h3 = conv_to_fc(h3)
+    return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2)))
 
 class LnLstmPolicy(object):
-    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=256, reuse=False):
-        nbatch = nenv*nsteps
+    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
+        nenv = nbatch // nsteps
         nh, nw, nc = ob_space.shape
-        ob_shape = (nbatch, nh, nw, nc*nstack)
+        ob_shape = (nbatch, nh, nw, nc)
         nact = ac_space.n
         X = tf.placeholder(tf.uint8, ob_shape) #obs
         M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
         S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
         with tf.variable_scope("model", reuse=reuse):
-            h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
-            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
-            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
-            h3 = conv_to_fc(h3)
-            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
-            xs = batch_to_seq(h4, nenv, nsteps)
+            h = nature_cnn(X)
+            xs = batch_to_seq(h, nenv, nsteps)
             ms = batch_to_seq(M, nenv, nsteps)
             h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
             h5 = seq_to_batch(h5)
-            pi = fc(h5, 'pi', nact, act=lambda x:x)
-            vf = fc(h5, 'v', 1, act=lambda x:x)
+            pi = fc(h5, 'pi', nact)
+            vf = fc(h5, 'v', 1)
 
+        self.pdtype = make_pdtype(ac_space)
+        self.pd = self.pdtype.pdfromflat(pi)
 
         v0 = vf[:, 0]
-        a0 = sample(pi)
+        a0 = self.pd.sample()
+        neglogp0 = self.pd.neglogp(a0)
         self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
 
         def step(ob, state, mask):
-            a, v, s = sess.run([a0, v0, snew], {X:ob, S:state, M:mask})
-            return a, v, s
+            return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})
 
         def value(ob, state, mask):
             return sess.run(v0, {X:ob, S:state, M:mask})
@@ -45,34 +57,34 @@ class LnLstmPolicy(object):
 
 class LstmPolicy(object):
 
-    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=256, reuse=False):
-        nbatch = nenv*nsteps
+    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
+        nenv = nbatch // nsteps
 
         nh, nw, nc = ob_space.shape
-        ob_shape = (nbatch, nh, nw, nc*nstack)
+        ob_shape = (nbatch, nh, nw, nc)
         nact = ac_space.n
         X = tf.placeholder(tf.uint8, ob_shape) #obs
         M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
         S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
         with tf.variable_scope("model", reuse=reuse):
-            h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
-            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
-            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
-            h3 = conv_to_fc(h3)
-            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
-            xs = batch_to_seq(h4, nenv, nsteps)
+            h = nature_cnn(X)
+            xs = batch_to_seq(h, nenv, nsteps)
             ms = batch_to_seq(M, nenv, nsteps)
             h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
             h5 = seq_to_batch(h5)
-            pi = fc(h5, 'pi', nact, act=lambda x:x)
-            vf = fc(h5, 'v', 1, act=lambda x:x)
+            pi = fc(h5, 'pi', nact)
+            vf = fc(h5, 'v', 1)
 
+        self.pdtype = make_pdtype(ac_space)
+        self.pd = self.pdtype.pdfromflat(pi)
 
         v0 = vf[:, 0]
-        a0 = sample(pi)
+        a0 = self.pd.sample()
+        neglogp0 = self.pd.neglogp(a0)
         self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
 
         def step(ob, state, mask):
-            a, v, s = sess.run([a0, v0, snew], {X:ob, S:state, M:mask})
-            return a, v, s
+            return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})
 
         def value(ob, state, mask):
             return sess.run(v0, {X:ob, S:state, M:mask})
@@ -87,31 +99,67 @@ class LstmPolicy(object):
 
 class CnnPolicy(object):
 
-    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
-        nbatch = nenv*nsteps
+    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
         nh, nw, nc = ob_space.shape
-        ob_shape = (nbatch, nh, nw, nc*nstack)
+        ob_shape = (nbatch, nh, nw, nc)
         nact = ac_space.n
         X = tf.placeholder(tf.uint8, ob_shape) #obs
         with tf.variable_scope("model", reuse=reuse):
-            h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
-            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
-            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
-            h3 = conv_to_fc(h3)
-            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
-            pi = fc(h4, 'pi', nact, act=lambda x:x)
-            vf = fc(h4, 'v', 1, act=lambda x:x)
+            h = nature_cnn(X)
+            pi = fc(h, 'pi', nact, init_scale=0.01)
+            vf = fc(h, 'v', 1)[:,0]
 
-        v0 = vf[:, 0]
-        a0 = sample(pi)
-        self.initial_state = [] #not stateful
+        self.pdtype = make_pdtype(ac_space)
+        self.pd = self.pdtype.pdfromflat(pi)
+        a0 = self.pd.sample()
+        neglogp0 = self.pd.neglogp(a0)
+        self.initial_state = None
 
         def step(ob, *_args, **_kwargs):
-            a, v = sess.run([a0, v0], {X:ob})
-            return a, v, [] #dummy state
+            a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
+            return a, v, self.initial_state, neglogp
 
         def value(ob, *_args, **_kwargs):
-            return sess.run(v0, {X:ob})
+            return sess.run(vf, {X:ob})
 
+        self.X = X
+        self.pi = pi
+        self.vf = vf
+        self.step = step
+        self.value = value
 
+class MlpPolicy(object):
+    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
+        ob_shape = (nbatch,) + ob_space.shape
+        actdim = ac_space.shape[0]
+        X = tf.placeholder(tf.float32, ob_shape, name='Ob') #obs
+        with tf.variable_scope("model", reuse=reuse):
+            activ = tf.tanh
+            h1 = activ(fc(X, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
+            h2 = activ(fc(h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
+            pi = fc(h2, 'pi', actdim, init_scale=0.01)
+            h1 = activ(fc(X, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
+            h2 = activ(fc(h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
+            vf = fc(h2, 'vf', 1)[:,0]
+            logstd = tf.get_variable(name="logstd", shape=[1, actdim],
+                                     initializer=tf.zeros_initializer())
+
+        pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)
+
+        self.pdtype = make_pdtype(ac_space)
+        self.pd = self.pdtype.pdfromflat(pdparam)
+
+        a0 = self.pd.sample()
+        neglogp0 = self.pd.neglogp(a0)
+        self.initial_state = None
+
+        def step(ob, *_args, **_kwargs):
+            a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
+            return a, v, self.initial_state, neglogp
+
+        def value(ob, *_args, **_kwargs):
+            return sess.run(vf, {X:ob})
 
         self.X = X
         self.pi = pi
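For context (not part of the diff): with the policy rewrite above, every constructor takes nbatch in place of the old (nenv, nstack) pair, frame stacking is expected to come from the environment wrapper, and step() now also returns the negative log-probability of the sampled action. A minimal sketch of exercising that interface; the spaces, session handling and batch sizes below are illustrative assumptions, not code from this commit:

import numpy as np
import tensorflow as tf
from gym.spaces import Box, Discrete
from baselines.common import tf_util
from baselines.ppo2.policies import CnnPolicy  # new location, per the imports elsewhere in this diff

# Observation space already includes the 4 frames stacked by VecFrameStack.
ob_space = Box(low=0, high=255, shape=(84, 84, 4))
ac_space = Discrete(4)

sess = tf_util.make_session()
# nbatch=2 -> two environments, one step per call (the old nenv/nstack arguments are gone).
policy = CnnPolicy(sess, ob_space, ac_space, nbatch=2, nsteps=1, reuse=False)
sess.run(tf.global_variables_initializer())

obs = np.zeros((2, 84, 84, 4), dtype=np.uint8)
actions, values, states, neglogps = policy.step(obs)  # neglogp is the new fourth return value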
@@ -1,45 +1,30 @@
 #!/usr/bin/env python3
-import os, logging, gym
-from baselines import logger
-from baselines.common import set_global_seeds
-from baselines import bench
-from baselines.a2c.a2c import learn
-from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
-from baselines.common.atari_wrappers import make_atari, wrap_deepmind
-from baselines.a2c.policies import CnnPolicy, LstmPolicy, LnLstmPolicy
+from baselines import logger
+from baselines.common.cmd_util import make_atari_env, atari_arg_parser
+from baselines.common.vec_env.vec_frame_stack import VecFrameStack
+from baselines.a2c.a2c import learn
+from baselines.ppo2.policies import CnnPolicy, LstmPolicy, LnLstmPolicy
 
-def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu):
-    def make_env(rank):
-        def _thunk():
-            env = make_atari(env_id)
-            env.seed(seed + rank)
-            env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
-            gym.logger.setLevel(logging.WARN)
-            return wrap_deepmind(env)
-        return _thunk
-    set_global_seeds(seed)
-    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
+def train(env_id, num_timesteps, seed, policy, lrschedule, num_env):
     if policy == 'cnn':
         policy_fn = CnnPolicy
     elif policy == 'lstm':
         policy_fn = LstmPolicy
     elif policy == 'lnlstm':
         policy_fn = LnLstmPolicy
+    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
     learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule)
     env.close()
 
 def main():
-    import argparse
-    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
-    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
+    parser = atari_arg_parser()
     parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm'], default='cnn')
     parser.add_argument('--lrschedule', help='Learning rate schedule', choices=['constant', 'linear'], default='constant')
-    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
     args = parser.parse_args()
     logger.configure()
     train(args.env, num_timesteps=args.num_timesteps, seed=args.seed,
-        policy=args.policy, lrschedule=args.lrschedule, num_cpu=16)
+        policy=args.policy, lrschedule=args.lrschedule, num_env=16)
 
 if __name__ == '__main__':
     main()
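The rewritten entry point above now builds the vectorized, frame-stacked environment in a single line and leaves the common flags to atari_arg_parser. A short usage sketch along the same lines; the environment id, seed and timestep budget are placeholders:

from baselines import logger
from baselines.common.cmd_util import make_atari_env
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
from baselines.a2c.a2c import learn
from baselines.ppo2.policies import CnnPolicy

logger.configure()
# 16 parallel emulators; frame stacking (4 frames) now lives in the VecFrameStack wrapper,
# not inside the A2C Runner as it did before this change.
env = VecFrameStack(make_atari_env('BreakoutNoFrameskip-v4', 16, 0), 4)
learn(CnnPolicy, env, seed=0, total_timesteps=int(1e6), lrschedule='constant')
env.close()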
@@ -39,23 +39,19 @@ def ortho_init(scale=1.0):
         return (scale * q[:shape[0], :shape[1]]).astype(np.float32)
     return _ortho_init
 
-def conv(x, scope, nf, rf, stride, pad='VALID', act=tf.nn.relu, init_scale=1.0):
+def conv(x, scope, *, nf, rf, stride, pad='VALID', init_scale=1.0):
     with tf.variable_scope(scope):
         nin = x.get_shape()[3].value
         w = tf.get_variable("w", [rf, rf, nin, nf], initializer=ortho_init(init_scale))
         b = tf.get_variable("b", [nf], initializer=tf.constant_initializer(0.0))
-        z = tf.nn.conv2d(x, w, strides=[1, stride, stride, 1], padding=pad)+b
-        h = act(z)
-        return h
+        return tf.nn.conv2d(x, w, strides=[1, stride, stride, 1], padding=pad)+b
 
-def fc(x, scope, nh, act=tf.nn.relu, init_scale=1.0):
+def fc(x, scope, nh, *, init_scale=1.0, init_bias=0.0):
     with tf.variable_scope(scope):
         nin = x.get_shape()[1].value
         w = tf.get_variable("w", [nin, nh], initializer=ortho_init(init_scale))
-        b = tf.get_variable("b", [nh], initializer=tf.constant_initializer(0.0))
-        z = tf.matmul(x, w)+b
-        h = act(z)
-        return h
+        b = tf.get_variable("b", [nh], initializer=tf.constant_initializer(init_bias))
+        return tf.matmul(x, w)+b
 
 def batch_to_seq(h, nbatch, nsteps, flat=False):
     if flat:
@@ -162,9 +158,34 @@ def constant(p):
 def linear(p):
     return 1-p
 
+def middle_drop(p):
+    eps = 0.75
+    if 1-p<eps:
+        return eps*0.1
+    return 1-p
+
+def double_linear_con(p):
+    p *= 2
+    eps = 0.125
+    if 1-p<eps:
+        return eps
+    return 1-p
+
+def double_middle_drop(p):
+    eps1 = 0.75
+    eps2 = 0.25
+    if 1-p<eps1:
+        if 1-p<eps2:
+            return eps2*0.5
+        return eps1*0.1
+    return 1-p
+
 schedules = {
     'linear':linear,
-    'constant':constant
+    'constant':constant,
+    'double_linear_con': double_linear_con,
+    'middle_drop': middle_drop,
+    'double_middle_drop': double_middle_drop
 }
 
 class Scheduler(object):
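The annealing curves registered in the schedules dict above are plain functions of the training progress p in [0, 1]. A quick sketch of how they behave, assuming they are importable from baselines.a2c.utils after this change:

from baselines.a2c.utils import linear, middle_drop, double_linear_con, double_middle_drop

for p in (0.0, 0.3, 0.6, 0.9):          # fraction of total_timesteps consumed so far
    print(p,
          linear(p),                    # 1.0, 0.7, 0.4, 0.1 -- plain linear decay
          middle_drop(p),               # drops to 0.75*0.1 = 0.075 once 1-p < 0.75
          double_linear_con(p),         # decays twice as fast, then holds at 0.125
          double_middle_drop(p))        # 1-p early, 0.075 mid-training, 0.125 late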
@@ -1,6 +1,7 @@
 import numpy as np
 import tensorflow as tf
-from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm, sample, check_shape
+from baselines.ppo2.policies import nature_cnn
+from baselines.a2c.utils import fc, batch_to_seq, seq_to_batch, lstm, sample
 
 
 class AcerCnnPolicy(object):
@@ -12,14 +13,10 @@ class AcerCnnPolicy(object):
         nact = ac_space.n
         X = tf.placeholder(tf.uint8, ob_shape) # obs
         with tf.variable_scope("model", reuse=reuse):
-            h = conv(tf.cast(X, tf.float32) / 255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
-            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
-            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
-            h3 = conv_to_fc(h3)
-            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
-            pi_logits = fc(h4, 'pi', nact, act=lambda x: x, init_scale=0.01)
+            h = nature_cnn(X)
+            pi_logits = fc(h, 'pi', nact, init_scale=0.01)
             pi = tf.nn.softmax(pi_logits)
-            q = fc(h4, 'q', nact, act=lambda x: x)
+            q = fc(h, 'q', nact)
 
         a = sample(pi_logits) # could change this to use self.pi instead
         self.initial_state = [] # not stateful
@@ -54,14 +51,10 @@ class AcerLstmPolicy(object):
         M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
         S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
         with tf.variable_scope("model", reuse=reuse):
-            h = conv(tf.cast(X, tf.float32) / 255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
-            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
-            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
-            h3 = conv_to_fc(h3)
-            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
+            h = nature_cnn(X)
 
             # lstm
-            xs = batch_to_seq(h4, nenv, nsteps)
+            xs = batch_to_seq(h, nenv, nsteps)
             ms = batch_to_seq(M, nenv, nsteps)
             h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
             h5 = seq_to_batch(h5)
@@ -1,24 +1,11 @@
-#!/usr/bin/env python
-import os, logging, gym
+#!/usr/bin/env python3
 from baselines import logger
-from baselines.common import set_global_seeds
-from baselines import bench
 from baselines.acer.acer_simple import learn
-from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
-from baselines.common.atari_wrappers import make_atari, wrap_deepmind
 from baselines.acer.policies import AcerCnnPolicy, AcerLstmPolicy
+from baselines.common.cmd_util import make_atari_env, atari_arg_parser
 
 def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu):
-    def make_env(rank):
-        def _thunk():
-            env = make_atari(env_id)
-            env.seed(seed + rank)
-            env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
-            gym.logger.setLevel(logging.WARN)
-            return wrap_deepmind(env)
-        return _thunk
-    set_global_seeds(seed)
-    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
+    env = make_atari_env(env_id, num_cpu, seed)
     if policy == 'cnn':
         policy_fn = AcerCnnPolicy
     elif policy == 'lstm':
@@ -30,16 +17,12 @@ def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu):
     env.close()
 
 def main():
-    import argparse
-    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
-    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
+    parser = atari_arg_parser()
     parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm'], default='cnn')
     parser.add_argument('--lrschedule', help='Learning rate schedule', choices=['constant', 'linear'], default='constant')
-    parser.add_argument('--logdir', help ='Directory for logging', default='./log')
-    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
+    parser.add_argument('--logdir', help ='Directory for logging')
     args = parser.parse_args()
-    logger.configure(os.path.abspath(args.logdir))
+    logger.configure(args.logdir)
     train(args.env, num_timesteps=args.num_timesteps, seed=args.seed,
           policy=args.policy, lrschedule=args.lrschedule, num_cpu=16)
 
@@ -1,10 +1,10 @@
 import numpy as np
 import tensorflow as tf
 from baselines import logger
-from baselines import common
+import baselines.common as common
 from baselines.common import tf_util as U
 from baselines.acktr import kfac
-from baselines.acktr.filters import ZFilter
+from baselines.common.filters import ZFilter
 
 def pathlength(path):
     return path["reward"].shape[0]# Loss function that we'll differentiate to get the policy gradient
@@ -70,7 +70,7 @@ def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps,
     coord = tf.train.Coordinator()
     for qr in [q_runner, vf.q_runner]:
         assert (qr != None)
-        enqueue_threads.extend(qr.create_threads(U.get_session(), coord=coord, start=True))
+        enqueue_threads.extend(qr.create_threads(tf.get_default_session(), coord=coord, start=True))
 
     i = 0
     timesteps_so_far = 0
@@ -122,10 +122,10 @@ def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps,
         kl = policy.compute_kl(ob_no, oldac_dist)
         if kl > desired_kl * 2:
             logger.log("kl too high")
-            U.eval(tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)))
+            tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval()
         elif kl < desired_kl / 2:
             logger.log("kl too low")
-            U.eval(tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)))
+            tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval()
         else:
             logger.log("kl just right!")
 
@@ -7,16 +7,17 @@ from baselines import logger
 
 from baselines.common import set_global_seeds, explained_variance
 
-from baselines.acktr.utils import discount_with_dones
-from baselines.acktr.utils import Scheduler, find_trainable_variables
-from baselines.acktr.utils import cat_entropy, mse
+from baselines.a2c.a2c import Runner
+from baselines.a2c.utils import discount_with_dones
+from baselines.a2c.utils import Scheduler, find_trainable_variables
+from baselines.a2c.utils import cat_entropy, mse
 from baselines.acktr import kfac
 
 
 class Model(object):
 
     def __init__(self, policy, ob_space, ac_space, nenvs,total_timesteps, nprocs=32, nsteps=20,
-                 nstack=4, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
+                 ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
                  kfac_clip=0.001, lrschedule='linear'):
         config = tf.ConfigProto(allow_soft_placement=True,
                                 intra_op_parallelism_threads=nprocs,
@@ -31,8 +32,8 @@ class Model(object):
         PG_LR = tf.placeholder(tf.float32, [])
         VF_LR = tf.placeholder(tf.float32, [])
 
-        self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
-        self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)
+        self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
+        self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True)
 
         logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
         self.logits = logits = train_model.pi
@@ -71,7 +72,7 @@ class Model(object):
             cur_lr = self.lr.value()
 
             td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, PG_LR:cur_lr}
-            if states != []:
+            if states is not None:
                 td_map[train_model.S] = states
                 td_map[train_model.M] = masks
 
@@ -104,70 +105,8 @@ class Model(object):
         self.initial_state = step_model.initial_state
         tf.global_variables_initializer().run(session=sess)
 
-class Runner(object):
-
-    def __init__(self, env, model, nsteps, nstack, gamma):
-        self.env = env
-        self.model = model
-        nh, nw, nc = env.observation_space.shape
-        nenv = env.num_envs
-        self.batch_ob_shape = (nenv*nsteps, nh, nw, nc*nstack)
-        self.obs = np.zeros((nenv, nh, nw, nc*nstack), dtype=np.uint8)
-        obs = env.reset()
-        self.update_obs(obs)
-        self.gamma = gamma
-        self.nsteps = nsteps
-        self.states = model.initial_state
-        self.dones = [False for _ in range(nenv)]
-
-    def update_obs(self, obs):
-        self.obs = np.roll(self.obs, shift=-1, axis=3)
-        self.obs[:, :, :, -1] = obs[:, :, :, 0]
-
-    def run(self):
-        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
-        mb_states = self.states
-        for n in range(self.nsteps):
-            actions, values, states = self.model.step(self.obs, self.states, self.dones)
-            mb_obs.append(np.copy(self.obs))
-            mb_actions.append(actions)
-            mb_values.append(values)
-            mb_dones.append(self.dones)
-            obs, rewards, dones, _ = self.env.step(actions)
-            self.states = states
-            self.dones = dones
-            for n, done in enumerate(dones):
-                if done:
-                    self.obs[n] = self.obs[n]*0
-            self.update_obs(obs)
-            mb_rewards.append(rewards)
-            mb_dones.append(self.dones)
-        #batch of steps to batch of rollouts
-        mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
-        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
-        mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
-        mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
-        mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
-        mb_masks = mb_dones[:, :-1]
-        mb_dones = mb_dones[:, 1:]
-        last_values = self.model.value(self.obs, self.states, self.dones).tolist()
-        #discount/bootstrap off value fn
-        for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
-            rewards = rewards.tolist()
-            dones = dones.tolist()
-            if dones[-1] == 0:
-                rewards = discount_with_dones(rewards+[value], dones+[0], self.gamma)[:-1]
-            else:
-                rewards = discount_with_dones(rewards, dones, self.gamma)
-            mb_rewards[n] = rewards
-        mb_rewards = mb_rewards.flatten()
-        mb_actions = mb_actions.flatten()
-        mb_values = mb_values.flatten()
-        mb_masks = mb_masks.flatten()
-        return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
-
 def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20,
-          nstack=4, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
+          ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
           kfac_clip=0.001, save_interval=None, lrschedule='linear'):
     tf.reset_default_graph()
     set_global_seeds(seed)
@@ -176,7 +115,7 @@ def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval
     ob_space = env.observation_space
     ac_space = env.action_space
     make_model = lambda : Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs, nsteps
-                                =nsteps, nstack=nstack, ent_coef=ent_coef, vf_coef=vf_coef, vf_fisher_coef=
+                                =nsteps, ent_coef=ent_coef, vf_coef=vf_coef, vf_fisher_coef=
                                 vf_fisher_coef, lr=lr, max_grad_norm=max_grad_norm, kfac_clip=kfac_clip,
                                 lrschedule=lrschedule)
     if save_interval and logger.get_dir():
@@ -185,7 +124,7 @@ def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval
             fh.write(cloudpickle.dumps(make_model))
     model = make_model()
 
-    runner = Runner(env, model, nsteps=nsteps, nstack=nstack, gamma=gamma)
+    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
     nbatch = nenvs*nsteps
     tstart = time.time()
     coord = tf.train.Coordinator()
@@ -1,4 +1,4 @@
-from baselines.acktr.running_stat import RunningStat
+from .running_stat import RunningStat
 from collections import deque
 import numpy as np
 
@@ -1,93 +1,55 @@
 import tensorflow as tf
-import numpy as np
 
 
 def gmatmul(a, b, transpose_a=False, transpose_b=False, reduce_dim=None):
-    if reduce_dim == None:
-        # general batch matmul
-        if len(a.get_shape()) == 3 and len(b.get_shape()) == 3:
-            return tf.batch_matmul(a, b, adj_x=transpose_a, adj_y=transpose_b)
-        elif len(a.get_shape()) == 3 and len(b.get_shape()) == 2:
-            if transpose_b:
-                N = b.get_shape()[0].value
-            else:
-                N = b.get_shape()[1].value
-            B = a.get_shape()[0].value
-            if transpose_a:
-                K = a.get_shape()[1].value
-                a = tf.reshape(tf.transpose(a, [0, 2, 1]), [-1, K])
-            else:
-                K = a.get_shape()[-1].value
-                a = tf.reshape(a, [-1, K])
-            result = tf.matmul(a, b, transpose_b=transpose_b)
-            result = tf.reshape(result, [B, -1, N])
-            return result
-        elif len(a.get_shape()) == 2 and len(b.get_shape()) == 3:
-            if transpose_a:
-                M = a.get_shape()[1].value
-            else:
-                M = a.get_shape()[0].value
-            B = b.get_shape()[0].value
-            if transpose_b:
-                K = b.get_shape()[-1].value
-                b = tf.transpose(tf.reshape(b, [-1, K]), [1, 0])
-            else:
-                K = b.get_shape()[1].value
-                b = tf.transpose(tf.reshape(
-                    tf.transpose(b, [0, 2, 1]), [-1, K]), [1, 0])
-            result = tf.matmul(a, b, transpose_a=transpose_a)
-            result = tf.transpose(tf.reshape(result, [M, B, -1]), [1, 0, 2])
-            return result
-        else:
-            return tf.matmul(a, b, transpose_a=transpose_a, transpose_b=transpose_b)
-    else:
-        # weird batch matmul
-        if len(a.get_shape()) == 2 and len(b.get_shape()) > 2:
-            # reshape reduce_dim to the left most dim in b
-            b_shape = b.get_shape()
-            if reduce_dim != 0:
-                b_dims = list(range(len(b_shape)))
-                b_dims.remove(reduce_dim)
-                b_dims.insert(0, reduce_dim)
-                b = tf.transpose(b, b_dims)
-            b_t_shape = b.get_shape()
-            b = tf.reshape(b, [int(b_shape[reduce_dim]), -1])
-            result = tf.matmul(a, b, transpose_a=transpose_a,
-                               transpose_b=transpose_b)
-            result = tf.reshape(result, b_t_shape)
-            if reduce_dim != 0:
-                b_dims = list(range(len(b_shape)))
-                b_dims.remove(0)
-                b_dims.insert(reduce_dim, 0)
-                result = tf.transpose(result, b_dims)
-            return result
-        elif len(a.get_shape()) > 2 and len(b.get_shape()) == 2:
-            # reshape reduce_dim to the right most dim in a
-            a_shape = a.get_shape()
-            outter_dim = len(a_shape) - 1
-            reduce_dim = len(a_shape) - reduce_dim - 1
-            if reduce_dim != outter_dim:
-                a_dims = list(range(len(a_shape)))
-                a_dims.remove(reduce_dim)
-                a_dims.insert(outter_dim, reduce_dim)
-                a = tf.transpose(a, a_dims)
-            a_t_shape = a.get_shape()
-            a = tf.reshape(a, [-1, int(a_shape[reduce_dim])])
-            result = tf.matmul(a, b, transpose_a=transpose_a,
-                               transpose_b=transpose_b)
-            result = tf.reshape(result, a_t_shape)
-            if reduce_dim != outter_dim:
-                a_dims = list(range(len(a_shape)))
-                a_dims.remove(outter_dim)
-                a_dims.insert(reduce_dim, outter_dim)
-                result = tf.transpose(result, a_dims)
-            return result
-        elif len(a.get_shape()) == 2 and len(b.get_shape()) == 2:
-            return tf.matmul(a, b, transpose_a=transpose_a, transpose_b=transpose_b)
-
-    assert False, 'something went wrong'
+    assert reduce_dim is not None
+
+    # weird batch matmul
+    if len(a.get_shape()) == 2 and len(b.get_shape()) > 2:
+        # reshape reduce_dim to the left most dim in b
+        b_shape = b.get_shape()
+        if reduce_dim != 0:
+            b_dims = list(range(len(b_shape)))
+            b_dims.remove(reduce_dim)
+            b_dims.insert(0, reduce_dim)
+            b = tf.transpose(b, b_dims)
+        b_t_shape = b.get_shape()
+        b = tf.reshape(b, [int(b_shape[reduce_dim]), -1])
+        result = tf.matmul(a, b, transpose_a=transpose_a,
+                           transpose_b=transpose_b)
+        result = tf.reshape(result, b_t_shape)
+        if reduce_dim != 0:
+            b_dims = list(range(len(b_shape)))
+            b_dims.remove(0)
+            b_dims.insert(reduce_dim, 0)
+            result = tf.transpose(result, b_dims)
+        return result
+
+    elif len(a.get_shape()) > 2 and len(b.get_shape()) == 2:
+        # reshape reduce_dim to the right most dim in a
+        a_shape = a.get_shape()
+        outter_dim = len(a_shape) - 1
+        reduce_dim = len(a_shape) - reduce_dim - 1
+        if reduce_dim != outter_dim:
+            a_dims = list(range(len(a_shape)))
+            a_dims.remove(reduce_dim)
+            a_dims.insert(outter_dim, reduce_dim)
+            a = tf.transpose(a, a_dims)
+        a_t_shape = a.get_shape()
+        a = tf.reshape(a, [-1, int(a_shape[reduce_dim])])
+        result = tf.matmul(a, b, transpose_a=transpose_a,
+                           transpose_b=transpose_b)
+        result = tf.reshape(result, a_t_shape)
+        if reduce_dim != outter_dim:
+            a_dims = list(range(len(a_shape)))
+            a_dims.remove(outter_dim)
+            a_dims.insert(reduce_dim, outter_dim)
+            result = tf.transpose(result, a_dims)
+        return result
+
+    elif len(a.get_shape()) == 2 and len(b.get_shape()) == 2:
+        return tf.matmul(a, b, transpose_a=transpose_a, transpose_b=transpose_b)
+
+    assert False, 'something went wrong'
 
 
 def clipoutNeg(vec, threshold=1e-6):
@@ -1,43 +1,8 @@
 import numpy as np
 import tensorflow as tf
-from baselines.acktr.utils import conv, fc, dense, conv_to_fc, sample, kl_div
+from baselines.acktr.utils import dense, kl_div
 import baselines.common.tf_util as U
 
-class CnnPolicy(object):
-
-    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
-        nbatch = nenv*nsteps
-        nh, nw, nc = ob_space.shape
-        ob_shape = (nbatch, nh, nw, nc*nstack)
-        nact = ac_space.n
-        X = tf.placeholder(tf.uint8, ob_shape) #obs
-        with tf.variable_scope("model", reuse=reuse):
-            h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
-            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
-            h3 = conv(h2, 'c3', nf=32, rf=3, stride=1, init_scale=np.sqrt(2))
-            h3 = conv_to_fc(h3)
-            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
-            pi = fc(h4, 'pi', nact, act=lambda x:x)
-            vf = fc(h4, 'v', 1, act=lambda x:x)
-
-        v0 = vf[:, 0]
-        a0 = sample(pi)
-        self.initial_state = [] #not stateful
-
-        def step(ob, *_args, **_kwargs):
-            a, v = sess.run([a0, v0], {X:ob})
-            return a, v, [] #dummy state
-
-        def value(ob, *_args, **_kwargs):
-            return sess.run(v0, {X:ob})
-
-        self.X = X
-        self.pi = pi
-        self.vf = vf
-        self.step = step
-        self.value = value
-
 
 class GaussianMlpPolicy(object):
     def __init__(self, ob_dim, ac_dim):
         # Here we'll construct a bunch of expressions, which will be used in two places:
@@ -60,12 +25,12 @@ class GaussianMlpPolicy(object):
         std_na = tf.tile(std_1a, [tf.shape(mean_na)[0], 1])
         ac_dist = tf.concat([tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(std_na, [-1, ac_dim])], 1)
         sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:,ac_dim:])) * ac_dist[:,ac_dim:] + ac_dist[:,:ac_dim] # This is the sampled action we'll perform.
-        logprobsampled_n = - U.sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * U.sum(tf.square(ac_dist[:,:ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of sampled action
-        logprob_n = - U.sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * U.sum(tf.square(ac_dist[:,:ac_dim] - oldac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy)
-        kl = U.mean(kl_div(oldac_dist, ac_dist, ac_dim))
-        #kl = .5 * U.mean(tf.square(logprob_n - oldlogprob_n)) # Approximation of KL divergence between old policy used to generate actions, and new policy used to compute logprob_n
-        surr = - U.mean(adv_n * logprob_n) # Loss function that we'll differentiate to get the policy gradient
-        surr_sampled = - U.mean(logprob_n) # Sampled loss of the policy
+        logprobsampled_n = - tf.reduce_sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * tf.reduce_sum(tf.square(ac_dist[:,:ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of sampled action
+        logprob_n = - tf.reduce_sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * tf.reduce_sum(tf.square(ac_dist[:,:ac_dim] - oldac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy)
+        kl = tf.reduce_mean(kl_div(oldac_dist, ac_dist, ac_dim))
+        #kl = .5 * tf.reduce_mean(tf.square(logprob_n - oldlogprob_n)) # Approximation of KL divergence between old policy used to generate actions, and new policy used to compute logprob_n
+        surr = - tf.reduce_mean(adv_n * logprob_n) # Loss function that we'll differentiate to get the policy gradient
+        surr_sampled = - tf.reduce_mean(logprob_n) # Sampled loss of the policy
         self._act = U.function([ob_no], [sampled_ac_na, ac_dist, logprobsampled_n]) # Generate a new action and its logprob
         #self.compute_kl = U.function([ob_no, oldac_na, oldlogprob_n], kl) # Compute (approximate) KL divergence between old policy and new policy
         self.compute_kl = U.function([ob_no, oldac_dist], kl)
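Written out, the one-line expressions above are the log-density of a diagonal Gaussian with mean \mu, standard deviation \sigma and action dimension d:

\log p(a) = -\sum_{i=1}^{d} \log \sigma_i \;-\; \frac{d}{2}\log(2\pi) \;-\; \frac{1}{2}\sum_{i=1}^{d} \frac{(a_i - \mu_i)^2}{\sigma_i^2}

logprobsampled_n evaluates this at the freshly sampled action, logprob_n at the previously taken action, and surr is the advantage-weighted negative of it; the change here only swaps the U.sum/U.mean helpers for the equivalent tf.reduce_sum/tf.reduce_mean calls.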
@@ -1,38 +1,21 @@
 #!/usr/bin/env python3
-import os, logging, gym
 from baselines import logger
-from baselines.common import set_global_seeds
-from baselines import bench
 from baselines.acktr.acktr_disc import learn
-from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
-from baselines.common.atari_wrappers import make_atari, wrap_deepmind
-from baselines.acktr.policies import CnnPolicy
+from baselines.common.cmd_util import make_atari_env, atari_arg_parser
+from baselines.common.vec_env.vec_frame_stack import VecFrameStack
+from baselines.ppo2.policies import CnnPolicy
 
 def train(env_id, num_timesteps, seed, num_cpu):
-    def make_env(rank):
-        def _thunk():
-            env = make_atari(env_id)
-            env.seed(seed + rank)
-            env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
-            gym.logger.setLevel(logging.WARN)
-            return wrap_deepmind(env)
-        return _thunk
-    set_global_seeds(seed)
-    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
+    env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
     policy_fn = CnnPolicy
     learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu)
     env.close()
 
 def main():
-    import argparse
-    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
-    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
-    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
-    args = parser.parse_args()
+    args = atari_arg_parser().parse_args()
     logger.configure()
     train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, num_cpu=32)
 
 
 if __name__ == '__main__':
     main()
@@ -1,22 +1,14 @@
 #!/usr/bin/env python3
-import argparse
-import logging
-import os
 import tensorflow as tf
-import gym
 from baselines import logger
-from baselines.common import set_global_seeds
-from baselines import bench
+from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser
 from baselines.acktr.acktr_cont import learn
 from baselines.acktr.policies import GaussianMlpPolicy
 from baselines.acktr.value_functions import NeuralNetValueFunction
 
 def train(env_id, num_timesteps, seed):
-    env=gym.make(env_id)
-    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
-    set_global_seeds(seed)
-    env.seed(seed)
-    gym.logger.setLevel(logging.WARN)
+    env = make_mujoco_env(env_id, seed)
 
     with tf.Session(config=tf.ConfigProto()):
         ob_dim = env.observation_space.shape[0]
@@ -33,11 +25,10 @@ def train(env_id, num_timesteps, seed):
 
     env.close()
 
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description='Run Mujoco benchmark.')
-    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
-    parser.add_argument('--env', help='environment ID', type=str, default="Reacher-v1")
-    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
-    args = parser.parse_args()
+def main():
+    args = mujoco_arg_parser().parse_args()
     logger.configure()
     train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
+
+if __name__ == "__main__":
+    main()
@@ -1,69 +1,8 @@
-import os
-import numpy as np
 import tensorflow as tf
-import baselines.common.tf_util as U
-from collections import deque
-
-def sample(logits):
-    noise = tf.random_uniform(tf.shape(logits))
-    return tf.argmax(logits - tf.log(-tf.log(noise)), 1)
-
-def std(x):
-    mean = tf.reduce_mean(x)
-    var = tf.reduce_mean(tf.square(x-mean))
-    return tf.sqrt(var)
-
-def cat_entropy(logits):
-    a0 = logits - tf.reduce_max(logits, 1, keep_dims=True)
-    ea0 = tf.exp(a0)
-    z0 = tf.reduce_sum(ea0, 1, keep_dims=True)
-    p0 = ea0 / z0
-    return tf.reduce_sum(p0 * (tf.log(z0) - a0), 1)
-
-def cat_entropy_softmax(p0):
-    return - tf.reduce_sum(p0 * tf.log(p0 + 1e-6), axis = 1)
-
-def mse(pred, target):
-    return tf.square(pred-target)/2.
-
-def ortho_init(scale=1.0):
-    def _ortho_init(shape, dtype, partition_info=None):
-        #lasagne ortho init for tf
-        shape = tuple(shape)
-        if len(shape) == 2:
-            flat_shape = shape
-        elif len(shape) == 4: # assumes NHWC
-            flat_shape = (np.prod(shape[:-1]), shape[-1])
-        else:
-            raise NotImplementedError
-        a = np.random.normal(0.0, 1.0, flat_shape)
-        u, _, v = np.linalg.svd(a, full_matrices=False)
-        q = u if u.shape == flat_shape else v # pick the one with the correct shape
-        q = q.reshape(shape)
-        return (scale * q[:shape[0], :shape[1]]).astype(np.float32)
-    return _ortho_init
-
-def conv(x, scope, nf, rf, stride, pad='VALID', act=tf.nn.relu, init_scale=1.0):
-    with tf.variable_scope(scope):
-        nin = x.get_shape()[3].value
-        w = tf.get_variable("w", [rf, rf, nin, nf], initializer=ortho_init(init_scale))
-        b = tf.get_variable("b", [nf], initializer=tf.constant_initializer(0.0))
-        z = tf.nn.conv2d(x, w, strides=[1, stride, stride, 1], padding=pad)+b
-        h = act(z)
-        return h
-
-def fc(x, scope, nh, act=tf.nn.relu, init_scale=1.0):
-    with tf.variable_scope(scope):
-        nin = x.get_shape()[1].value
-        w = tf.get_variable("w", [nin, nh], initializer=ortho_init(init_scale))
-        b = tf.get_variable("b", [nh], initializer=tf.constant_initializer(0.0))
-        z = tf.matmul(x, w)+b
-        h = act(z)
-        return h
-
 def dense(x, size, name, weight_init=None, bias_init=0, weight_loss_dict=None, reuse=None):
     with tf.variable_scope(name, reuse=reuse):
-        assert (len(U.scope_name().split('/')) == 2)
+        assert (len(tf.get_variable_scope().name.split('/')) == 2)

         w = tf.get_variable("w", [x.get_shape()[1], size], initializer=weight_init)
         b = tf.get_variable("b", [size], initializer=tf.constant_initializer(bias_init))
@@ -75,15 +14,10 @@ def dense(x, size, name, weight_init=None, bias_init=0, weight_loss_dict=None, r
             weight_loss_dict[w] = weight_decay_fc
             weight_loss_dict[b] = 0.0

-            tf.add_to_collection(U.scope_name().split('/')[0] + '_' + 'losses', weight_decay)
+            tf.add_to_collection(tf.get_variable_scope().name.split('/')[0] + '_' + 'losses', weight_decay)

         return tf.nn.bias_add(tf.matmul(x, w), b)

-def conv_to_fc(x):
-    nh = np.prod([v.value for v in x.get_shape()[1:]])
-    x = tf.reshape(x, [-1, nh])
-    return x
-
 def kl_div(action_dist1, action_dist2, action_size):
     mean1, std1 = action_dist1[:, :action_size], action_dist1[:, action_size:]
     mean2, std2 = action_dist2[:, :action_size], action_dist2[:, action_size:]
@@ -92,109 +26,3 @@ def kl_div(action_dist1, action_dist2, action_size):
     denominator = 2 * tf.square(std2) + 1e-8
     return tf.reduce_sum(
         numerator/denominator + tf.log(std2) - tf.log(std1),reduction_indices=-1)
-
-def discount_with_dones(rewards, dones, gamma):
-    discounted = []
-    r = 0
-    for reward, done in zip(rewards[::-1], dones[::-1]):
-        r = reward + gamma*r*(1.-done) # fixed off by one bug
-        discounted.append(r)
-    return discounted[::-1]
-
-def find_trainable_variables(key):
-    with tf.variable_scope(key):
-        return tf.trainable_variables()
-
-def make_path(f):
-    return os.makedirs(f, exist_ok=True)
-
-def constant(p):
-    return 1
-
-def linear(p):
-    return 1-p
-
-def middle_drop(p):
-    eps = 0.75
-    if 1-p<eps:
-        return eps*0.1
-    return 1-p
-
-def double_linear_con(p):
-    p *= 2
-    eps = 0.125
-    if 1-p<eps:
-        return eps
-    return 1-p
-
-def double_middle_drop(p):
-    eps1 = 0.75
-    eps2 = 0.25
-    if 1-p<eps1:
-        if 1-p<eps2:
-            return eps2*0.5
-        return eps1*0.1
-    return 1-p
-
-schedules = {
-    'linear':linear,
-    'constant':constant,
-    'double_linear_con':double_linear_con,
-    'middle_drop':middle_drop,
-    'double_middle_drop':double_middle_drop
-}
-
-class Scheduler(object):
-
-    def __init__(self, v, nvalues, schedule):
-        self.n = 0.
-        self.v = v
-        self.nvalues = nvalues
-        self.schedule = schedules[schedule]
-
-    def value(self):
-        current_value = self.v*self.schedule(self.n/self.nvalues)
-        self.n += 1.
-        return current_value
-
-    def value_steps(self, steps):
-        return self.v*self.schedule(steps/self.nvalues)
-
-class EpisodeStats:
-    def __init__(self, nsteps, nenvs):
-        self.episode_rewards = []
-        for i in range(nenvs):
-            self.episode_rewards.append([])
-        self.lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
-        self.rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
-        self.nsteps = nsteps
-        self.nenvs = nenvs
-
-    def feed(self, rewards, masks):
-        rewards = np.reshape(rewards, [self.nenvs, self.nsteps])
-        masks = np.reshape(masks, [self.nenvs, self.nsteps])
-        for i in range(0, self.nenvs):
-            for j in range(0, self.nsteps):
-                self.episode_rewards[i].append(rewards[i][j])
-                if masks[i][j]:
-                    l = len(self.episode_rewards[i])
-                    s = sum(self.episode_rewards[i])
-                    self.lenbuffer.append(l)
-                    self.rewbuffer.append(s)
-                    self.episode_rewards[i] = []
-
-    def mean_length(self):
-        if self.lenbuffer:
-            return np.mean(self.lenbuffer)
-        else:
-            return 0  # on the first params dump, no episodes are finished
-
-    def mean_reward(self):
-        if self.rewbuffer:
-            return np.mean(self.rewbuffer)
-        else:
-            return 0
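The kl_div helper retained above takes action distributions packed as [mean, std] along the last axis and returns the closed-form KL divergence between the two diagonal Gaussians. A NumPy sketch of the same formula can serve as a sanity check; the function name and test values here are illustrative, not part of the repository:

```python
import numpy as np

def kl_diag_gauss(mean1, std1, mean2, std2):
    # Closed-form KL(N(mean1, std1^2) || N(mean2, std2^2)), summed over the
    # last axis, written in the same numerator/denominator form as kl_div.
    numerator = np.square(mean1 - mean2) + np.square(std1) - np.square(std2)
    denominator = 2.0 * np.square(std2) + 1e-8
    return np.sum(numerator / denominator + np.log(std2) - np.log(std1), axis=-1)

# Identical distributions should give (approximately) zero divergence.
mean = np.array([[0.0, 1.0]])
std = np.array([[0.5, 2.0]])
assert np.allclose(kl_diag_gauss(mean, std, mean, std), 0.0)
```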
@@ -1,6 +1,6 @@
 from baselines import logger
 import numpy as np
-from baselines import common
+import baselines.common as common
 from baselines.common import tf_util as U
 import tensorflow as tf
 from baselines.acktr import kfac
@@ -16,8 +16,8 @@ class NeuralNetValueFunction(object):
         vpred_n = dense(h2, 1, "hfinal", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)[:,0]
         sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
         wd_loss = tf.get_collection("vf_losses", None)
-        loss = U.mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
-        loss_sampled = U.mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))
+        loss = tf.reduce_mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
+        loss_sampled = tf.reduce_mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))
         self._predict = U.function([X], vpred_n)
         optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9, \
                                    clip_kl=0.3, epsilon=0.1, stats_decay=0.95, \
@@ -1,15 +1,24 @@
+import re
 import os.path as osp
+import os
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

 _atari7 = ['BeamRider', 'Breakout', 'Enduro', 'Pong', 'Qbert', 'Seaquest', 'SpaceInvaders']
 _atariexpl7 = ['Freeway', 'Gravitar', 'MontezumaRevenge', 'Pitfall', 'PrivateEye', 'Solaris', 'Venture']

 _BENCHMARKS = []

+remove_version_re = re.compile(r'-v\d+$')
 def register_benchmark(benchmark):
     for b in _BENCHMARKS:
         if b['name'] == benchmark['name']:
             raise ValueError('Benchmark with name %s already registered!' % b['name'])

+    # automatically add a description if it is not present
+    if 'tasks' in benchmark:
+        for t in benchmark['tasks']:
+            if 'desc' not in t:
+                t['desc'] = remove_version_re.sub('', t['env_id'])
     _BENCHMARKS.append(benchmark)


@@ -42,30 +51,28 @@ _ATARI_SUFFIX = 'NoFrameskip-v4'
 register_benchmark({
     'name': 'Atari50M',
     'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 50M timesteps',
-    'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(50e6)} for _game in _atari7]
+    'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(50e6)} for _game in _atari7]
 })

 register_benchmark({
     'name': 'Atari10M',
     'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 10M timesteps',
-    'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atari7]
+    'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atari7]
 })

 register_benchmark({
     'name': 'Atari1Hr',
     'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 1 hour of walltime',
-    'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_seconds': 60 * 60} for _game in _atari7]
+    'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_seconds': 60 * 60} for _game in _atari7]
 })

 register_benchmark({
     'name': 'AtariExploration10M',
     'description': '7 Atari games emphasizing exploration, with pixel observations, 10M timesteps',
-    'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atariexpl7]
+    'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atariexpl7]
 })


 # MuJoCo

 _mujocosmall = [
@@ -128,5 +135,6 @@ _atari50 = [ # actually 47
 register_benchmark({
     'name': 'Atari50_10M',
     'description': '47 Atari games from Mnih et al. (2013), with pixel observations, 10M timesteps',
-    'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atari50]
+    'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atari50]
 })
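With the auto-description logic above, a registered task no longer needs an explicit 'desc'; register_benchmark derives one by stripping the trailing version suffix from the env id. A small usage sketch (the benchmark name below is made up for illustration):

```python
register_benchmark({
    'name': 'MyAtariSmoke',  # hypothetical benchmark, not part of the repo
    'description': 'Tiny smoke-test benchmark',
    'tasks': [{'env_id': 'PongNoFrameskip-v4', 'trials': 1, 'num_timesteps': int(1e5)}],
})
# The registered task ends up with desc == 'PongNoFrameskip',
# because remove_version_re strips the trailing '-v4'.
```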
@@ -25,8 +25,7 @@ class Monitor(Wrapper):
             else:
                 filename = filename + "." + Monitor.EXT
             self.f = open(filename, "wt")
-            self.f.write('#%s\n'%json.dumps({"t_start": self.tstart, "gym_version": gym.__version__,
-                "env_id": env.spec.id if env.spec else 'Unknown'}))
+            self.f.write('#%s\n'%json.dumps({"t_start": self.tstart, 'env_id' : env.spec and env.spec.id}))
             self.logger = csv.DictWriter(self.f, fieldnames=('r', 'l', 't')+reset_keywords)
             self.logger.writeheader()

@@ -36,10 +35,11 @@ class Monitor(Wrapper):
         self.needs_reset = True
         self.episode_rewards = []
         self.episode_lengths = []
+        self.episode_times = []
         self.total_steps = 0
         self.current_reset_info = {} # extra info about the current episode, that was passed in during reset()

-    def _reset(self, **kwargs):
+    def reset(self, **kwargs):
         if not self.allow_early_resets and not self.needs_reset:
             raise RuntimeError("Tried to reset an environment before done. If you want to allow early resets, wrap your env with Monitor(env, path, allow_early_resets=True)")
         self.rewards = []
@@ -51,7 +51,7 @@ class Monitor(Wrapper):
             self.current_reset_info[k] = v
         return self.env.reset(**kwargs)

-    def _step(self, action):
+    def step(self, action):
         if self.needs_reset:
             raise RuntimeError("Tried to step environment that needs reset")
         ob, rew, done, info = self.env.step(action)
@@ -61,12 +61,13 @@ class Monitor(Wrapper):
             eprew = sum(self.rewards)
             eplen = len(self.rewards)
             epinfo = {"r": round(eprew, 6), "l": eplen, "t": round(time.time() - self.tstart, 6)}
+            self.episode_rewards.append(eprew)
+            self.episode_lengths.append(eplen)
+            self.episode_times.append(time.time() - self.tstart)
             epinfo.update(self.current_reset_info)
             if self.logger:
                 self.logger.writerow(epinfo)
                 self.f.flush()
-            self.episode_rewards.append(eprew)
-            self.episode_lengths.append(eplen)
             info['episode'] = epinfo
         self.total_steps += 1
         return (ob, rew, done, info)
@@ -84,6 +85,9 @@ class Monitor(Wrapper):
     def get_episode_lengths(self):
         return self.episode_lengths

+    def get_episode_times(self):
+        return self.episode_times
+
 class LoadMonitorResultsError(Exception):
     pass

@@ -92,7 +96,9 @@ def get_monitor_files(dir):

 def load_results(dir):
     import pandas
-    monitor_files = glob(osp.join(dir, "*monitor.*")) # get both csv and (old) json files
+    monitor_files = (
+        glob(osp.join(dir, "*monitor.json")) +
+        glob(osp.join(dir, "*monitor.csv"))) # get both csv and (old) json files
     if not monitor_files:
         raise LoadMonitorResultsError("no monitor files of the form *%s found in %s" % (Monitor.EXT, dir))
     dfs = []
@@ -114,10 +120,37 @@ def load_results(dir):
                     episode = json.loads(line)
                     episodes.append(episode)
             df = pandas.DataFrame(episodes)
-        df['t'] += header['t_start']
+        else:
+            assert 0, 'unreachable'
+        df['t'] += header['t_start']
         dfs.append(df)
     df = pandas.concat(dfs)
     df.sort_values('t', inplace=True)
+    df.reset_index(inplace=True)
     df['t'] -= min(header['t_start'] for header in headers)
     df.headers = headers # HACK to preserve backwards compatibility
     return df

+def test_monitor():
+    env = gym.make("CartPole-v1")
+    env.seed(0)
+    mon_file = "/tmp/baselines-test-%s.monitor.csv" % uuid.uuid4()
+    menv = Monitor(env, mon_file)
+    menv.reset()
+    for _ in range(1000):
+        _, _, done, _ = menv.step(0)
+        if done:
+            menv.reset()
+
+    f = open(mon_file, 'rt')
+
+    firstline = f.readline()
+    assert firstline.startswith('#')
+    metadata = json.loads(firstline[1:])
+    assert metadata['env_id'] == "CartPole-v1"
+    assert set(metadata.keys()) == {'env_id', 'gym_version', 't_start'}, "Incorrect keys in monitor metadata"
+
+    last_logline = pandas.read_csv(f, index_col=None)
+    assert set(last_logline.keys()) == {'l', 't', 'r'}, "Incorrect keys in monitor logline"
+    f.close()
+    os.remove(mon_file)
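Besides the new-gym step()/reset() names, Monitor now records per-episode wall-clock times and exposes them via get_episode_times(), as test_monitor exercises. A short usage sketch, assuming Monitor and load_results are importable from baselines.bench as in the hunk above (the path and env are illustrative):

```python
import gym
from baselines.bench import Monitor
from baselines.bench.monitor import load_results

env = Monitor(gym.make("CartPole-v1"), "/tmp/example")  # writes /tmp/example.monitor.csv
env.reset()
for _ in range(200):
    _, _, done, _ = env.step(env.action_space.sample())
    if done:
        env.reset()

print(env.get_episode_rewards(), env.get_episode_lengths(), env.get_episode_times())
df = load_results("/tmp")  # pandas DataFrame with 'r', 'l', 't' columns, sorted by time
```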
@@ -3,6 +3,7 @@ from collections import deque
 import gym
 from gym import spaces
 import cv2
+cv2.ocl.setUseOpenCL(False)

 class NoopResetEnv(gym.Wrapper):
     def __init__(self, env, noop_max=30):
@@ -15,7 +16,7 @@ class NoopResetEnv(gym.Wrapper):
         self.noop_action = 0
         assert env.unwrapped.get_action_meanings()[0] == 'NOOP'

-    def _reset(self, **kwargs):
+    def reset(self, **kwargs):
         """ Do no-op action for a number of steps in [1, noop_max]."""
         self.env.reset(**kwargs)
         if self.override_num_noops is not None:
@@ -30,6 +31,9 @@ class NoopResetEnv(gym.Wrapper):
             obs = self.env.reset(**kwargs)
         return obs

+    def step(self, ac):
+        return self.env.step(ac)
+
 class FireResetEnv(gym.Wrapper):
     def __init__(self, env):
         """Take action on reset for environments that are fixed until firing."""
@@ -37,7 +41,7 @@ class FireResetEnv(gym.Wrapper):
         assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
         assert len(env.unwrapped.get_action_meanings()) >= 3

-    def _reset(self, **kwargs):
+    def reset(self, **kwargs):
         self.env.reset(**kwargs)
         obs, _, done, _ = self.env.step(1)
         if done:
@@ -47,6 +51,9 @@ class FireResetEnv(gym.Wrapper):
             self.env.reset(**kwargs)
         return obs

+    def step(self, ac):
+        return self.env.step(ac)
+
 class EpisodicLifeEnv(gym.Wrapper):
     def __init__(self, env):
         """Make end-of-life == end-of-episode, but only reset on true game over.
@@ -56,7 +63,7 @@ class EpisodicLifeEnv(gym.Wrapper):
         self.lives = 0
         self.was_real_done = True

-    def _step(self, action):
+    def step(self, action):
         obs, reward, done, info = self.env.step(action)
         self.was_real_done = done
         # check current lives, make loss of life terminal,
@@ -70,7 +77,7 @@ class EpisodicLifeEnv(gym.Wrapper):
         self.lives = lives
         return obs, reward, done, info

-    def _reset(self, **kwargs):
+    def reset(self, **kwargs):
         """Reset only when lives are exhausted.
         This way all states are still reachable even though lives are episodic,
         and the learner need not know about any of this behind-the-scenes.
@@ -88,10 +95,13 @@ class MaxAndSkipEnv(gym.Wrapper):
         """Return only every `skip`-th frame"""
         gym.Wrapper.__init__(self, env)
         # most recent raw observations (for max pooling across time steps)
-        self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype='uint8')
+        self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8)
         self._skip = skip

-    def _step(self, action):
+    def reset(self):
+        return self.env.reset()
+
+    def step(self, action):
         """Repeat action, sum reward, and max over last observations."""
         total_reward = 0.0
         done = None
@@ -108,8 +118,14 @@ class MaxAndSkipEnv(gym.Wrapper):

         return max_frame, total_reward, done, info

+    def reset(self, **kwargs):
+        return self.env.reset(**kwargs)
+
 class ClipRewardEnv(gym.RewardWrapper):
-    def _reward(self, reward):
+    def __init__(self, env):
+        gym.RewardWrapper.__init__(self, env)
+
+    def reward(self, reward):
         """Bin reward to {+1, 0, -1} by its sign."""
         return np.sign(reward)

@@ -119,9 +135,10 @@ class WarpFrame(gym.ObservationWrapper):
         gym.ObservationWrapper.__init__(self, env)
         self.width = 84
         self.height = 84
-        self.observation_space = spaces.Box(low=0, high=255, shape=(self.height, self.width, 1))
+        self.observation_space = spaces.Box(low=0, high=255,
+            shape=(self.height, self.width, 1), dtype=np.uint8)

-    def _observation(self, frame):
+    def observation(self, frame):
         frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
         frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA)
         return frame[:, :, None]
@@ -140,15 +157,15 @@ class FrameStack(gym.Wrapper):
         self.k = k
         self.frames = deque([], maxlen=k)
         shp = env.observation_space.shape
-        self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k))
+        self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=np.uint8)

-    def _reset(self):
+    def reset(self):
         ob = self.env.reset()
         for _ in range(self.k):
             self.frames.append(ob)
         return self._get_ob()

-    def _step(self, action):
+    def step(self, action):
         ob, reward, done, info = self.env.step(action)
         self.frames.append(ob)
         return self._get_ob(), reward, done, info
@@ -158,7 +175,10 @@ class FrameStack(gym.Wrapper):
         return LazyFrames(list(self.frames))

 class ScaledFloatFrame(gym.ObservationWrapper):
-    def _observation(self, observation):
+    def __init__(self, env):
+        gym.ObservationWrapper.__init__(self, env)
+
+    def observation(self, observation):
         # careful! This undoes the memory optimization, use
         # with smaller replay buffers only.
         return np.array(observation).astype(np.float32) / 255.0
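All of the wrapper edits above follow the same gym API change: the private _step/_reset/_observation/_reward hooks become public step/reset/observation/reward methods. A minimal wrapper written against the new convention (purely illustrative, not part of the diff):

```python
import gym

class EpisodeCounter(gym.Wrapper):
    """Counts finished episodes, using the new public step()/reset() hooks."""
    def __init__(self, env):
        gym.Wrapper.__init__(self, env)
        self.episodes = 0

    def reset(self, **kwargs):      # was _reset under older gym versions
        return self.env.reset(**kwargs)

    def step(self, action):         # was _step under older gym versions
        ob, rew, done, info = self.env.step(action)
        if done:
            self.episodes += 1
        return ob, rew, done, info
```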
baselines/common/cmd_util.py (new file, 64 lines)
@@ -0,0 +1,64 @@
+"""
+Helpers for scripts like run_atari.py.
+"""
+
+import os
+import gym
+from baselines import logger
+from baselines.bench import Monitor
+from baselines.common import set_global_seeds
+from baselines.common.atari_wrappers import make_atari, wrap_deepmind
+from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
+from mpi4py import MPI
+
+def make_atari_env(env_id, num_env, seed, wrapper_kwargs=None, start_index=0):
+    """
+    Create a wrapped, monitored SubprocVecEnv for Atari.
+    """
+    if wrapper_kwargs is None: wrapper_kwargs = {}
+    def make_env(rank): # pylint: disable=C0111
+        def _thunk():
+            env = make_atari(env_id)
+            env.seed(seed + rank)
+            env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
+            return wrap_deepmind(env, **wrapper_kwargs)
+        return _thunk
+    set_global_seeds(seed)
+    return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)])
+
+def make_mujoco_env(env_id, seed):
+    """
+    Create a wrapped, monitored gym.Env for MuJoCo.
+    """
+    set_global_seeds(seed)
+    env = gym.make(env_id)
+    env = Monitor(env, logger.get_dir())
+    env.seed(seed)
+    return env
+
+def arg_parser():
+    """
+    Create an empty argparse.ArgumentParser.
+    """
+    import argparse
+    return argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+def atari_arg_parser():
+    """
+    Create an argparse.ArgumentParser for run_atari.py.
+    """
+    parser = arg_parser()
+    parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
+    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
+    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
+    return parser
+
+def mujoco_arg_parser():
+    """
+    Create an argparse.ArgumentParser for run_mujoco.py.
+    """
+    parser = arg_parser()
+    parser.add_argument('--env', help='environment ID', type=str, default="Reacher-v1")
+    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
+    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
+    return parser
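cmd_util.py centralizes the env construction and argument parsing that the run scripts above previously did inline. A sketch of the intended call pattern, mirroring the updated acktr run_atari.py (num_env=8 is an arbitrary example value; the 4-frame stack comes from that script):

```python
from baselines.common.cmd_util import make_atari_env, atari_arg_parser
from baselines.common.vec_env.vec_frame_stack import VecFrameStack

args = atari_arg_parser().parse_args()
# 8 parallel, seeded, monitored, DeepMind-wrapped Atari envs, then 4-frame stacking.
env = VecFrameStack(make_atari_env(args.env, num_env=8, seed=args.seed), 4)
obs = env.reset()
# ... train on env ...
env.close()
```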
@@ -57,14 +57,12 @@ class CategoricalPdType(PdType):


 class MultiCategoricalPdType(PdType):
-    def __init__(self, low, high):
-        self.low = low
-        self.high = high
-        self.ncats = high - low + 1
+    def __init__(self, nvec):
+        self.ncats = nvec
     def pdclass(self):
         return MultiCategoricalPd
     def pdfromflat(self, flat):
-        return MultiCategoricalPd(self.low, self.high, flat)
+        return MultiCategoricalPd(self.ncats, flat)
     def param_shape(self):
         return [sum(self.ncats)]
     def sample_shape(self):
@@ -125,7 +123,7 @@ class CategoricalPd(Pd):
     def flatparam(self):
         return self.logits
     def mode(self):
-        return U.argmax(self.logits, axis=-1)
+        return tf.argmax(self.logits, axis=-1)
     def neglogp(self, x):
         # return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x)
         # Note: we can't use sparse_softmax_cross_entropy_with_logits because
@@ -135,20 +133,20 @@ class CategoricalPd(Pd):
             logits=self.logits,
             labels=one_hot_actions)
     def kl(self, other):
-        a0 = self.logits - U.max(self.logits, axis=-1, keepdims=True)
-        a1 = other.logits - U.max(other.logits, axis=-1, keepdims=True)
+        a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keep_dims=True)
+        a1 = other.logits - tf.reduce_max(other.logits, axis=-1, keep_dims=True)
         ea0 = tf.exp(a0)
         ea1 = tf.exp(a1)
-        z0 = U.sum(ea0, axis=-1, keepdims=True)
-        z1 = U.sum(ea1, axis=-1, keepdims=True)
+        z0 = tf.reduce_sum(ea0, axis=-1, keep_dims=True)
+        z1 = tf.reduce_sum(ea1, axis=-1, keep_dims=True)
         p0 = ea0 / z0
-        return U.sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=-1)
+        return tf.reduce_sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=-1)
     def entropy(self):
-        a0 = self.logits - U.max(self.logits, axis=-1, keepdims=True)
+        a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keep_dims=True)
         ea0 = tf.exp(a0)
-        z0 = U.sum(ea0, axis=-1, keepdims=True)
+        z0 = tf.reduce_sum(ea0, axis=-1, keep_dims=True)
         p0 = ea0 / z0
-        return U.sum(p0 * (tf.log(z0) - a0), axis=-1)
+        return tf.reduce_sum(p0 * (tf.log(z0) - a0), axis=-1)
     def sample(self):
         u = tf.random_uniform(tf.shape(self.logits))
         return tf.argmax(self.logits - tf.log(-tf.log(u)), axis=-1)
@@ -157,24 +155,21 @@ class CategoricalPd(Pd):
         return cls(flat)

 class MultiCategoricalPd(Pd):
-    def __init__(self, low, high, flat):
+    def __init__(self, nvec, flat):
         self.flat = flat
-        self.low = tf.constant(low, dtype=tf.int32)
-        self.categoricals = list(map(CategoricalPd, tf.split(flat, high - low + 1, axis=len(flat.get_shape()) - 1)))
+        self.categoricals = list(map(CategoricalPd, tf.split(flat, nvec, axis=-1)))
     def flatparam(self):
         return self.flat
     def mode(self):
-        return self.low + tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1), tf.int32)
+        return tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1), tf.int32)
     def neglogp(self, x):
-        return tf.add_n([p.neglogp(px) for p, px in zip(self.categoricals, tf.unstack(x - self.low, axis=len(x.get_shape()) - 1))])
+        return tf.add_n([p.neglogp(px) for p, px in zip(self.categoricals, tf.unstack(x, axis=-1))])
     def kl(self, other):
-        return tf.add_n([
-                p.kl(q) for p, q in zip(self.categoricals, other.categoricals)
-            ])
+        return tf.add_n([p.kl(q) for p, q in zip(self.categoricals, other.categoricals)])
     def entropy(self):
         return tf.add_n([p.entropy() for p in self.categoricals])
     def sample(self):
-        return self.low + tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1), tf.int32)
+        return tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1), tf.int32)
     @classmethod
     def fromflat(cls, flat):
         raise NotImplementedError
@@ -191,14 +186,14 @@ class DiagGaussianPd(Pd):
     def mode(self):
         return self.mean
     def neglogp(self, x):
-        return 0.5 * U.sum(tf.square((x - self.mean) / self.std), axis=-1) \
+        return 0.5 * tf.reduce_sum(tf.square((x - self.mean) / self.std), axis=-1) \
                + 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[-1]) \
-               + U.sum(self.logstd, axis=-1)
+               + tf.reduce_sum(self.logstd, axis=-1)
     def kl(self, other):
         assert isinstance(other, DiagGaussianPd)
-        return U.sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / (2.0 * tf.square(other.std)) - 0.5, axis=-1)
+        return tf.reduce_sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / (2.0 * tf.square(other.std)) - 0.5, axis=-1)
     def entropy(self):
-        return U.sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), axis=-1)
+        return tf.reduce_sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), axis=-1)
     def sample(self):
         return self.mean + self.std * tf.random_normal(tf.shape(self.mean))
     @classmethod
@@ -214,11 +209,11 @@ class BernoulliPd(Pd):
     def mode(self):
         return tf.round(self.ps)
     def neglogp(self, x):
-        return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=-1)
+        return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=-1)
     def kl(self, other):
-        return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=-1) - U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1)
+        return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=-1) - tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1)
     def entropy(self):
-        return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1)
+        return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1)
     def sample(self):
         u = tf.random_uniform(tf.shape(self.ps))
         return tf.to_float(math_ops.less(u, self.ps))
@@ -234,7 +229,7 @@ def make_pdtype(ac_space):
     elif isinstance(ac_space, spaces.Discrete):
         return CategoricalPdType(ac_space.n)
     elif isinstance(ac_space, spaces.MultiDiscrete):
-        return MultiCategoricalPdType(ac_space.low, ac_space.high)
+        return MultiCategoricalPdType(ac_space.nvec)
     elif isinstance(ac_space, spaces.MultiBinary):
         return BernoulliPdType(ac_space.n)
     else:
@@ -259,6 +254,11 @@ def test_probtypes():
     categorical = CategoricalPdType(pdparam_categorical.size) #pylint: disable=E1101
     validate_probtype(categorical, pdparam_categorical)

+    nvec = [1,2,3]
+    pdparam_multicategorical = np.array([-.2, .3, .5, .1, 1, -.1])
+    multicategorical = MultiCategoricalPdType(nvec) #pylint: disable=E1101
+    validate_probtype(multicategorical, pdparam_multicategorical)
+
     pdparam_bernoulli = np.array([-.2, .3, .5])
     bernoulli = BernoulliPdType(pdparam_bernoulli.size) #pylint: disable=E1101
     validate_probtype(bernoulli, pdparam_bernoulli)
@@ -270,10 +270,10 @@ def validate_probtype(probtype, pdparam):
     Mval = np.repeat(pdparam[None, :], N, axis=0)
     M = probtype.param_placeholder([N])
     X = probtype.sample_placeholder([N])
-    pd = probtype.pdclass()(M)
+    pd = probtype.pdfromflat(M)
     calcloglik = U.function([X, M], pd.logp(X))
     calcent = U.function([M], pd.entropy())
-    Xval = U.eval(pd.sample(), feed_dict={M:Mval})
+    Xval = tf.get_default_session().run(pd.sample(), feed_dict={M:Mval})
     logliks = calcloglik(Xval, Mval)
     entval_ll = - logliks.mean() #pylint: disable=E1101
     entval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101
@@ -282,7 +282,7 @@ def validate_probtype(probtype, pdparam):

     # Check to see if kldiv[p,q] = - ent[p] - E_p[log q]
     M2 = probtype.param_placeholder([N])
-    pd2 = probtype.pdclass()(M2)
+    pd2 = probtype.pdfromflat(M2)
     q = pdparam + np.random.randn(pdparam.size) * 0.1
     Mval2 = np.repeat(q[None, :], N, axis=0)
     calckl = U.function([M, M2], pd.kl(pd2))
@@ -291,3 +291,5 @@ def validate_probtype(probtype, pdparam):
     klval_ll = - entval - logliks.mean() #pylint: disable=E1101
     klval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101
     assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr # within 3 sigmas
+    print('ok on', probtype, pdparam)
+
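The MultiCategorical classes now take gym's nvec (number of choices per sub-action) instead of low/high bounds, matching newer MultiDiscrete spaces that always start at 0. A hedged usage sketch, assuming the module lives at baselines.common.distributions:

```python
from gym import spaces
from baselines.common.distributions import make_pdtype

ac_space = spaces.MultiDiscrete([2, 3, 4])  # three sub-actions with 2, 3 and 4 choices
pdtype = make_pdtype(ac_space)              # -> MultiCategoricalPdType(ac_space.nvec)
print(pdtype.param_shape())                 # [9]: the flat logits are sum(nvec) wide
```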
@@ -53,7 +53,7 @@ class MpiAdam(object):
 def test_MpiAdam():
     np.random.seed(0)
     tf.set_random_seed(0)

     a = tf.Variable(np.random.randn(3).astype('float32'))
     b = tf.Variable(np.random.randn(2,5).astype('float32'))
     loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b))
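test_MpiAdam (context above) only checks that the optimizer runs deterministically. The idea behind an MPI-aware Adam is to average the flattened gradient across ranks before each local update; a minimal, hypothetical sketch of that averaging step (not the MpiAdam API itself) is:

```python
from mpi4py import MPI
import numpy as np

def allreduce_mean(flat_grad, comm=MPI.COMM_WORLD):
    """Average a flat float gradient vector across all MPI ranks."""
    buf = np.zeros_like(flat_grad)
    comm.Allreduce(flat_grad, buf, op=MPI.SUM)
    return buf / comm.Get_size()
```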
@@ -2,29 +2,41 @@ from mpi4py import MPI
 import numpy as np
 from baselines.common import zipsame

-def mpi_moments(x, axis=0):
-    x = np.asarray(x, dtype='float64')
-    newshape = list(x.shape)
-    newshape.pop(axis)
-    n = np.prod(newshape,dtype=int)
-    totalvec = np.zeros(n*2+1, 'float64')
-    addvec = np.concatenate([x.sum(axis=axis).ravel(),
-        np.square(x).sum(axis=axis).ravel(),
-        np.array([x.shape[axis]],dtype='float64')])
-    MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM)
-    sum = totalvec[:n]
-    sumsq = totalvec[n:2*n]
-    count = totalvec[2*n]
-    if count == 0:
-        mean = np.empty(newshape); mean[:] = np.nan
-        std = np.empty(newshape); std[:] = np.nan
-    else:
-        mean = sum/count
-        std = np.sqrt(np.maximum(sumsq/count - np.square(mean),0))
+def mpi_mean(x, axis=0, comm=None, keepdims=False):
+    x = np.asarray(x)
+    assert x.ndim > 0
+    if comm is None: comm = MPI.COMM_WORLD
+    xsum = x.sum(axis=axis, keepdims=keepdims)
+    n = xsum.size
+    localsum = np.zeros(n+1, x.dtype)
+    localsum[:n] = xsum.ravel()
+    localsum[n] = x.shape[axis]
+    globalsum = np.zeros_like(localsum)
+    comm.Allreduce(localsum, globalsum, op=MPI.SUM)
+    return globalsum[:n].reshape(xsum.shape) / globalsum[n], globalsum[n]
+
+def mpi_moments(x, axis=0, comm=None, keepdims=False):
+    x = np.asarray(x)
+    assert x.ndim > 0
+    mean, count = mpi_mean(x, axis=axis, comm=comm, keepdims=True)
+    sqdiffs = np.square(x - mean)
+    meansqdiff, count1 = mpi_mean(sqdiffs, axis=axis, comm=comm, keepdims=True)
+    assert count1 == count
+    std = np.sqrt(meansqdiff)
+    if not keepdims:
+        newshape = mean.shape[:axis] + mean.shape[axis+1:]
+        mean = mean.reshape(newshape)
+        std = std.reshape(newshape)
     return mean, std, count


 def test_runningmeanstd():
+    import subprocess
+    subprocess.check_call(['mpirun', '-np', '3',
+        'python','-c',
+        'from baselines.common.mpi_moments import _helper_runningmeanstd; _helper_runningmeanstd()'])
+
+def _helper_runningmeanstd():
     comm = MPI.COMM_WORLD
     np.random.seed(0)
     for (triple,axis) in [
@@ -45,6 +57,3 @@ def test_runningmeanstd():
         assert np.allclose(a1, a2)
         print("ok!")

-if __name__ == "__main__":
-    #mpirun -np 3 python <script>
-    test_runningmeanstd()
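The rewritten mpi_mean packs each worker's per-axis sum plus its row count into one buffer and Allreduces it, so the global mean is simply the total sum over the total count. A NumPy-only sketch of the same bookkeeping for two hypothetical workers (no MPI needed to check the math):

```python
import numpy as np

x_worker0 = np.random.randn(3, 2)
x_worker1 = np.random.randn(5, 2)

# What each rank contributes: [per-column sum, number of rows].
local0 = np.concatenate([x_worker0.sum(axis=0), [x_worker0.shape[0]]])
local1 = np.concatenate([x_worker1.sum(axis=0), [x_worker1.shape[0]]])

globalsum = local0 + local1                 # what MPI.Allreduce(op=SUM) would produce
mean = globalsum[:-1] / globalsum[-1]

assert np.allclose(mean, np.concatenate([x_worker0, x_worker1]).mean(axis=0))
```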
@@ -57,7 +57,7 @@ def test_runningmeanstd():
         rms.update(x1)
         rms.update(x2)
         rms.update(x3)
-        ms2 = U.eval([rms.mean, rms.std])
+        ms2 = [rms.mean.eval(), rms.std.eval()]

         assert np.allclose(ms1, ms2)

@@ -94,11 +94,11 @@ def test_dist():

     assert checkallclose(
         bigvec.mean(axis=0),
-        U.eval(rms.mean)
+        rms.mean.eval(),
     )
     assert checkallclose(
         bigvec.std(axis=0),
-        U.eval(rms.std)
+        rms.std.eval(),
     )

@@ -6,12 +6,13 @@ class RunningMeanStd(object):
         self.var = np.ones(shape, 'float64')
         self.count = epsilon

     def update(self, x):
         batch_mean = np.mean(x, axis=0)
         batch_var = np.var(x, axis=0)
         batch_count = x.shape[0]
+        self.update_from_moments(batch_mean, batch_var, batch_count)
+
+    def update_from_moments(self, batch_mean, batch_var, batch_count):
         delta = batch_mean - self.mean
         tot_count = self.count + batch_count

@@ -25,4 +26,21 @@ class RunningMeanStd(object):

         self.mean = new_mean
         self.var = new_var
         self.count = new_count
+
+def test_runningmeanstd():
+    for (x1, x2, x3) in [
+        (np.random.randn(3), np.random.randn(4), np.random.randn(5)),
+        (np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)),
+        ]:
+
+        rms = RunningMeanStd(epsilon=0.0, shape=x1.shape[1:])
+
+        x = np.concatenate([x1, x2, x3], axis=0)
+        ms1 = [x.mean(axis=0), x.var(axis=0)]
+        rms.update(x1)
+        rms.update(x2)
+        rms.update(x3)
+        ms2 = [rms.mean, rms.var]
+
+        assert np.allclose(ms1, ms2)
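update() now defers to update_from_moments, which merges batch statistics into the running ones; test_runningmeanstd above checks that feeding three batches reproduces the moments of their concatenation. The merge follows the standard parallel-variance (Chan et al.) formulas; a NumPy sketch of that math (not a copy of the hidden lines of the class) is:

```python
import numpy as np

def merge_moments(mean_a, var_a, count_a, mean_b, var_b, count_b):
    # Combine two sets of (mean, variance, count) without revisiting the raw data.
    delta = mean_b - mean_a
    tot = count_a + count_b
    new_mean = mean_a + delta * count_b / tot
    m2 = var_a * count_a + var_b * count_b + np.square(delta) * count_a * count_b / tot
    return new_mean, m2 / tot, tot

x1, x2 = np.random.randn(4, 2), np.random.randn(6, 2)
m, v, c = merge_moments(x1.mean(0), x1.var(0), 4, x2.mean(0), x2.var(0), 6)
x = np.concatenate([x1, x2])
assert np.allclose(m, x.mean(0)) and np.allclose(v, x.var(0)) and c == 10
```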
@@ -3,30 +3,10 @@ import tensorflow as tf
 from baselines.common.tf_util import (
     function,
     initialize,
-    set_value,
     single_threaded_session
 )


-def test_set_value():
-    a = tf.Variable(42.)
-    with single_threaded_session():
-        set_value(a, 5)
-        assert a.eval() == 5
-        g = tf.get_default_graph()
-        g.finalize()
-        set_value(a, 6)
-        assert a.eval() == 6
-
-        # test the test
-        try:
-            assert a.eval() == 7
-        except AssertionError:
-            pass
-        else:
-            assert False, "assertion should have failed"
-
-
 def test_function():
     tf.reset_default_graph()
     x = tf.placeholder(tf.int32, (), name="x")
@@ -38,9 +18,7 @@ def test_function():
         initialize()

         assert lin(2) == 6
-        assert lin(x=3) == 9
         assert lin(2, 2) == 10
-        assert lin(x=2, y=3) == 12


 def test_multikwargs():
@@ -56,14 +34,8 @@ def test_multikwargs():
         assert lin(2) == 6
         assert lin(2, 2) == 10
         expt_caught = False
-        try:
-            lin(x=2)
-        except AssertionError:
-            expt_caught = True
-        assert expt_caught


 if __name__ == '__main__':
-    test_set_value()
     test_function()
     test_multikwargs()
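test_function above keeps only the positional-call assertions; the keyword-call checks were dropped along with set_value. A rough sketch of the setup those assertions imply, assuming tf_util.function's givens argument supplies the default y=0 (the hidden lines of the test are reconstructed for illustration, not quoted):

```python
import tensorflow as tf
from baselines.common.tf_util import function, initialize, single_threaded_session

x = tf.placeholder(tf.int32, (), name="x")
y = tf.placeholder(tf.int32, (), name="y")
lin = function([x, y], 3 * x + 2 * y, givens={y: 0})

with single_threaded_session():
    initialize()
    assert lin(2) == 6       # y falls back to the given default of 0
    assert lin(2, 2) == 10
```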
@@ -1,45 +1,10 @@
 import numpy as np
 import tensorflow as tf  # pylint: ignore-module
-import builtins
-import functools
 import copy
 import os
+import functools
 import collections
+import multiprocessing
-# ================================================================
-# Make consistent with numpy
-# ================================================================
-
-clip = tf.clip_by_value
-
-def sum(x, axis=None, keepdims=False):
-    axis = None if axis is None else [axis]
-    return tf.reduce_sum(x, axis=axis, keep_dims=keepdims)
-
-def mean(x, axis=None, keepdims=False):
-    axis = None if axis is None else [axis]
-    return tf.reduce_mean(x, axis=axis, keep_dims=keepdims)
-
-def var(x, axis=None, keepdims=False):
-    meanx = mean(x, axis=axis, keepdims=keepdims)
-    return mean(tf.square(x - meanx), axis=axis, keepdims=keepdims)
-
-def std(x, axis=None, keepdims=False):
-    return tf.sqrt(var(x, axis=axis, keepdims=keepdims))
-
-def max(x, axis=None, keepdims=False):
-    axis = None if axis is None else [axis]
-    return tf.reduce_max(x, axis=axis, keep_dims=keepdims)
-
-def min(x, axis=None, keepdims=False):
-    axis = None if axis is None else [axis]
-    return tf.reduce_min(x, axis=axis, keep_dims=keepdims)
-
-def concatenate(arrs, axis=0):
-    return tf.concat(axis=axis, values=arrs)
-
-def argmax(x, axis=None):
-    return tf.argmax(x, axis=axis)
-
 def switch(condition, then_expression, else_expression):
     """Switches between two operations depending on a scalar value (int or bool).
@@ -62,105 +27,11 @@ def switch(condition, then_expression, else_expression):
 # Extras
 # ================================================================

-def l2loss(params):
-    if len(params) == 0:
-        return tf.constant(0.0)
-    else:
-        return tf.add_n([sum(tf.square(p)) for p in params])
-
 def lrelu(x, leak=0.2):
     f1 = 0.5 * (1 + leak)
     f2 = 0.5 * (1 - leak)
     return f1 * x + f2 * abs(x)

-def categorical_sample_logits(X):
-    # https://github.com/tensorflow/tensorflow/issues/456
-    U = tf.random_uniform(tf.shape(X))
-    return argmax(X - tf.log(-tf.log(U)), axis=1)
-
-# ================================================================
-# Inputs
-# ================================================================
-
-def is_placeholder(x):
-    return type(x) is tf.Tensor and len(x.op.inputs) == 0
-
-class TfInput(object):
-    def __init__(self, name="(unnamed)"):
-        """Generalized Tensorflow placeholder. The main differences are:
-            - possibly uses multiple placeholders internally and returns multiple values
-            - can apply light postprocessing to the value feed to placeholder.
-        """
-        self.name = name
-
-    def get(self):
-        """Return the tf variable(s) representing the possibly postprocessed value
-        of placeholder(s).
-        """
-        raise NotImplemented()
-
-    def make_feed_dict(data):
-        """Given data input it to the placeholder(s)."""
-        raise NotImplemented()
-
-class PlacholderTfInput(TfInput):
-    def __init__(self, placeholder):
-        """Wrapper for regular tensorflow placeholder."""
-        super().__init__(placeholder.name)
-        self._placeholder = placeholder
-
-    def get(self):
-        return self._placeholder
-
-    def make_feed_dict(self, data):
-        return {self._placeholder: data}
-
-class BatchInput(PlacholderTfInput):
-    def __init__(self, shape, dtype=tf.float32, name=None):
-        """Creates a placeholder for a batch of tensors of a given shape and dtype
-
-        Parameters
-        ----------
-        shape: [int]
-            shape of a single elemenet of the batch
-        dtype: tf.dtype
-            number representation used for tensor contents
-        name: str
-            name of the underlying placeholder
-        """
-        super().__init__(tf.placeholder(dtype, [None] + list(shape), name=name))
-
-class Uint8Input(PlacholderTfInput):
-    def __init__(self, shape, name=None):
-        """Takes input in uint8 format which is cast to float32 and divided by 255
-        before passing it to the model.
-
-        On GPU this ensures lower data transfer times.
-
-        Parameters
-        ----------
-        shape: [int]
-            shape of the tensor.
-        name: str
-            name of the underlying placeholder
-        """
-
-        super().__init__(tf.placeholder(tf.uint8, [None] + list(shape), name=name))
-        self._shape = shape
-        self._output = tf.cast(super().get(), tf.float32) / 255.0
-
-    def get(self):
-        return self._output
-
-def ensure_tf_input(thing):
-    """Takes either tf.placeholder of TfInput and outputs equivalent TfInput"""
-    if isinstance(thing, TfInput):
-        return thing
-    elif is_placeholder(thing):
-        return PlacholderTfInput(thing)
-    else:
-        raise ValueError("Must be a placeholder or TfInput")
-
 # ================================================================
 # Mathematical utils
 # ================================================================
@@ -173,96 +44,42 @@ def huber_loss(x, delta=1.0):
         delta * (tf.abs(x) - 0.5 * delta)
     )

-def logsigmoid(a):
-    '''Equivalent to tf.log(tf.sigmoid(a))'''
-    return -tf.nn.softplus(-a)
-
-""" Reference: https://github.com/openai/imitation/blob/99fbccf3e060b6e6c739bdf209758620fcdefd3c/policyopt/thutil.py#L48-L51"""
-def logit_bernoulli_entropy(logits):
-    ent = (1.-tf.nn.sigmoid(logits))*logits - logsigmoid(logits)
-    return ent
# ================================================================
|
|
||||||
# Optimizer utils
|
|
||||||
# ================================================================
|
|
||||||
|
|
||||||
def minimize_and_clip(optimizer, objective, var_list, clip_val=10):
|
|
||||||
"""Minimized `objective` using `optimizer` w.r.t. variables in
|
|
||||||
`var_list` while ensure the norm of the gradients for each
|
|
||||||
variable is clipped to `clip_val`
|
|
||||||
"""
|
|
||||||
gradients = optimizer.compute_gradients(objective, var_list=var_list)
|
|
||||||
for i, (grad, var) in enumerate(gradients):
|
|
||||||
if grad is not None:
|
|
||||||
gradients[i] = (tf.clip_by_norm(grad, clip_val), var)
|
|
||||||
return optimizer.apply_gradients(gradients)
|
|
||||||
|
|
||||||
# ================================================================
|
# ================================================================
|
||||||
# Global session
|
# Global session
|
||||||
# ================================================================
|
# ================================================================
|
||||||
|
|
||||||
def get_session():
|
def make_session(num_cpu=None, make_default=False):
|
||||||
"""Returns recently made Tensorflow session"""
|
|
||||||
return tf.get_default_session()
|
|
||||||
|
|
||||||
def make_session(num_cpu):
|
|
||||||
"""Returns a session that will use <num_cpu> CPU's only"""
|
"""Returns a session that will use <num_cpu> CPU's only"""
|
||||||
|
if num_cpu is None:
|
||||||
|
num_cpu = int(os.getenv('RCALL_NUM_CPU', multiprocessing.cpu_count()))
|
||||||
tf_config = tf.ConfigProto(
|
tf_config = tf.ConfigProto(
|
||||||
inter_op_parallelism_threads=num_cpu,
|
inter_op_parallelism_threads=num_cpu,
|
||||||
intra_op_parallelism_threads=num_cpu)
|
intra_op_parallelism_threads=num_cpu)
|
||||||
return tf.Session(config=tf_config)
|
tf_config.gpu_options.allocator_type = 'BFC'
|
||||||
|
if make_default:
|
||||||
|
return tf.InteractiveSession(config=tf_config)
|
||||||
|
else:
|
||||||
|
return tf.Session(config=tf_config)
|
||||||
|
|
||||||
def single_threaded_session():
|
def single_threaded_session():
|
||||||
"""Returns a session which will only use a single CPU"""
|
"""Returns a session which will only use a single CPU"""
|
||||||
return make_session(1)
|
return make_session(num_cpu=1)
|
||||||
|
|
||||||
|
def in_session(f):
|
||||||
|
@functools.wraps(f)
|
||||||
|
def newfunc(*args, **kwargs):
|
||||||
|
with tf.Session():
|
||||||
|
f(*args, **kwargs)
|
||||||
|
return newfunc
|
||||||
|
|
||||||
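Illustrative sketch (not part of the commit): how the reworked session helper is meant to be called; with num_cpu=None the thread cap falls back to the RCALL_NUM_CPU environment variable and then to multiprocessing.cpu_count().

from baselines.common import tf_util as U

# Cap TensorFlow at 4 intra/inter-op threads; make_default=True returns a
# tf.InteractiveSession, which installs itself as the default session.
sess = U.make_session(num_cpu=4, make_default=True)
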
ALREADY_INITIALIZED = set()

def initialize():
    """Initialize all the uninitialized variables in the global scope."""
    new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED
-    get_session().run(tf.variables_initializer(new_variables))
+    tf.get_default_session().run(tf.variables_initializer(new_variables))
    ALREADY_INITIALIZED.update(new_variables)

-def eval(expr, feed_dict=None):
-    if feed_dict is None:
-        feed_dict = {}
-    return get_session().run(expr, feed_dict=feed_dict)

-VALUE_SETTERS = collections.OrderedDict()

-def set_value(v, val):
-    global VALUE_SETTERS
-    if v in VALUE_SETTERS:
-        set_op, set_endpoint = VALUE_SETTERS[v]
-    else:
-        set_endpoint = tf.placeholder(v.dtype)
-        set_op = v.assign(set_endpoint)
-        VALUE_SETTERS[v] = (set_op, set_endpoint)
-    get_session().run(set_op, feed_dict={set_endpoint: val})

-# ================================================================
-# Save tensorflow summary
-# ================================================================

-def file_writer(dir_path):
-    os.makedirs(dir_path, exist_ok=True)
-    return tf.summary.FileWriter(dir_path, get_session().graph)


-# ================================================================
-# Saving variables
-# ================================================================

-def load_state(fname, var_list=None):
-    saver = tf.train.Saver(var_list=var_list)
-    saver.restore(get_session(), fname)

-def save_state(fname, var_list=None):
-    os.makedirs(os.path.dirname(fname), exist_ok=True)
-    saver = tf.train.Saver(var_list=var_list)
-    saver.save(get_session(), fname)

# ================================================================
# Model components
# ================================================================
@@ -303,36 +120,6 @@ def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME",

    return tf.nn.conv2d(x, w, stride_shape, pad) + b

-def dense(x, size, name, weight_init=None, bias=True):
-    w = tf.get_variable(name + "/w", [x.get_shape()[1], size], initializer=weight_init)
-    ret = tf.matmul(x, w)
-    if bias:
-        b = tf.get_variable(name + "/b", [size], initializer=tf.zeros_initializer())
-        return ret + b
-    else:
-        return ret

-def wndense(x, size, name, init_scale=1.0):
-    v = tf.get_variable(name + "/V", [int(x.get_shape()[1]), size],
-                        initializer=tf.random_normal_initializer(0, 0.05))
-    g = tf.get_variable(name + "/g", [size], initializer=tf.constant_initializer(init_scale))
-    b = tf.get_variable(name + "/b", [size], initializer=tf.constant_initializer(0.0))
-
-    # use weight normalization (Salimans & Kingma, 2016)
-    x = tf.matmul(x, v)
-    scaler = g / tf.sqrt(sum(tf.square(v), axis=0, keepdims=True))
-    return tf.reshape(scaler, [1, size]) * x + tf.reshape(b, [1, size])

-def densenobias(x, size, name, weight_init=None):
-    return dense(x, size, name, weight_init=weight_init, bias=False)

-def dropout(x, pkeep, phase=None, mask=None):
-    mask = tf.floor(pkeep + tf.random_uniform(tf.shape(x))) if mask is None else mask
-    if phase is None:
-        return mask * x
-    else:
-        return switch(phase, mask * x, pkeep * x)

# ================================================================
# Theano-like Function
# ================================================================
@@ -362,7 +149,7 @@ def function(inputs, outputs, updates=None, givens=None):

    Parameters
    ----------
-    inputs: [tf.placeholder or TfInput]
+    inputs: [tf.placeholder, tf.constant, or object with make_feed_dict method]
        list of input arguments
    outputs: [tf.Variable] or tf.Variable
        list of outputs or a single output to be returned from function. Returned
@@ -377,183 +164,36 @@ def function(inputs, outputs, updates=None, givens=None):
        f = _Function(inputs, [outputs], updates, givens=givens)
        return lambda *args, **kwargs: f(*args, **kwargs)[0]


class _Function(object):
-    def __init__(self, inputs, outputs, updates, givens, check_nan=False):
+    def __init__(self, inputs, outputs, updates, givens):
        for inpt in inputs:
-            if not issubclass(type(inpt), TfInput):
-                assert len(inpt.op.inputs) == 0, "inputs should all be placeholders of baselines.common.TfInput"
+            if not hasattr(inpt, 'make_feed_dict') and not (type(inpt) is tf.Tensor and len(inpt.op.inputs) == 0):
+                assert False, "inputs should all be placeholders, constants, or have a make_feed_dict method"
        self.inputs = inputs
        updates = updates or []
        self.update_group = tf.group(*updates)
        self.outputs_update = list(outputs) + [self.update_group]
        self.givens = {} if givens is None else givens
-        self.check_nan = check_nan

    def _feed_input(self, feed_dict, inpt, value):
-        if issubclass(type(inpt), TfInput):
+        if hasattr(inpt, 'make_feed_dict'):
            feed_dict.update(inpt.make_feed_dict(value))
-        elif is_placeholder(inpt):
+        else:
            feed_dict[inpt] = value

-    def __call__(self, *args, **kwargs):
+    def __call__(self, *args):
        assert len(args) <= len(self.inputs), "Too many arguments provided"
        feed_dict = {}
        # Update the args
        for inpt, value in zip(self.inputs, args):
            self._feed_input(feed_dict, inpt, value)
-        # Update the kwargs
-        kwargs_passed_inpt_names = set()
-        for inpt in self.inputs[len(args):]:
-            inpt_name = inpt.name.split(':')[0]
-            inpt_name = inpt_name.split('/')[-1]
-            assert inpt_name not in kwargs_passed_inpt_names, \
-                "this function has two arguments with the same name \"{}\", so kwargs cannot be used.".format(inpt_name)
-            if inpt_name in kwargs:
-                kwargs_passed_inpt_names.add(inpt_name)
-                self._feed_input(feed_dict, inpt, kwargs.pop(inpt_name))
-            else:
-                assert inpt in self.givens, "Missing argument " + inpt_name
-        assert len(kwargs) == 0, "Function got extra arguments " + str(list(kwargs.keys()))
        # Update feed dict with givens.
        for inpt in self.givens:
            feed_dict[inpt] = feed_dict.get(inpt, self.givens[inpt])
-        results = get_session().run(self.outputs_update, feed_dict=feed_dict)[:-1]
+        results = tf.get_default_session().run(self.outputs_update, feed_dict=feed_dict)[:-1]
-        if self.check_nan:
-            if any(np.isnan(r).any() for r in results):
-                raise RuntimeError("Nan detected")
        return results

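Illustrative sketch (not part of the commit), mirroring the docstring of function(): plain placeholders go in and an ordinary Python callable comes out; after this change arguments are positional only.

import tensorflow as tf
from baselines.common import tf_util as U

x = tf.placeholder(tf.int32, (), name="x")
y = tf.placeholder(tf.int32, (), name="y")
z = 3 * x + 2 * y
lin = U.function([x, y], z, givens={y: 0})

with U.single_threaded_session():
    U.initialize()
    assert lin(2) == 6        # y falls back to its given value 0
    assert lin(2, 2) == 10    # both inputs supplied positionally
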
-def mem_friendly_function(nondata_inputs, data_inputs, outputs, batch_size):
-    if isinstance(outputs, list):
-        return _MemFriendlyFunction(nondata_inputs, data_inputs, outputs, batch_size)
-    else:
-        f = _MemFriendlyFunction(nondata_inputs, data_inputs, [outputs], batch_size)
-        return lambda *inputs: f(*inputs)[0]

-class _MemFriendlyFunction(object):
-    def __init__(self, nondata_inputs, data_inputs, outputs, batch_size):
-        self.nondata_inputs = nondata_inputs
-        self.data_inputs = data_inputs
-        self.outputs = list(outputs)
-        self.batch_size = batch_size

-    def __call__(self, *inputvals):
-        assert len(inputvals) == len(self.nondata_inputs) + len(self.data_inputs)
-        nondata_vals = inputvals[0:len(self.nondata_inputs)]
-        data_vals = inputvals[len(self.nondata_inputs):]
-        feed_dict = dict(zip(self.nondata_inputs, nondata_vals))
-        n = data_vals[0].shape[0]
-        for v in data_vals[1:]:
-            assert v.shape[0] == n
-        for i_start in range(0, n, self.batch_size):
-            slice_vals = [v[i_start:builtins.min(i_start + self.batch_size, n)] for v in data_vals]
-            for (var, val) in zip(self.data_inputs, slice_vals):
-                feed_dict[var] = val
-            results = tf.get_default_session().run(self.outputs, feed_dict=feed_dict)
-            if i_start == 0:
-                sum_results = results
-            else:
-                for i in range(len(results)):
-                    sum_results[i] = sum_results[i] + results[i]
-        for i in range(len(results)):
-            sum_results[i] = sum_results[i] / n
-        return sum_results

-# ================================================================
-# Modules
-# ================================================================

-class Module(object):
-    def __init__(self, name):
-        self.name = name
-        self.first_time = True
-        self.scope = None
-        self.cache = {}

-    def __call__(self, *args):
-        if args in self.cache:
-            print("(%s) retrieving value from cache" % (self.name,))
-            return self.cache[args]
-        with tf.variable_scope(self.name, reuse=not self.first_time):
-            scope = tf.get_variable_scope().name
-            if self.first_time:
-                self.scope = scope
-                print("(%s) running function for the first time" % (self.name,))
-            else:
-                assert self.scope == scope, "Tried calling function with a different scope"
-                print("(%s) running function on new inputs" % (self.name,))
-            self.first_time = False
-            out = self._call(*args)
-        self.cache[args] = out
-        return out

-    def _call(self, *args):
-        raise NotImplementedError

-    @property
-    def trainable_variables(self):
-        assert self.scope is not None, "need to call module once before getting variables"
-        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)

-    @property
-    def variables(self):
-        assert self.scope is not None, "need to call module once before getting variables"
-        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

-def module(name):
-    @functools.wraps
-    def wrapper(f):
-        class WrapperModule(Module):
-            def _call(self, *args):
-                return f(*args)
-        return WrapperModule(name)
-    return wrapper

-# ================================================================
-# Graph traversal
-# ================================================================

-VARIABLES = {}

-def get_parents(node):
-    return node.op.inputs

-def topsorted(outputs):
-    """
-    Topological sort via non-recursive depth-first search
-    """
-    assert isinstance(outputs, (list, tuple))
-    marks = {}
-    out = []
-    stack = []  # pylint: disable=W0621
-    # i: node
-    # jidx = number of children visited so far from that node
-    # marks: state of each node, which is one of
-    #   0: haven't visited
-    #   1: have visited, but not done visiting children
-    #   2: done visiting children
-    for x in outputs:
-        stack.append((x, 0))
-    while stack:
-        (i, jidx) = stack.pop()
-        if jidx == 0:
-            m = marks.get(i, 0)
-            if m == 0:
-                marks[i] = 1
-            elif m == 1:
-                raise ValueError("not a dag")
-            else:
-                continue
-        ps = get_parents(i)
-        if jidx == len(ps):
-            marks[i] = 2
-            out.append(i)
-        else:
-            stack.append((i, jidx + 1))
-            j = ps[jidx]
-            stack.append((j, 0))
-    return out

# ================================================================
# Flat vectors
# ================================================================
@@ -595,88 +235,14 @@ class SetFromFlat(object):
        self.op = tf.group(*assigns)

    def __call__(self, theta):
-        get_session().run(self.op, feed_dict={self.theta: theta})
+        tf.get_default_session().run(self.op, feed_dict={self.theta: theta})

class GetFlat(object):
    def __init__(self, var_list):
        self.op = tf.concat(axis=0, values=[tf.reshape(v, [numel(v)]) for v in var_list])

    def __call__(self):
-        return get_session().run(self.op)
+        return tf.get_default_session().run(self.op)

-# ================================================================
-# Misc
-# ================================================================

-def fancy_slice_2d(X, inds0, inds1):
-    """
-    like numpy X[inds0, inds1]
-    XXX this implementation is bad
-    """
-    inds0 = tf.cast(inds0, tf.int64)
-    inds1 = tf.cast(inds1, tf.int64)
-    shape = tf.cast(tf.shape(X), tf.int64)
-    ncols = shape[1]
-    Xflat = tf.reshape(X, [-1])
-    return tf.gather(Xflat, inds0 * ncols + inds1)

-# ================================================================
-# Scopes
-# ================================================================

-def scope_vars(scope, trainable_only=False):
-    """
-    Get variables inside a scope
-    The scope can be specified as a string
-
-    Parameters
-    ----------
-    scope: str or VariableScope
-        scope in which the variables reside.
-    trainable_only: bool
-        whether or not to return only the variables that were marked as trainable.
-
-    Returns
-    -------
-    vars: [tf.Variable]
-        list of variables in `scope`.
-    """
-    return tf.get_collection(
-        tf.GraphKeys.TRAINABLE_VARIABLES if trainable_only else tf.GraphKeys.GLOBAL_VARIABLES,
-        scope=scope if isinstance(scope, str) else scope.name
-    )

-def scope_name():
-    """Returns the name of current scope as a string, e.g. deepq/q_func"""
-    return tf.get_variable_scope().name

-def absolute_scope_name(relative_scope_name):
-    """Appends parent scope name to `relative_scope_name`"""
-    return scope_name() + "/" + relative_scope_name

-def lengths_to_mask(lengths_b, max_length):
-    """
-    Turns a vector of lengths into a boolean mask
-
-    Args:
-        lengths_b: an integer vector of lengths
-        max_length: maximum length to fill the mask
-
-    Returns:
-        a boolean array of shape (batch_size, max_length)
-        row[i] consists of True repeated lengths_b[i] times, followed by False
-    """
-    lengths_b = tf.convert_to_tensor(lengths_b)
-    assert lengths_b.get_shape().ndims == 1
-    mask_bt = tf.expand_dims(tf.range(max_length), 0) < tf.expand_dims(lengths_b, 1)
-    return mask_bt

-def in_session(f):
-    @functools.wraps(f)
-    def newfunc(*args, **kwargs):
-        with tf.Session():
-            f(*args, **kwargs)
-    return newfunc

_PLACEHOLDER_CACHE = {}  # name -> (placeholder, dtype, shape)

@@ -695,10 +261,3 @@ def get_placeholder_cached(name):

def flattenallbut0(x):
    return tf.reshape(x, [-1, intprod(x.get_shape().as_list()[1:])])

-def reset():
-    global _PLACEHOLDER_CACHE
-    global VARIABLES
-    _PLACEHOLDER_CACHE = {}
-    VARIABLES = {}
-    tf.reset_default_graph()

@@ -1,19 +1,119 @@
-class VecEnv(object):
-    """
-    Vectorized environment base class
-    """
-    def step(self, vac):
-        """
-        Apply sequence of actions to sequence of environments
-        actions -> (observations, rewards, news)
-
-        where 'news' is a boolean vector indicating whether each element is new.
-        """
-        raise NotImplementedError
+from abc import ABC, abstractmethod
+from baselines import logger
+
+class AlreadySteppingError(Exception):
+    """
+    Raised when an asynchronous step is running while
+    step_async() is called again.
+    """
+    def __init__(self):
+        msg = 'already running an async step'
+        Exception.__init__(self, msg)
+
+class NotSteppingError(Exception):
+    """
+    Raised when an asynchronous step is not running but
+    step_wait() is called.
+    """
+    def __init__(self):
+        msg = 'not running an async step'
+        Exception.__init__(self, msg)
+
+class VecEnv(ABC):
+
+    def __init__(self, num_envs, observation_space, action_space):
+        self.num_envs = num_envs
+        self.observation_space = observation_space
+        self.action_space = action_space
+
+    """
+    An abstract asynchronous, vectorized environment.
+    """
+    @abstractmethod
    def reset(self):
        """
-        Reset all environments
+        Reset all the environments and return an array of
+        observations.
+
+        If step_async is still doing work, that work will
+        be cancelled and step_wait() should not be called
+        until step_async() is invoked again.
        """
-        raise NotImplementedError
+        pass
+
+    @abstractmethod
+    def step_async(self, actions):
+        """
+        Tell all the environments to start taking a step
+        with the given actions.
+        Call step_wait() to get the results of the step.
+
+        You should not call this if a step_async run is
+        already pending.
+        """
+        pass
+
+    @abstractmethod
+    def step_wait(self):
+        """
+        Wait for the step taken with step_async().
+
+        Returns (obs, rews, dones, infos):
+         - obs: an array of observations
+         - rews: an array of rewards
+         - dones: an array of "episode done" booleans
+         - infos: an array of info objects
+        """
+        pass
+
+    @abstractmethod
    def close(self):
-        pass
+        """
+        Clean up the environments' resources.
+        """
+        pass
+
+    def step(self, actions):
+        self.step_async(actions)
+        return self.step_wait()
+
+    def render(self):
+        logger.warn('Render not defined for %s'%self)
+
+class VecEnvWrapper(VecEnv):
+    def __init__(self, venv, observation_space=None, action_space=None):
+        self.venv = venv
+        VecEnv.__init__(self,
+            num_envs=venv.num_envs,
+            observation_space=observation_space or venv.observation_space,
+            action_space=action_space or venv.action_space)
+
+    def step_async(self, actions):
+        self.venv.step_async(actions)
+
+    @abstractmethod
+    def reset(self):
+        pass
+
+    @abstractmethod
+    def step_wait(self):
+        pass
+
+    def close(self):
+        return self.venv.close()
+
+    def render(self):
+        self.venv.render()
+
+class CloudpickleWrapper(object):
+    """
+    Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle)
+    """
+    def __init__(self, x):
+        self.x = x
+    def __getstate__(self):
+        import cloudpickle
+        return cloudpickle.dumps(self.x)
+    def __setstate__(self, ob):
+        import pickle
+        self.x = pickle.loads(ob)
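Illustrative sketch (not part of the commit), assuming venv is any VecEnv instance: step() is just the asynchronous pair composed, so synchronous callers keep working.

venv.step_async(actions)                      # actions: one action per sub-environment
obs, rews, dones, infos = venv.step_wait()
# equivalent shorthand provided by the base class:
obs, rews, dones, infos = venv.step(actions)
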
@@ -4,22 +4,28 @@ from . import VecEnv
class DummyVecEnv(VecEnv):
    def __init__(self, env_fns):
        self.envs = [fn() for fn in env_fns]
        env = self.envs[0]
-        self.action_space = env.action_space
-        self.observation_space = env.observation_space
+        VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space)
        self.ts = np.zeros(len(self.envs), dtype='int')
-    def step(self, action_n):
-        results = [env.step(a) for (a,env) in zip(action_n, self.envs)]
+        self.actions = None
+
+    def step_async(self, actions):
+        self.actions = actions
+
+    def step_wait(self):
+        results = [env.step(a) for (a,env) in zip(self.actions, self.envs)]
        obs, rews, dones, infos = map(np.array, zip(*results))
        self.ts += 1
        for (i, done) in enumerate(dones):
            if done:
                obs[i] = self.envs[i].reset()
                self.ts[i] = 0
+        self.actions = None
        return np.array(obs), np.array(rews), np.array(dones), infos

    def reset(self):
        results = [env.reset() for env in self.envs]
        return np.array(results)
-    @property
-    def num_envs(self):
-        return len(self.envs)
+    def close(self):
+        return
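Illustrative sketch (not part of the commit): DummyVecEnv takes a list of thunks so environment construction is deferred; hypothetical example with gym's CartPole.

import gym
import numpy as np
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

venv = DummyVecEnv([lambda: gym.make('CartPole-v0') for _ in range(4)])
obs = venv.reset()                       # shape (4,) + observation_space.shape
actions = np.array([venv.action_space.sample() for _ in range(venv.num_envs)])
obs, rews, dones, infos = venv.step(actions)
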
@@ -1,6 +1,6 @@
import numpy as np
from multiprocessing import Process, Pipe
-from baselines.common.vec_env import VecEnv
+from baselines.common.vec_env import VecEnv, CloudpickleWrapper


def worker(remote, parent_remote, env_fn_wrapper):
@@ -23,30 +23,17 @@ def worker(remote, parent_remote, env_fn_wrapper):
            remote.close()
            break
        elif cmd == 'get_spaces':
-            remote.send((env.action_space, env.observation_space))
+            remote.send((env.observation_space, env.action_space))
        else:
            raise NotImplementedError


-class CloudpickleWrapper(object):
-    """
-    Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle)
-    """
-    def __init__(self, x):
-        self.x = x
-    def __getstate__(self):
-        import cloudpickle
-        return cloudpickle.dumps(self.x)
-    def __setstate__(self, ob):
-        import pickle
-        self.x = pickle.loads(ob)


class SubprocVecEnv(VecEnv):
-    def __init__(self, env_fns):
+    def __init__(self, env_fns, spaces=None):
        """
        envs: list of gym environments to run in subprocesses
        """
+        self.waiting = False
        self.closed = False
        nenvs = len(env_fns)
        self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)])
@@ -59,13 +46,17 @@ class SubprocVecEnv(VecEnv):
            remote.close()

        self.remotes[0].send(('get_spaces', None))
-        self.action_space, self.observation_space = self.remotes[0].recv()
+        observation_space, action_space = self.remotes[0].recv()
+        VecEnv.__init__(self, len(env_fns), observation_space, action_space)

-    def step(self, actions):
+    def step_async(self, actions):
        for remote, action in zip(self.remotes, actions):
            remote.send(('step', action))
+        self.waiting = True
+
+    def step_wait(self):
        results = [remote.recv() for remote in self.remotes]
+        self.waiting = False
        obs, rews, dones, infos = zip(*results)
        return np.stack(obs), np.stack(rews), np.stack(dones), infos

@@ -82,13 +73,11 @@ class SubprocVecEnv(VecEnv):
    def close(self):
        if self.closed:
            return
+        if self.waiting:
+            for remote in self.remotes:
+                remote.recv()
        for remote in self.remotes:
            remote.send(('close', None))
        for p in self.ps:
            p.join()
        self.closed = True

-    @property
-    def num_envs(self):
-        return len(self.remotes)
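Illustrative sketch (not part of the commit): SubprocVecEnv takes the same kind of thunks; each one is shipped to a worker process wrapped in CloudpickleWrapper so closures survive pickling. The environment id here is only a placeholder.

import gym
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

def make_env(seed):
    def _thunk():
        env = gym.make('PongNoFrameskip-v4')  # any registered env id
        env.seed(seed)
        return env
    return _thunk

venv = SubprocVecEnv([make_env(i) for i in range(8)])
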
@@ -1,8 +1,8 @@
-from baselines.common.vec_env import VecEnv
+from baselines.common.vec_env import VecEnvWrapper
import numpy as np
from gym import spaces

-class VecFrameStack(VecEnv):
+class VecFrameStack(VecEnvWrapper):
    """
    Vectorized environment base class
    """
@@ -13,22 +13,18 @@ class VecFrameStack(VecEnv):
        low = np.repeat(wos.low, self.nstack, axis=-1)
        high = np.repeat(wos.high, self.nstack, axis=-1)
        self.stackedobs = np.zeros((venv.num_envs,)+low.shape, low.dtype)
-        self._observation_space = spaces.Box(low=low, high=high)
-        self._action_space = venv.action_space
-    def step(self, vac):
-        """
-        Apply sequence of actions to sequence of environments
-        actions -> (observations, rewards, news)
-
-        where 'news' is a boolean vector indicating whether each element is new.
-        """
-        obs, rews, news, infos = self.venv.step(vac)
+        observation_space = spaces.Box(low=low, high=high, dtype=venv.observation_space.dtype)
+        VecEnvWrapper.__init__(self, venv, observation_space=observation_space)
+
+    def step_wait(self):
+        obs, rews, news, infos = self.venv.step_wait()
        self.stackedobs = np.roll(self.stackedobs, shift=-1, axis=-1)
        for (i, new) in enumerate(news):
            if new:
                self.stackedobs[i] = 0
        self.stackedobs[..., -obs.shape[-1]:] = obs
        return self.stackedobs, rews, news, infos

    def reset(self):
        """
        Reset all environments
@@ -37,14 +33,6 @@ class VecFrameStack(VecEnv):
        self.stackedobs[...] = 0
        self.stackedobs[..., -obs.shape[-1]:] = obs
        return self.stackedobs
-    @property
-    def action_space(self):
-        return self._action_space
-    @property
-    def observation_space(self):
-        return self._observation_space
    def close(self):
        self.venv.close()
-    @property
-    def num_envs(self):
-        return self.venv.num_envs
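Illustrative sketch (not part of the commit), assuming the constructor signature VecFrameStack(venv, nstack) from the surrounding file: stacking k frames multiplies the last observation dimension by k.

from baselines.common.vec_env.vec_frame_stack import VecFrameStack

stacked = VecFrameStack(venv, 4)
# if venv observations have shape (84, 84, 1), stacked.observation_space.shape is (84, 84, 4)
obs = stacked.reset()
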
@@ -1,104 +1,47 @@
-from baselines.common.vec_env import VecEnv
+from baselines.common.vec_env import VecEnvWrapper
from baselines.common.running_mean_std import RunningMeanStd
import numpy as np

-class VecNormalize(VecEnv):
+class VecNormalize(VecEnvWrapper):
    """
    Vectorized environment base class
    """
    def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
-        self.venv = venv
-        self._observation_space = self.venv.observation_space
-        self._action_space = venv.action_space
-        self.ob_rms = RunningMeanStd(shape=self._observation_space.shape) if ob else None
+        VecEnvWrapper.__init__(self, venv)
+        self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon
-    def step(self, vac):
+
+    def step_wait(self):
        """
        Apply sequence of actions to sequence of environments
        actions -> (observations, rewards, news)

        where 'news' is a boolean vector indicating whether each element is new.
        """
-        obs, rews, news, infos = self.venv.step(vac)
+        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew)
        return obs, rews, news, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob)
            return obs
        else:
            return obs

    def reset(self):
        """
        Reset all environments
        """
        obs = self.venv.reset()
        return self._obfilt(obs)
-    @property
-    def action_space(self):
-        return self._action_space
-    @property
-    def observation_space(self):
-        return self._observation_space
-    def close(self):
-        self.venv.close()
-    @property
-    def num_envs(self):
-        return self.venv.num_envs
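Conceptual sketch (not part of the commit) of the filtering VecNormalize performs above: observations are standardized with running statistics and clipped, rewards are scaled by the running standard deviation of the discounted return.

obs  = np.clip((obs - ob_rms.mean) / np.sqrt(ob_rms.var + epsilon), -clipob, clipob)
rews = np.clip(rews / np.sqrt(ret_rms.var + epsilon), -cliprew, cliprew)
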
class RunningMeanStd(object):
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
    def __init__(self, epsilon=1e-4, shape=()):
        self.mean = np.zeros(shape, 'float64')
        self.var = np.zeros(shape, 'float64')
        self.count = epsilon

    def update(self, x):
        batch_mean = np.mean(x, axis=0)
        batch_var = np.var(x, axis=0)
        batch_count = x.shape[0]

        delta = batch_mean - self.mean
        tot_count = self.count + batch_count

        new_mean = self.mean + delta * batch_count / tot_count
        m_a = self.var * (self.count)
        m_b = batch_var * (batch_count)
        M2 = m_a + m_b + np.square(delta) * self.count * batch_count / (self.count + batch_count)
        new_var = M2 / (self.count + batch_count)

        new_count = batch_count + self.count

        self.mean = new_mean
        self.var = new_var
        self.count = new_count

def test_runningmeanstd():
    for (x1, x2, x3) in [
        (np.random.randn(3), np.random.randn(4), np.random.randn(5)),
        (np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)),
        ]:

        rms = RunningMeanStd(epsilon=0.0, shape=x1.shape[1:])

        x = np.concatenate([x1, x2, x3], axis=0)
        ms1 = [x.mean(axis=0), x.var(axis=0)]
        rms.update(x1)
        rms.update(x2)
        rms.update(x3)
        ms2 = [rms.mean, rms.var]

        assert np.allclose(ms1, ms2)
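Illustrative check (not part of the commit): the update above is the parallel variance formula, so merging per-batch statistics must match the statistics of the concatenated data, which is exactly what test_runningmeanstd asserts.

import numpy as np
from baselines.common.running_mean_std import RunningMeanStd

rms = RunningMeanStd(epsilon=0.0, shape=(2,))
a, b = np.random.randn(10, 2), np.random.randn(7, 2)
rms.update(a)
rms.update(b)
x = np.concatenate([a, b], axis=0)
assert np.allclose(rms.mean, x.mean(axis=0))
assert np.allclose(rms.var, x.var(axis=0))
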
@@ -9,8 +9,7 @@ from baselines import logger
from baselines.common.mpi_adam import MpiAdam
import baselines.common.tf_util as U
from baselines.common.mpi_running_mean_std import RunningMeanStd
-from baselines.ddpg.util import reduce_std, mpi_mean
+from mpi4py import MPI


def normalize(x, stats):
    if stats is None:
@@ -23,6 +22,13 @@ def denormalize(x, stats):
        return x
    return x * stats.std + stats.mean

+def reduce_std(x, axis=None, keepdims=False):
+    return tf.sqrt(reduce_var(x, axis=axis, keepdims=keepdims))
+
+def reduce_var(x, axis=None, keepdims=False):
+    m = tf.reduce_mean(x, axis=axis, keep_dims=True)
+    devs_squared = tf.square(x - m)
+    return tf.reduce_mean(devs_squared, axis=axis, keep_dims=keepdims)

def get_target_updates(vars, target_vars, tau):
    logger.info('setting up target updates ...')
@@ -198,7 +204,7 @@ class DDPG(object):
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean

        self.renormalize_Q_outputs_op = []
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
@@ -213,15 +219,15 @@ class DDPG(object):
    def setup_stats(self):
        ops = []
        names = []

        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']

        if self.normalize_observations:
            ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)]
            names += ['obs_rms_mean', 'obs_rms_std']

        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
@@ -231,7 +237,7 @@ class DDPG(object):
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']

        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
@@ -347,7 +353,7 @@ class DDPG(object):
    def adapt_param_noise(self):
        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={
@@ -358,7 +364,7 @@ class DDPG(object):
            self.param_noise_stddev: self.param_noise.current_stddev,
        })

-        mean_distance = mpi_mean(distance)
+        mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
        self.param_noise.adapt(mean_distance)
        return mean_distance

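Illustrative sketch (not part of the commit): the reduce_var/reduce_std helpers added above behave like np.var/np.std over a tensor axis.

import tensorflow as tf

x = tf.constant([[1., 2.], [3., 4.]])
col_std = reduce_std(x, axis=0)   # population std per column, evaluates to [1., 1.]
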
@@ -25,7 +25,6 @@ def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    # Create envs.
    env = gym.make(env_id)
    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
-    gym.logger.setLevel(logging.WARN)

    if evaluation and rank==0:
        eval_env = gym.make(env_id)

@@ -4,7 +4,6 @@ from collections import deque
import pickle

from baselines.ddpg.ddpg import DDPG
-from baselines.ddpg.util import mpi_mean, mpi_std, mpi_max, mpi_sum
import baselines.common.tf_util as U

from baselines import logger
@@ -35,7 +34,7 @@ def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, pa
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
@@ -138,42 +137,46 @@ def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, pa
                        eval_episode_rewards_history.append(eval_episode_reward)
                        eval_episode_reward = 0.

+            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
-            epoch_train_duration = time.time() - epoch_start_time
+            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
-            combined_stats = {}
-            for key in sorted(stats.keys()):
-                combined_stats[key] = mpi_mean(stats[key])
-
-            # Rollout statistics.
-            combined_stats['rollout/return'] = mpi_mean(epoch_episode_rewards)
-            combined_stats['rollout/return_history'] = mpi_mean(np.mean(episode_rewards_history))
-            combined_stats['rollout/episode_steps'] = mpi_mean(epoch_episode_steps)
-            combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes)
-            combined_stats['rollout/actions_mean'] = mpi_mean(epoch_actions)
-            combined_stats['rollout/actions_std'] = mpi_std(epoch_actions)
-            combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs)
-
-            # Train statistics.
-            combined_stats['train/loss_actor'] = mpi_mean(epoch_actor_losses)
-            combined_stats['train/loss_critic'] = mpi_mean(epoch_critic_losses)
-            combined_stats['train/param_noise_distance'] = mpi_mean(epoch_adaptive_distances)
+            combined_stats = stats.copy()
+            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
+            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
+            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
+            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
+            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
+            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
+            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
+            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
+            combined_stats['total/duration'] = duration
+            combined_stats['total/steps_per_second'] = float(t) / float(duration)
+            combined_stats['total/episodes'] = episodes
+            combined_stats['rollout/episodes'] = epoch_episodes
+            combined_stats['rollout/actions_std'] = np.std(epoch_actions)

            # Evaluation statistics.
            if eval_env is not None:
-                combined_stats['eval/return'] = mpi_mean(eval_episode_rewards)
-                combined_stats['eval/return_history'] = mpi_mean(np.mean(eval_episode_rewards_history))
-                combined_stats['eval/Q'] = mpi_mean(eval_qs)
-                combined_stats['eval/episodes'] = mpi_mean(len(eval_episode_rewards))
+                combined_stats['eval/return'] = eval_episode_rewards
+                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
+                combined_stats['eval/Q'] = eval_qs
+                combined_stats['eval/episodes'] = len(eval_episode_rewards)
+            def as_scalar(x):
+                if isinstance(x, np.ndarray):
+                    assert x.size == 1
+                    return x[0]
+                elif np.isscalar(x):
+                    return x
+                else:
+                    raise ValueError('expected scalar, got %s'%x)
+            combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()]))
+            combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)}

            # Total statistics.
-            combined_stats['total/duration'] = mpi_mean(duration)
-            combined_stats['total/steps_per_second'] = mpi_mean(float(t) / float(duration))
-            combined_stats['total/episodes'] = mpi_mean(episodes)
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
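Illustrative sketch (not part of the commit), restating the new logging path above: every scalar is averaged across MPI workers with a single allreduce instead of a per-key mpi_mean call.

from mpi4py import MPI
import numpy as np

values = np.array([as_scalar(v) for v in combined_stats.values()])
sums = MPI.COMM_WORLD.allreduce(values)                  # elementwise sum over all workers
mean_stats = {k: s / MPI.COMM_WORLD.Get_size() for k, s in zip(combined_stats.keys(), sums)}
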
@@ -1,44 +0,0 @@
-import numpy as np
-import tensorflow as tf
-from mpi4py import MPI
-from baselines.common.mpi_moments import mpi_moments
-
-
-def reduce_var(x, axis=None, keepdims=False):
-    m = tf.reduce_mean(x, axis=axis, keep_dims=True)
-    devs_squared = tf.square(x - m)
-    return tf.reduce_mean(devs_squared, axis=axis, keep_dims=keepdims)
-
-
-def reduce_std(x, axis=None, keepdims=False):
-    return tf.sqrt(reduce_var(x, axis=axis, keepdims=keepdims))
-
-
-def mpi_mean(value):
-    if value == []:
-        value = [0.]
-    if not isinstance(value, list):
-        value = [value]
-    return mpi_moments(np.array(value))[0][0]
-
-
-def mpi_std(value):
-    if value == []:
-        value = [0.]
-    if not isinstance(value, list):
-        value = [value]
-    return mpi_moments(np.array(value))[1][0]
-
-
-def mpi_max(value):
-    global_max = np.zeros(1, dtype='float64')
-    local_max = np.max(value).astype('float64')
-    MPI.COMM_WORLD.Reduce(local_max, global_max, op=MPI.MAX)
-    return global_max[0]
-
-
-def mpi_sum(value):
-    global_sum = np.zeros(1, dtype='float64')
-    local_sum = np.sum(np.array(value)).astype('float64')
-    MPI.COMM_WORLD.Reduce(local_sum, global_sum, op=MPI.SUM)
-    return global_sum[0]
@@ -143,7 +143,7 @@ def build_act(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None):
    ` See the top of the file for details.
    """
    with tf.variable_scope(scope, reuse=reuse):
-        observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
+        observations_ph = make_obs_ph("observation")
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")

@@ -159,10 +159,12 @@ def build_act(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None):

        output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions)
        update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))
-        act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph],
+        _act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph],
                         outputs=output_actions,
                         givens={update_eps_ph: -1.0, stochastic_ph: True},
                         updates=[update_eps_expr])
+        def act(ob, stochastic=True, update_eps=-1):
+            return _act(ob, stochastic, update_eps)
        return act

|
|||||||
param_noise_filter_func = default_param_noise_filter
|
param_noise_filter_func = default_param_noise_filter
|
||||||
|
|
||||||
with tf.variable_scope(scope, reuse=reuse):
|
with tf.variable_scope(scope, reuse=reuse):
|
||||||
observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
|
observations_ph = make_obs_ph("observation")
|
||||||
stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
|
stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
|
||||||
update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")
|
update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")
|
||||||
update_param_noise_threshold_ph = tf.placeholder(tf.float32, (), name="update_param_noise_threshold")
|
update_param_noise_threshold_ph = tf.placeholder(tf.float32, (), name="update_param_noise_threshold")
|
||||||
@@ -342,20 +344,20 @@ def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=
|
|||||||
|
|
||||||
with tf.variable_scope(scope, reuse=reuse):
|
with tf.variable_scope(scope, reuse=reuse):
|
||||||
# set up placeholders
|
# set up placeholders
|
||||||
obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
|
obs_t_input = make_obs_ph("obs_t")
|
||||||
act_t_ph = tf.placeholder(tf.int32, [None], name="action")
|
act_t_ph = tf.placeholder(tf.int32, [None], name="action")
|
||||||
rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
|
rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
|
||||||
obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
|
obs_tp1_input = make_obs_ph("obs_tp1")
|
||||||
done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
|
done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
|
||||||
importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")
|
importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")
|
||||||
|
|
||||||
# q network evaluation
|
# q network evaluation
|
||||||
q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act
|
q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act
|
||||||
q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))
|
q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func")
|
||||||
|
|
||||||
# target q network evalution
|
# target q network evalution
|
||||||
q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
|
q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
|
||||||
target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))
|
target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func")
|
||||||
|
|
||||||
# q scores for actions which we know were selected in the given state.
|
# q scores for actions which we know were selected in the given state.
|
||||||
q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)
|
q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)
|
||||||
@@ -363,7 +365,7 @@ def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=
        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True)
-            q_tp1_best_using_online_net = tf.arg_max(q_tp1_using_online_net, 1)
+            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
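For reference, the double_q branch implements the Double DQN target: the online network selects the next action and the target network evaluates it. Combined with the reward and done-mask placeholders set up earlier (the combination itself happens outside this hunk), the target is

    y_t = r_t + \gamma\,(1 - d_t)\, Q_{\text{target}}\!\left(s_{t+1},\ \arg\max_{a} Q_{\text{online}}(s_{t+1}, a)\right)

while the else branch uses the plain max over the target network.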
@@ -379,10 +381,11 @@ def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
-            optimize_expr = U.minimize_and_clip(optimizer,
-                                                weighted_error,
-                                                var_list=q_func_vars,
-                                                clip_val=grad_norm_clipping)
+            gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars)
+            for i, (grad, var) in enumerate(gradients):
+                if grad is not None:
+                    gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)
+            optimize_expr = optimizer.apply_gradients(gradients)
        else:
            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)

@@ -14,6 +14,7 @@ from baselines.common.misc_util import (
 from baselines import bench
 from baselines.common.atari_wrappers_deprecated import wrap_dqn
 from baselines.deepq.experiments.atari.model import model, dueling_model
+from baselines.deepq.utils import Uint8Input, load_state


 def parse_args():

@@ -63,8 +64,8 @@ if __name__ == '__main__':
    args = parse_args()
    env = make_env(args.env)
    act = deepq.build_act(
-        make_obs_ph=lambda name: U.Uint8Input(env.observation_space.shape, name=name),
+        make_obs_ph=lambda name: Uint8Input(env.observation_space.shape, name=name),
        q_func=dueling_model if args.dueling else model,
        num_actions=env.action_space.n)
-    U.load_state(os.path.join(args.model_dir, "saved"))
+    load_state(os.path.join(args.model_dir, "saved"))
    play(env, act, args.stochastic, args.video)
@@ -2,14 +2,7 @@ import tensorflow as tf
 import tensorflow.contrib.layers as layers


-def layer_norm_fn(x, relu=True):
-    x = layers.layer_norm(x, scale=True, center=True)
-    if relu:
-        x = tf.nn.relu(x)
-    return x
-
-
-def model(img_in, num_actions, scope, reuse=False, layer_norm=False):
+def model(img_in, num_actions, scope, reuse=False):
     """As described in https://storage.googleapis.com/deepmind-data/assets/papers/DeepMindNature14236Paper.pdf"""
     with tf.variable_scope(scope, reuse=reuse):
         out = img_in

@@ -22,15 +15,12 @@ def model(img_in, num_actions, scope, reuse=False, layer_norm=False):

        with tf.variable_scope("action_value"):
            value_out = layers.fully_connected(conv_out, num_outputs=512, activation_fn=None)
-            if layer_norm:
-                value_out = layer_norm_fn(value_out, relu=True)
-            else:
-                value_out = tf.nn.relu(value_out)
+            value_out = tf.nn.relu(value_out)
            value_out = layers.fully_connected(value_out, num_outputs=num_actions, activation_fn=None)
        return value_out


-def dueling_model(img_in, num_actions, scope, reuse=False, layer_norm=False):
+def dueling_model(img_in, num_actions, scope, reuse=False):
    """As described in https://arxiv.org/abs/1511.06581"""
    with tf.variable_scope(scope, reuse=reuse):
        out = img_in

@@ -43,17 +33,11 @@ def dueling_model(img_in, num_actions, scope, reuse=False, layer_norm=False):

        with tf.variable_scope("state_value"):
            state_hidden = layers.fully_connected(conv_out, num_outputs=512, activation_fn=None)
-            if layer_norm:
-                state_hidden = layer_norm_fn(state_hidden, relu=True)
-            else:
-                state_hidden = tf.nn.relu(state_hidden)
+            state_hidden = tf.nn.relu(state_hidden)
            state_score = layers.fully_connected(state_hidden, num_outputs=1, activation_fn=None)
        with tf.variable_scope("action_value"):
            actions_hidden = layers.fully_connected(conv_out, num_outputs=512, activation_fn=None)
-            if layer_norm:
-                actions_hidden = layer_norm_fn(actions_hidden, relu=True)
-            else:
-                actions_hidden = tf.nn.relu(actions_hidden)
+            actions_hidden = tf.nn.relu(actions_hidden)
            action_scores = layers.fully_connected(actions_hidden, num_outputs=num_actions, activation_fn=None)
            action_scores_mean = tf.reduce_mean(action_scores, 1)
            action_scores = action_scores - tf.expand_dims(action_scores_mean, 1)
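The centering of action_scores is the standard dueling combination. Assuming the returned estimate is formed as state_score plus the centered advantages (the final addition sits outside this hunk), it corresponds to

    Q(s, a) = V(s) + A(s, a) - \frac{1}{|\mathcal{A}|} \sum_{a'} A(s, a')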
@@ -25,6 +25,7 @@ from baselines import bench
 from baselines.common.atari_wrappers_deprecated import wrap_dqn
 from baselines.common.azure_utils import Container
 from .model import model, dueling_model
+from baselines.deepq.utils import Uint8Input, load_state, save_state


 def parse_args():

@@ -73,7 +74,7 @@ def maybe_save_model(savedir, container, state):
        return
    start_time = time.time()
    model_dir = "model-{}".format(state["num_iters"])
-    U.save_state(os.path.join(savedir, model_dir, "saved"))
+    save_state(os.path.join(savedir, model_dir, "saved"))
    if container is not None:
        container.put(os.path.join(savedir, model_dir), model_dir)
    relatively_safe_pickle_dump(state, os.path.join(savedir, 'training_state.pkl.zip'), compression=True)

@@ -101,14 +102,14 @@ def maybe_load_model(savedir, container):
    model_dir = "model-{}".format(state["num_iters"])
    if container is not None:
        container.get(savedir, model_dir)
-    U.load_state(os.path.join(savedir, model_dir, "saved"))
+    load_state(os.path.join(savedir, model_dir, "saved"))
    logger.log("Loaded models checkpoint at {} iterations".format(state["num_iters"]))
    return state


if __name__ == '__main__':
    args = parse_args()

    # Parse savedir and azure container.
    savedir = args.save_dir
    if savedir is None:

@@ -143,7 +144,7 @@ if __name__ == '__main__':
        actual_model = dueling_model if args.dueling else model
        return actual_model(img_in, num_actions, scope, layer_norm=args.layer_norm, **kwargs)
    act, train, update_target, debug = deepq.build_train(
-        make_obs_ph=lambda name: U.Uint8Input(env.observation_space.shape, name=name),
+        make_obs_ph=lambda name: Uint8Input(env.observation_space.shape, name=name),
        q_func=model_wrapper,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=args.lr, epsilon=1e-4),
@@ -9,6 +9,7 @@ from baselines import deepq, bench
 from baselines.common.misc_util import get_wrapper_by_name, boolean_flag, set_global_seeds
 from baselines.common.atari_wrappers_deprecated import wrap_dqn
 from baselines.deepq.experiments.atari.model import model, dueling_model
+from baselines.deepq.utils import Uint8Input, load_state


 def make_env(game_name):

@@ -69,11 +70,11 @@ def main():
    with U.make_session(4):  # noqa
        _, env = make_env(args.env)
        act = deepq.build_act(
-            make_obs_ph=lambda name: U.Uint8Input(env.observation_space.shape, name=name),
+            make_obs_ph=lambda name: Uint8Input(env.observation_space.shape, name=name),
            q_func=dueling_model if args.dueling else model,
            num_actions=env.action_space.n)

-        U.load_state(os.path.join(args.model_dir, "saved"))
+        load_state(os.path.join(args.model_dir, "saved"))
        wang2015_eval(args.env, act, stochastic=args.stochastic)

@@ -9,6 +9,7 @@ import baselines.common.tf_util as U
 from baselines import logger
 from baselines import deepq
 from baselines.deepq.replay_buffer import ReplayBuffer
+from baselines.deepq.utils import BatchInput
 from baselines.common.schedules import LinearSchedule


@@ -27,7 +28,7 @@ if __name__ == '__main__':
    env = gym.make("CartPole-v0")
    # Create all the functions necessary to train the model
    act, train, update_target, debug = deepq.build_train(
-        make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape, name=name),
+        make_obs_ph=lambda name: BatchInput(env.observation_space.shape, name=name),
        q_func=model,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
@@ -1,5 +1,3 @@
-import gym
-
 from baselines import deepq
 from baselines.common import set_global_seeds
 from baselines import bench
@@ -3,7 +3,7 @@ import gym
 from baselines import deepq


-def callback(lcl, glb):
+def callback(lcl, _glb):
     # stop training if reward exceeds 199
     is_solved = lcl['t'] > 100 and sum(lcl['episode_rewards'][-101:-1]) / 100 >= 199
     return is_solved
@@ -12,6 +12,7 @@ from baselines import logger
 from baselines.common.schedules import LinearSchedule
 from baselines import deepq
 from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer
+from baselines.deepq.utils import BatchInput, load_state, save_state


 class ActWrapper(object):

@@ -32,7 +33,7 @@ class ActWrapper(object):
                f.write(model_data)

            zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td)
-            U.load_state(os.path.join(td, "model"))
+            load_state(os.path.join(td, "model"))

        return ActWrapper(act, act_params)

@@ -45,7 +46,7 @@ class ActWrapper(object):
        path = os.path.join(logger.get_dir(), "model.pkl")

        with tempfile.TemporaryDirectory() as td:
-            U.save_state(os.path.join(td, "model"))
+            save_state(os.path.join(td, "model"))
            arc_name = os.path.join(td, "packed.zip")
            with zipfile.ZipFile(arc_name, 'w') as zipf:
                for root, dirs, files in os.walk(td):

@@ -171,7 +172,7 @@ def learn(env,
    # by cloudpickle when serializing make_obs_ph
    observation_space_shape = env.observation_space.shape
    def make_obs_ph(name):
-        return U.BatchInput(observation_space_shape, name=name)
+        return BatchInput(observation_space_shape, name=name)

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,

@@ -283,12 +284,12 @@ def learn(env,
                if print_freq is not None:
                    logger.log("Saving model due to mean reward increase: {} -> {}".format(
                               saved_mean_reward, mean_100ep_reward))
-                U.save_state(model_file)
+                save_state(model_file)
                model_saved = True
                saved_mean_reward = mean_100ep_reward
    if model_saved:
        if print_freq is not None:
            logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
-        U.load_state(model_file)
+        load_state(model_file)

    return act
baselines/deepq/utils.py (new file, 88 lines)
@@ -0,0 +1,88 @@
+import os
+
+import tensorflow as tf
+
+# ================================================================
+# Saving variables
+# ================================================================
+
+
+def load_state(fname):
+    saver = tf.train.Saver()
+    saver.restore(tf.get_default_session(), fname)
+
+
+def save_state(fname):
+    os.makedirs(os.path.dirname(fname), exist_ok=True)
+    saver = tf.train.Saver()
+    saver.save(tf.get_default_session(), fname)
+
+# ================================================================
+# Placeholders
+# ================================================================
+
+
+class TfInput(object):
+    def __init__(self, name="(unnamed)"):
+        """Generalized Tensorflow placeholder. The main differences are:
+            - possibly uses multiple placeholders internally and returns multiple values
+            - can apply light postprocessing to the value fed to the placeholder.
+        """
+        self.name = name
+
+    def get(self):
+        """Return the tf variable(s) representing the possibly postprocessed value
+        of placeholder(s).
+        """
+        raise NotImplemented()
+
+    def make_feed_dict(data):
+        """Given data input it to the placeholder(s)."""
+        raise NotImplemented()
+
+
+class PlaceholderTfInput(TfInput):
+    def __init__(self, placeholder):
+        """Wrapper for regular tensorflow placeholder."""
+        super().__init__(placeholder.name)
+        self._placeholder = placeholder
+
+    def get(self):
+        return self._placeholder
+
+    def make_feed_dict(self, data):
+        return {self._placeholder: data}
+
+
+class BatchInput(PlaceholderTfInput):
+    def __init__(self, shape, dtype=tf.float32, name=None):
+        """Creates a placeholder for a batch of tensors of a given shape and dtype
+
+        Parameters
+        ----------
+        shape: [int]
+            shape of a single element of the batch
+        dtype: tf.dtype
+            number representation used for tensor contents
+        name: str
+            name of the underlying placeholder
+        """
+        super().__init__(tf.placeholder(dtype, [None] + list(shape), name=name))
+
+
+class Uint8Input(PlaceholderTfInput):
+    def __init__(self, shape, name=None):
+        """Takes input in uint8 format which is cast to float32 and divided by 255
+        before passing it to the model.
+
+        On GPU this ensures lower data transfer times.
+
+        Parameters
+        ----------
+        shape: [int]
+            shape of the tensor.
+        name: str
+            name of the underlying placeholder
+        """
+        super().__init__(tf.placeholder(tf.uint8, [None] + list(shape), name=name))
+        self._shape = shape
+        self._output = tf.cast(super().get(), tf.float32) / 255.0
+
+    def get(self):
+        return self._output
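A minimal sketch of how the new input helpers are used outside build_act/build_train (the shapes and the explicit session below are illustrative only; the diff itself only wires them through make_obs_ph):

    import numpy as np
    import tensorflow as tf
    from baselines.deepq.utils import BatchInput, Uint8Input

    obs_ph = BatchInput([4], name="observation")      # float32 placeholder, shape [None, 4]
    pix_ph = Uint8Input([84, 84, 4], name="pixels")   # uint8 placeholder, rescaled to [0, 1] floats

    with tf.Session() as sess:
        batch = np.zeros((2, 84, 84, 4), dtype=np.uint8)
        scaled = sess.run(pix_ph.get(), feed_dict=pix_ph.make_feed_dict(batch))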
@@ -8,6 +8,14 @@ import numpy as np
 from baselines.common.mpi_running_mean_std import RunningMeanStd
 from baselines.common import tf_util as U

+def logsigmoid(a):
+    '''Equivalent to tf.log(tf.sigmoid(a))'''
+    return -tf.nn.softplus(-a)
+
+""" Reference: https://github.com/openai/imitation/blob/99fbccf3e060b6e6c739bdf209758620fcdefd3c/policyopt/thutil.py#L48-L51"""
+def logit_bernoulli_entropy(logits):
+    ent = (1.-tf.nn.sigmoid(logits))*logits - logsigmoid(logits)
+    return ent
+
 class TransitionClassifier(object):
     def __init__(self, env, hidden_size, entcoeff=0.001, lr_rate=1e-3, scope="adversary"):
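The two helpers added above rest on a standard identity, which is easy to check:

    \log \sigma(a) = \log \frac{1}{1 + e^{-a}} = -\log\left(1 + e^{-a}\right) = -\operatorname{softplus}(-a)

and for a Bernoulli variable with logit \ell and p = \sigma(\ell),

    H(p) = -p \log p - (1 - p)\log(1 - p) = (1 - \sigma(\ell))\,\ell - \log \sigma(\ell),

which is exactly the expression computed in logit_bernoulli_entropy.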
@@ -130,14 +130,14 @@ def learn(env, policy_func, reward_giver, expert_dataset, rank,

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
-    meankl = U.mean(kloldnew)
-    meanent = U.mean(ent)
+    meankl = tf_util.reduce_mean(kloldnew)
+    meanent = tf_util.reduce_mean(ent)
    entbonus = entcoeff * meanent

-    vferr = U.mean(tf.square(pi.vpred - ret))
+    vferr = tf_util.reduce_mean(tf.square(pi.vpred - ret))

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # advantage * pnew / pold
-    surrgain = U.mean(ratio * atarg)
+    surrgain = tf_util.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
@@ -6,7 +6,6 @@ import json
 import time
 import datetime
 import tempfile
-from mpi4py import MPI

 LOG_OUTPUT_FORMATS = ['stdout', 'log', 'csv']
 # Also valid: json, tensorboard

@@ -170,6 +169,7 @@ class TensorBoardOutputFormat(KVWriter):
            self.writer = None

def make_output_format(format, ev_dir):
+    from mpi4py import MPI
    os.makedirs(ev_dir, exist_ok=True)
    rank = MPI.COMM_WORLD.Get_rank()
    if format == 'stdout':
@@ -17,25 +17,25 @@ class CnnPolicy(object):
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        x = ob / 255.0
        if kind == 'small': # from A3C paper
            x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
-            x = tf.nn.relu(U.dense(x, 256, 'lin', U.normc_initializer(1.0)))
+            x = tf.nn.relu(tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0)))
        elif kind == 'large': # Nature DQN
            x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
            x = U.flattenallbut0(x)
-            x = tf.nn.relu(U.dense(x, 512, 'lin', U.normc_initializer(1.0)))
+            x = tf.nn.relu(tf.layers.dense(x, 512, name='lin', kernel_initializer=U.normc_initializer(1.0)))
        else:
            raise NotImplementedError

-        logits = U.dense(x, pdtype.param_shape()[0], "logits", U.normc_initializer(0.01))
+        logits = tf.layers.dense(x, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)
-        self.vpred = U.dense(x, 1, "value", U.normc_initializer(1.0))[:,0]
+        self.vpred = tf.layers.dense(x, 1, name='value', kernel_initializer=U.normc_initializer(1.0))[:,0]

        self.state_in = []
        self.state_out = []
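The substitution applied throughout this policy (and the ones that follow) keeps the normc initializer but swaps the in-house layer helper for the stock TensorFlow one. A hedged, self-contained sketch of the mapping, with an illustrative input and layer size:

    import tensorflow as tf
    from baselines.common import tf_util as U

    x = tf.placeholder(tf.float32, [None, 64])
    # before: h = tf.nn.relu(U.dense(x, 256, 'lin', U.normc_initializer(1.0)))
    h = tf.nn.relu(tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0)))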
@@ -18,25 +18,25 @@ class MlpPolicy(object):
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        last_out = obz
        for i in range(num_hid_layers):
-            last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
-        self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:,0]
+            last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="vffc%i"%(i+1), kernel_initializer=U.normc_initializer(1.0)))
+        self.vpred = tf.layers.dense(last_out, 1, name='vffinal', kernel_initializer=U.normc_initializer(1.0))[:,0]

        last_out = obz
        for i in range(num_hid_layers):
-            last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
+            last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='polfc%i'%(i+1), kernel_initializer=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
-            mean = U.dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01))
+            mean = tf.layers.dense(last_out, pdtype.param_shape()[0]//2, name='polfinal', kernel_initializer=U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
-            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
+            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
-            pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))
+            pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='polfinal', kernel_initializer=U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

|
|||||||
gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
|
gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
|
||||||
seg["tdlamret"] = seg["adv"] + seg["vpred"]
|
seg["tdlamret"] = seg["adv"] + seg["vpred"]
|
||||||
|
|
||||||
def learn(env, policy_func, *,
|
def learn(env, policy_fn, *,
|
||||||
timesteps_per_actorbatch, # timesteps per actor per update
|
timesteps_per_actorbatch, # timesteps per actor per update
|
||||||
clip_param, entcoeff, # clipping parameter epsilon, entropy coeff
|
clip_param, entcoeff, # clipping parameter epsilon, entropy coeff
|
||||||
optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers
|
optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers
|
||||||
@@ -91,8 +91,8 @@ def learn(env, policy_func, *,
|
|||||||
# ----------------------------------------
|
# ----------------------------------------
|
||||||
ob_space = env.observation_space
|
ob_space = env.observation_space
|
||||||
ac_space = env.action_space
|
ac_space = env.action_space
|
||||||
pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy
|
pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy
|
||||||
oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy
|
oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy
|
||||||
atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable)
|
atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable)
|
||||||
ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return
|
ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return
|
||||||
|
|
||||||
@@ -104,15 +104,15 @@ def learn(env, policy_func, *,
|
|||||||
|
|
||||||
kloldnew = oldpi.pd.kl(pi.pd)
|
kloldnew = oldpi.pd.kl(pi.pd)
|
||||||
ent = pi.pd.entropy()
|
ent = pi.pd.entropy()
|
||||||
meankl = U.mean(kloldnew)
|
meankl = tf.reduce_mean(kloldnew)
|
||||||
meanent = U.mean(ent)
|
meanent = tf.reduce_mean(ent)
|
||||||
pol_entpen = (-entcoeff) * meanent
|
pol_entpen = (-entcoeff) * meanent
|
||||||
|
|
||||||
ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold
|
ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold
|
||||||
surr1 = ratio * atarg # surrogate from conservative policy iteration
|
surr1 = ratio * atarg # surrogate from conservative policy iteration
|
||||||
surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg #
|
surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg #
|
||||||
pol_surr = - U.mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP)
|
pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP)
|
||||||
vf_loss = U.mean(tf.square(pi.vpred - ret))
|
vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
|
||||||
total_loss = pol_surr + pol_entpen + vf_loss
|
total_loss = pol_surr + pol_entpen + vf_loss
|
||||||
losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
|
losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
|
||||||
loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]
|
loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]
|
||||||
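For reference, the objective assembled above is PPO's clipped surrogate with an entropy bonus and value loss, with epsilon = clip_param and c_ent = entcoeff:

    r_t(\theta) = \frac{\pi_\theta(a_t \mid s_t)}{\pi_{\theta_{\text{old}}}(a_t \mid s_t)}, \qquad
    L^{\mathrm{CLIP}} = \hat{\mathbb{E}}_t\!\left[\min\!\left(r_t \hat{A}_t,\ \operatorname{clip}(r_t, 1-\epsilon, 1+\epsilon)\,\hat{A}_t\right)\right]

    \text{total\_loss} = -L^{\mathrm{CLIP}} \;-\; c_{\text{ent}}\,\hat{\mathbb{E}}_t\!\left[H\big(\pi_\theta(\cdot \mid s_t)\big)\right] \;+\; \hat{\mathbb{E}}_t\!\left[\big(V_\theta(s_t) - \hat{R}_t\big)^2\right]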
@@ -181,7 +181,7 @@ def learn(env, policy_func, *,
        losses = [] # list of tuples, each of which gives the loss for a minibatch
        for batch in d.iterate_once(optim_batchsize):
            *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult)
            adam.update(g, optim_stepsize * cur_lrmult)
            losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(losses, axis=0)))

@@ -189,7 +189,7 @@ def learn(env, policy_func, *,
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult)
            losses.append(newlosses)
        meanlosses,_,_ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
@@ -4,9 +4,9 @@ from mpi4py import MPI
 from baselines.common import set_global_seeds
 from baselines import bench
 import os.path as osp
-import gym, logging
 from baselines import logger
 from baselines.common.atari_wrappers import make_atari, wrap_deepmind
+from baselines.common.cmd_util import atari_arg_parser

 def train(env_id, num_timesteps, seed):
     from baselines.ppo1 import pposgd_simple, cnn_policy

@@ -26,7 +26,6 @@ def train(env_id, num_timesteps, seed):
    env = bench.Monitor(env, logger.get_dir() and
        osp.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)
-    gym.logger.setLevel(logging.WARN)

    env = wrap_deepmind(env)
    env.seed(workerseed)

@@ -42,12 +41,7 @@ def train(env_id, num_timesteps, seed):
    env.close()

def main():
-    import argparse
-    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument('--env', help='environment ID', default='PongNoFrameskip-v4')
-    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
-    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
-    args = parser.parse_args()
+    args = atari_arg_parser().parse_args()
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)

if __name__ == '__main__':
@@ -1,20 +1,16 @@
 #!/usr/bin/env python3
-from baselines.common import set_global_seeds, tf_util as U
-from baselines import bench
-import gym, logging
+from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser
+from baselines.common import tf_util as U
 from baselines import logger

 def train(env_id, num_timesteps, seed):
     from baselines.ppo1 import mlp_policy, pposgd_simple
     U.make_session(num_cpu=1).__enter__()
-    set_global_seeds(seed)
-    env = gym.make(env_id)
     def policy_fn(name, ob_space, ac_space):
         return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
             hid_size=64, num_hid_layers=2)
-    env = bench.Monitor(env, logger.get_dir())
-    env.seed(seed)
-    gym.logger.setLevel(logging.WARN)
+    env = make_mujoco_env(env_id, seed)
     pposgd_simple.learn(env, policy_fn,
             max_timesteps=num_timesteps,
             timesteps_per_actorbatch=2048,

@@ -25,15 +21,9 @@ def train(env_id, num_timesteps, seed):
    env.close()

def main():
-    import argparse
-    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument('--env', help='environment ID', default='Hopper-v1')
-    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
-    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
-    args = parser.parse_args()
+    args = mujoco_arg_parser().parse_args()
    logger.configure()
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)


if __name__ == '__main__':
    main()
@@ -3,6 +3,18 @@ import tensorflow as tf
 from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm
 from baselines.common.distributions import make_pdtype

+def nature_cnn(unscaled_images):
+    """
+    CNN from Nature paper.
+    """
+    scaled_images = tf.cast(unscaled_images, tf.float32) / 255.
+    activ = tf.nn.relu
+    h = activ(conv(scaled_images, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2)))
+    h2 = activ(conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2)))
+    h3 = activ(conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2)))
+    h3 = conv_to_fc(h3)
+    return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2)))
+
 class LnLstmPolicy(object):
     def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
         nenv = nbatch // nsteps

@@ -13,17 +25,13 @@ class LnLstmPolicy(object):
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=reuse):
-            h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
-            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
-            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
-            h3 = conv_to_fc(h3)
-            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
-            xs = batch_to_seq(h4, nenv, nsteps)
+            h = nature_cnn(X)
+            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
-            pi = fc(h5, 'pi', nact, act=lambda x:x)
-            vf = fc(h5, 'v', 1, act=lambda x:x)
+            pi = fc(h5, 'pi', nact)
+            vf = fc(h5, 'v', 1)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pi)

@@ -59,17 +67,13 @@ class LstmPolicy(object):
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=reuse):
-            h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
-            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
-            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
-            h3 = conv_to_fc(h3)
-            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
-            xs = batch_to_seq(h4, nenv, nsteps)
+            h = nature_cnn(X)
+            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
-            pi = fc(h5, 'pi', nact, act=lambda x:x)
-            vf = fc(h5, 'v', 1, act=lambda x:x)
+            pi = fc(h5, 'pi', nact)
+            vf = fc(h5, 'v', 1)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pi)

@@ -101,13 +105,9 @@ class CnnPolicy(object):
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape) #obs
        with tf.variable_scope("model", reuse=reuse):
-            h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
-            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
-            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
-            h3 = conv_to_fc(h3)
-            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
-            pi = fc(h4, 'pi', nact, act=lambda x:x, init_scale=0.01)
-            vf = fc(h4, 'v', 1, act=lambda x:x)[:,0]
+            h = nature_cnn(X)
+            pi = fc(h, 'pi', nact, init_scale=0.01)
+            vf = fc(h, 'v', 1)[:,0]

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pi)

@@ -135,13 +135,14 @@ class MlpPolicy(object):
        actdim = ac_space.shape[0]
        X = tf.placeholder(tf.float32, ob_shape, name='Ob') #obs
        with tf.variable_scope("model", reuse=reuse):
-            h1 = fc(X, 'pi_fc1', nh=64, init_scale=np.sqrt(2), act=tf.tanh)
-            h2 = fc(h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2), act=tf.tanh)
-            pi = fc(h2, 'pi', actdim, act=lambda x:x, init_scale=0.01)
-            h1 = fc(X, 'vf_fc1', nh=64, init_scale=np.sqrt(2), act=tf.tanh)
-            h2 = fc(h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2), act=tf.tanh)
-            vf = fc(h2, 'vf', 1, act=lambda x:x)[:,0]
-            logstd = tf.get_variable(name="logstd", shape=[1, actdim],
+            activ = tf.tanh
+            h1 = activ(fc(X, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
+            h2 = activ(fc(h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
+            pi = fc(h2, 'pi', actdim, init_scale=0.01)
+            h1 = activ(fc(X, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
+            h2 = activ(fc(h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
+            vf = fc(h2, 'vf', 1)[:,0]
+            logstd = tf.get_variable(name="logstd", shape=[1, actdim],
                                     initializer=tf.zeros_initializer())

            pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)

@@ -164,4 +165,4 @@ class MlpPolicy(object):
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
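A minimal sketch of the shared encoder introduced above, applied to a uint8 observation placeholder exactly as the refactored policies now do (the shape is illustrative):

    import tensorflow as tf
    from baselines.ppo2.policies import nature_cnn

    X = tf.placeholder(tf.uint8, [None, 84, 84, 4])   # a stack of Atari frames
    with tf.variable_scope("model"):
        h = nature_cnn(X)                             # scaled conv stack -> 512-dim features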
@@ -51,7 +51,7 @@ class Model(object):
        def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
            advs = returns - values
            advs = (advs - advs.mean()) / (advs.std() + 1e-8)
            td_map = {train_model.X:obs, A:actions, ADV:advs, R:returns, LR:lr,
                    CLIPRANGE:cliprange, OLDNEGLOGPAC:neglogpacs, OLDVPRED:values}
            if states is not None:
                td_map[train_model.S] = states

@@ -107,7 +107,7 @@ class Runner(object):
            mb_actions.append(actions)
            mb_values.append(values)
            mb_neglogpacs.append(neglogpacs)
            mb_dones.append(self.dones)
            self.obs[:], rewards, self.dones, infos = self.env.step(actions)
            for info in infos:
                maybeepinfo = info.get('episode')

@@ -124,7 +124,7 @@ class Runner(object):
        #discount/bootstrap off value fn
        mb_returns = np.zeros_like(mb_rewards)
        mb_advs = np.zeros_like(mb_rewards)
        lastgaelam = 0
        for t in reversed(range(self.nsteps)):
            if t == self.nsteps - 1:
                nextnonterminal = 1.0 - self.dones

@@ -135,7 +135,7 @@ class Runner(object):
            delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t]
            mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam
        mb_returns = mb_advs + mb_values
        return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs)),
            mb_states, epinfos)
# obs, returns, masks, actions, values, neglogpacs, states = runner.run()
def sf01(arr):
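The backward pass in the runner is generalized advantage estimation. Written out, with d the done flag that feeds nextnonterminal = 1 - d:

    \delta_t = r_t + \gamma\,(1 - d_{t+1})\,V(s_{t+1}) - V(s_t), \qquad
    \hat{A}_t = \delta_t + \gamma\lambda\,(1 - d_{t+1})\,\hat{A}_{t+1}, \qquad
    \hat{R}_t = \hat{A}_t + V(s_t)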
@@ -150,8 +150,8 @@ def constfn(val):
        return val
    return f

def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr,
            vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95,
            log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
            save_interval=0):

@@ -167,7 +167,7 @@ def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr,
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    make_model = lambda : Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
                    nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                    max_grad_norm=max_grad_norm)
    if save_interval and logger.get_dir():

@@ -214,7 +214,7 @@ def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr,
                mbflatinds = flatinds[mbenvinds].ravel()
                slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                mbstates = states[mbenvinds]
                mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
@@ -1,40 +1,25 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python3
|
||||||
import sys
|
import sys
|
||||||
import argparse
|
from baselines import logger
|
||||||
from baselines import bench, logger
|
from baselines.common.cmd_util import make_atari_env, atari_arg_parser
|
||||||
|
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
|
||||||
|
from baselines.ppo2 import ppo2
|
||||||
|
from baselines.ppo2.policies import CnnPolicy, LstmPolicy, LnLstmPolicy
|
||||||
|
import multiprocessing
|
||||||
|
import tensorflow as tf
|
||||||
|
|
||||||
|
|
||||||
def train(env_id, num_timesteps, seed, policy):
|
def train(env_id, num_timesteps, seed, policy):
|
||||||
from baselines.common import set_global_seeds
|
|
||||||
from baselines.common.atari_wrappers import make_atari, wrap_deepmind
|
|
||||||
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
|
|
||||||
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
|
|
||||||
from baselines.ppo2 import ppo2
|
|
||||||
from baselines.ppo2.policies import CnnPolicy, LstmPolicy, LnLstmPolicy
|
|
||||||
import gym
|
|
||||||
import logging
|
|
||||||
import multiprocessing
|
|
||||||
import os.path as osp
|
|
||||||
import tensorflow as tf
|
|
||||||
ncpu = multiprocessing.cpu_count()
|
ncpu = multiprocessing.cpu_count()
|
||||||
if sys.platform == 'darwin': ncpu //= 2
|
if sys.platform == 'darwin': ncpu //= 2
|
||||||
config = tf.ConfigProto(allow_soft_placement=True,
|
config = tf.ConfigProto(allow_soft_placement=True,
|
||||||
intra_op_parallelism_threads=ncpu,
|
intra_op_parallelism_threads=ncpu,
|
||||||
inter_op_parallelism_threads=ncpu)
|
inter_op_parallelism_threads=ncpu)
|
||||||
config.gpu_options.allow_growth = True #pylint: disable=E1101
|
config.gpu_options.allow_growth = True #pylint: disable=E1101
|
||||||
gym.logger.setLevel(logging.WARN)
|
|
||||||
tf.Session(config=config).__enter__()
|
tf.Session(config=config).__enter__()
|
||||||
|
|
||||||
def make_env(rank):
|
env = VecFrameStack(make_atari_env(env_id, 8, seed), 4)
|
||||||
def env_fn():
|
|
||||||
env = make_atari(env_id)
|
|
||||||
env.seed(seed + rank)
|
|
||||||
env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
|
|
||||||
return wrap_deepmind(env)
|
|
||||||
return env_fn
|
|
||||||
nenvs = 8
|
|
||||||
env = SubprocVecEnv([make_env(i) for i in range(nenvs)])
|
|
||||||
set_global_seeds(seed)
|
|
||||||
env = VecFrameStack(env, 4)
|
|
||||||
policy = {'cnn' : CnnPolicy, 'lstm' : LstmPolicy, 'lnlstm' : LnLstmPolicy}[policy]
|
policy = {'cnn' : CnnPolicy, 'lstm' : LstmPolicy, 'lnlstm' : LnLstmPolicy}[policy]
|
||||||
ppo2.learn(policy=policy, env=env, nsteps=128, nminibatches=4,
|
ppo2.learn(policy=policy, env=env, nsteps=128, nminibatches=4,
|
||||||
lam=0.95, gamma=0.99, noptepochs=4, log_interval=1,
|
lam=0.95, gamma=0.99, noptepochs=4, log_interval=1,
|
||||||
@@ -44,11 +29,8 @@ def train(env_id, num_timesteps, seed, policy):
|
|||||||
total_timesteps=int(num_timesteps * 1.1))
|
total_timesteps=int(num_timesteps * 1.1))
|
||||||
|
|
||||||
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser = atari_arg_parser()
    parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm'], default='cnn')
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    args = parser.parse_args()
    logger.configure()
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed,
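The make_atari_env helper introduced here absorbs the env-construction boilerplate deleted from train() above. A minimal sketch of what such a helper plausibly does, reconstructed from the removed lines (the real implementation in baselines/common/cmd_util.py may take extra arguments and differ in detail):

    from baselines import bench, logger
    from baselines.common import set_global_seeds
    from baselines.common.atari_wrappers import make_atari, wrap_deepmind
    from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
    import os.path as osp

    def make_atari_env(env_id, num_env, seed):
        # One subprocess per env: make, seed, monitor, then apply the DeepMind wrappers.
        def make_env(rank):
            def _thunk():
                env = make_atari(env_id)
                env.seed(seed + rank)
                env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
                return wrap_deepmind(env)
            return _thunk
        set_global_seeds(seed)
        return SubprocVecEnv([make_env(i) for i in range(num_env)])
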
@@ -1,5 +1,6 @@
#!/usr/bin/env python
#!/usr/bin/env python3
import argparse
from baselines.common.cmd_util import mujoco_arg_parser
from baselines import bench, logger

def train(env_id, num_timesteps, seed):
@@ -33,15 +34,10 @@ def train(env_id, num_timesteps, seed):


def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    args = mujoco_arg_parser().parse_args()
    parser.add_argument('--env', help='environment ID', default='Hopper-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
    args = parser.parse_args()
    logger.configure()
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)


if __name__ == '__main__':
    main()

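mujoco_arg_parser replaces the per-script argparse setup deleted above. A hedged sketch of what it presumably builds, using exactly the flags and defaults the removed code defined (the real helper in baselines/common/cmd_util.py may differ):

    import argparse

    def mujoco_arg_parser():
        # Same three flags as the deleted argparse block, with the same defaults.
        parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        parser.add_argument('--env', help='environment ID', default='Hopper-v1')
        parser.add_argument('--seed', help='RNG seed', type=int, default=0)
        parser.add_argument('--num-timesteps', type=int, default=int(1e6))
        return parser
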
@@ -17,7 +17,7 @@ class CnnPolicy(object):
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        obscaled = ob / 255.0

        with tf.variable_scope("pol"):
@@ -25,16 +25,16 @@ class CnnPolicy(object):
            x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
            x = tf.nn.relu(tf.layers.dense(x, 128, name='lin', kernel_initializer=U.normc_initializer(1.0)))
            logits = U.dense(x, pdtype.param_shape()[0], "logits", U.normc_initializer(0.01))
            logits = tf.layers.dense(x, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01))
            self.pd = pdtype.pdfromflat(logits)
        with tf.variable_scope("vf"):
            x = obscaled
            x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
            x = tf.nn.relu(tf.layers.dense(x, 128, name='lin', kernel_initializer=U.normc_initializer(1.0)))
            self.vpred = U.dense(x, 1, "value", U.normc_initializer(1.0))
            self.vpred = tf.layers.dense(x, 1, name='value', kernel_initializer=U.normc_initializer(1.0))
            self.vpredz = self.vpred

        self.state_in = []
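The repo's own U.dense helper is swapped for the stock tf.layers.dense throughout this file; only the normalized-columns weight initializer is still taken from baselines.common.tf_util. A rough sketch of what such an initializer does, given as an assumption for illustration (the actual U.normc_initializer may differ slightly):

    import numpy as np
    import tensorflow as tf

    def normc_initializer(std=1.0):
        # Draw Gaussian weights, then rescale each column to a fixed L2 norm.
        def _initializer(shape, dtype=None, partition_info=None):
            out = np.random.randn(*shape).astype(np.float32)
            out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
            return tf.constant(out)
        return _initializer

    # equivalent layer call after this change:
    # x = tf.nn.relu(tf.layers.dense(x, 128, name='lin', kernel_initializer=normc_initializer(1.0)))
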
@@ -1,4 +1,4 @@
#!/usr/bin/env python3
from mpi4py import MPI
from baselines.common import set_global_seeds
import os.path as osp
@@ -6,6 +6,7 @@ import gym, logging
from baselines import logger
from baselines import bench
from baselines.common.atari_wrappers import make_atari, wrap_deepmind
from baselines.common.cmd_util import atari_arg_parser

def train(env_id, num_timesteps, seed):
    from baselines.trpo_mpi.nosharing_cnn_policy import CnnPolicy
@@ -26,7 +27,6 @@ def train(env_id, num_timesteps, seed):
        return CnnPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space)
    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    env = wrap_deepmind(env)
    env.seed(workerseed)
@@ -36,14 +36,8 @@ def train(env_id, num_timesteps, seed):
    env.close()

def main():
    import argparse
    args = atari_arg_parser().parse_args()
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='PongNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    args = parser.parse_args()
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)


if __name__ == "__main__":
    main()

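One idiom kept in the context above is worth spelling out: `logger.get_dir() and osp.join(logger.get_dir(), str(rank))` short-circuits to None when no log directory is configured, and otherwise yields a per-rank path, so each MPI worker gets its own monitor file only when logging is enabled. Roughly, and as an illustrative sketch rather than part of the diff:

    logdir = logger.get_dir()                              # None if logging is not configured
    monitor_path = logdir and osp.join(logdir, str(rank))  # None, or e.g. '<logdir>/3' for rank 3
    env = bench.Monitor(env, monitor_path)                 # a None filename disables the results file
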
@@ -1,17 +1,10 @@
#!/usr/bin/env python3
# noinspection PyUnresolvedReferences
import mujoco_py # Mujoco must come before other imports. https://openai.slack.com/archives/C1H6P3R7B/p1492828680631850
from mpi4py import MPI
from baselines.common import set_global_seeds
from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser
import os.path as osp
import gym
import logging
from baselines import logger
from baselines.ppo1.mlp_policy import MlpPolicy
from baselines.common.mpi_fork import mpi_fork
from baselines import bench
from baselines.trpo_mpi import trpo_mpi
import sys

def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
@@ -22,27 +15,16 @@ def train(env_id, num_timesteps, seed):
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space,
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=32, num_hid_layers=2)
    env = bench.Monitor(env, logger.get_dir() and
    env = make_mujoco_env(env_id, workerseed)
        osp.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1,
        max_timesteps=num_timesteps, gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3)
    env.close()

def main():
    import argparse
    args = mujoco_arg_parser().parse_args()
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='Hopper-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
    args = parser.parse_args()
    logger.configure()
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)

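make_mujoco_env folds the deleted seed/make/Monitor sequence into a single call. A plausible sketch based on the lines removed above (the actual cmd_util implementation may differ):

    import gym
    from baselines import bench, logger
    from baselines.common import set_global_seeds

    def make_mujoco_env(env_id, seed):
        # Seed the RNGs, build the raw env, wrap it with the episode monitor, seed the env.
        set_global_seeds(seed)
        env = gym.make(env_id)
        env = bench.Monitor(env, logger.get_dir())
        env.seed(seed)
        return env
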
@@ -41,7 +41,7 @@ def traj_segment_generator(pi, env, horizon, stochastic):
            yield {"ob" : obs, "rew" : rews, "vpred" : vpreds, "new" : news,
                    "ac" : acs, "prevac" : prevacs, "nextvpred": vpred * (1 - new),
                    "ep_rets" : ep_rets, "ep_lens" : ep_lens}
            _, vpred = pi.act(stochastic, ob)
            # Be careful!!! if you change the downstream algorithm to aggregate
            # several of these batches, then be sure to do a deepcopy
            ep_rets = []
@@ -79,7 +79,7 @@ def add_vtarg_and_adv(seg, gamma, lam):
        gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["tdlamret"] = seg["adv"] + seg["vpred"]

def learn(env, policy_func, *,
def learn(env, policy_fn, *,
        timesteps_per_batch, # what to train on
        max_kl, cg_iters,
        gamma, lam, # advantage estimation
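For context, the recursion visible in the add_vtarg_and_adv hunk is Generalized Advantage Estimation: delta_t = r_t + gamma*V(s_{t+1}) - V(s_t), accumulated backwards through time with decay gamma*lam. A sketch of the full function reconstructed around the two lines shown above (variable names assumed to match the rest of the file):

    import numpy as np

    def add_vtarg_and_adv(seg, gamma, lam):
        new = np.append(seg["new"], 0)                    # episode-start flags, padded for the t+1 lookup
        vpred = np.append(seg["vpred"], seg["nextvpred"])
        T = len(seg["rew"])
        seg["adv"] = gaelam = np.empty(T, 'float32')
        rew = seg["rew"]
        lastgaelam = 0
        for t in reversed(range(T)):
            nonterminal = 1 - new[t+1]
            delta = rew[t] + gamma * vpred[t+1] * nonterminal - vpred[t]
            gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
        seg["tdlamret"] = seg["adv"] + seg["vpred"]       # lambda-return target for the value function
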
@@ -92,13 +92,13 @@ def learn(env, policy_func, *,
        ):
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)
    pi = policy_fn("pi", ob_space, ac_space)
    oldpi = policy_func("oldpi", ob_space, ac_space)
    oldpi = policy_fn("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return

@@ -107,14 +107,14 @@ def learn(env, policy_func, *,

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meankl = tf.reduce_mean(kloldnew)
    meanent = U.mean(ent)
    meanent = tf.reduce_mean(ent)
    entbonus = entcoeff * meanent

    vferr = U.mean(tf.square(pi.vpred - ret))
    vferr = tf.reduce_mean(tf.square(pi.vpred - ret))

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold
    surrgain = U.mean(ratio * atarg)
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
@@ -138,7 +138,7 @@ def learn(env, policy_func, *,
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start+sz], shape))
        start += sz
    gvp = tf.add_n([U.sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) #pylint: disable=E1111
    gvp = tf.add_n([tf.reduce_sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) #pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv)
@@ -157,7 +157,7 @@ def learn(env, policy_func, *,
            print(colorize("done in %.3f seconds"%(time.time() - tstart), color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
@@ -185,7 +185,7 @@ def learn(env, policy_func, *,

    assert sum([max_iters>0, max_timesteps>0, max_episodes>0])==1

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
@@ -260,7 +260,7 @@ def learn(env, policy_func, *,
        with timed("vf"):

            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]),
                include_final_partial_batch=False, batch_size=64):
                    g = allmean(compute_vflossandgrad(mbob, mbret))
                    vfadam.update(g, vf_stepsize)
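The allmean helper, whose first lines appear in the @@ -157 context, averages a numpy array elementwise over all MPI workers; it is what keeps the value-function gradients in the loop above consistent across ranks. The visible lines only allocate the output buffer; a completion along these lines is assumed (standard mpi4py Allreduce usage, not taken from the diff):

    from mpi4py import MPI
    import numpy as np

    def allmean(x, nworkers=MPI.COMM_WORLD.Get_size()):
        # Sum the array across every worker, then divide by the worker count.
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out
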