Compare commits
1 Commits
her-fixes
...
simple_ben
Author | SHA1 | Date | |
---|---|---|---|
|
a4fba209c4 |
3
.gitignore
vendored
@@ -1,8 +1,6 @@
|
||||
*.swp
|
||||
*.pyc
|
||||
*.pkl
|
||||
*.py~
|
||||
.pytest_cache
|
||||
.DS_Store
|
||||
.idea
|
||||
|
||||
@@ -35,4 +33,3 @@ src
|
||||
|
||||
MUJOCO_LOG.TXT
|
||||
|
||||
|
||||
|
@@ -15,20 +15,16 @@ pip install -e .
|
||||
```
|
||||
|
||||
- [A2C](baselines/a2c)
|
||||
- [ACER](baselines/acer)
|
||||
- [ACKTR](baselines/acktr)
|
||||
- [DDPG](baselines/ddpg)
|
||||
- [DQN](baselines/deepq)
|
||||
- [GAIL](baselines/gail)
|
||||
- [HER](baselines/her)
|
||||
- [PPO1](baselines/ppo1) (Multi-CPU using MPI)
|
||||
- [PPO2](baselines/ppo2) (Optimized for GPU)
|
||||
- [PPO](baselines/ppo1)
|
||||
- [TRPO](baselines/trpo_mpi)
|
||||
|
||||
To cite this repository in publications:
|
||||
|
||||
@misc{baselines,
|
||||
author = {Dhariwal, Prafulla and Hesse, Christopher and Klimov, Oleg and Nichol, Alex and Plappert, Matthias and Radford, Alec and Schulman, John and Sidor, Szymon and Wu, Yuhuai},
|
||||
author = {Hesse, Christopher and Plappert, Matthias and Radford, Alec and Schulman, John and Sidor, Szymon and Wu, Yuhuai},
|
||||
title = {OpenAI Baselines},
|
||||
year = {2017},
|
||||
publisher = {GitHub},
|
||||
|
@@ -1,4 +1,3 @@
|
||||
import os
|
||||
import os.path as osp
|
||||
import gym
|
||||
import time
|
||||
@@ -11,19 +10,22 @@ from baselines import logger
|
||||
from baselines.common import set_global_seeds, explained_variance
|
||||
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
|
||||
from baselines.common.atari_wrappers import wrap_deepmind
|
||||
from baselines.common import tf_util
|
||||
|
||||
from baselines.a2c.utils import discount_with_dones
|
||||
from baselines.a2c.utils import Scheduler, make_path, find_trainable_variables
|
||||
from baselines.a2c.policies import CnnPolicy
|
||||
from baselines.a2c.utils import cat_entropy, mse
|
||||
|
||||
class Model(object):
|
||||
|
||||
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps,
|
||||
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs,
|
||||
ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
|
||||
alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'):
|
||||
|
||||
sess = tf_util.make_session()
|
||||
config = tf.ConfigProto(allow_soft_placement=True,
|
||||
intra_op_parallelism_threads=num_procs,
|
||||
inter_op_parallelism_threads=num_procs)
|
||||
config.gpu_options.allow_growth = True
|
||||
sess = tf.Session(config=config)
|
||||
nact = ac_space.n
|
||||
nbatch = nenvs*nsteps
|
||||
|
||||
@@ -32,8 +34,8 @@ class Model(object):
|
||||
R = tf.placeholder(tf.float32, [nbatch])
|
||||
LR = tf.placeholder(tf.float32, [])
|
||||
|
||||
step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
|
||||
train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True)
|
||||
step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
|
||||
train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)
|
||||
|
||||
neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
|
||||
pg_loss = tf.reduce_mean(ADV * neglogpac)
|
||||
@@ -56,7 +58,7 @@ class Model(object):
|
||||
for step in range(len(obs)):
|
||||
cur_lr = lr.value()
|
||||
td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr}
|
||||
if states is not None:
|
||||
if states != []:
|
||||
td_map[train_model.S] = states
|
||||
td_map[train_model.M] = masks
|
||||
policy_loss, value_loss, policy_entropy, _ = sess.run(
|
||||
@@ -67,7 +69,7 @@ class Model(object):
|
||||
|
||||
def save(save_path):
|
||||
ps = sess.run(params)
|
||||
make_path(osp.dirname(save_path))
|
||||
make_path(save_path)
|
||||
joblib.dump(ps, save_path)
|
||||
|
||||
def load(load_path):
|
||||
@@ -89,25 +91,32 @@ class Model(object):
|
||||
|
||||
class Runner(object):
|
||||
|
||||
def __init__(self, env, model, nsteps=5, gamma=0.99):
|
||||
def __init__(self, env, model, nsteps=5, nstack=4, gamma=0.99):
|
||||
self.env = env
|
||||
self.model = model
|
||||
nh, nw, nc = env.observation_space.shape
|
||||
nenv = env.num_envs
|
||||
self.batch_ob_shape = (nenv*nsteps, nh, nw, nc)
|
||||
self.obs = np.zeros((nenv, nh, nw, nc), dtype=np.uint8)
|
||||
self.batch_ob_shape = (nenv*nsteps, nh, nw, nc*nstack)
|
||||
self.obs = np.zeros((nenv, nh, nw, nc*nstack), dtype=np.uint8)
|
||||
self.nc = nc
|
||||
obs = env.reset()
|
||||
self.update_obs(obs)
|
||||
self.gamma = gamma
|
||||
self.nsteps = nsteps
|
||||
self.states = model.initial_state
|
||||
self.dones = [False for _ in range(nenv)]
|
||||
|
||||
def update_obs(self, obs):
|
||||
# Do frame-stacking here instead of the FrameStack wrapper to reduce
|
||||
# IPC overhead
|
||||
self.obs = np.roll(self.obs, shift=-self.nc, axis=3)
|
||||
self.obs[:, :, :, -self.nc:] = obs
|
||||
|
||||
def run(self):
|
||||
mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
|
||||
mb_states = self.states
|
||||
for n in range(self.nsteps):
|
||||
actions, values, states, _ = self.model.step(self.obs, self.states, self.dones)
|
||||
actions, values, states = self.model.step(self.obs, self.states, self.dones)
|
||||
mb_obs.append(np.copy(self.obs))
|
||||
mb_actions.append(actions)
|
||||
mb_values.append(values)
|
||||
@@ -118,7 +127,7 @@ class Runner(object):
|
||||
for n, done in enumerate(dones):
|
||||
if done:
|
||||
self.obs[n] = self.obs[n]*0
|
||||
self.obs = obs
|
||||
self.update_obs(obs)
|
||||
mb_rewards.append(rewards)
|
||||
mb_dones.append(self.dones)
|
||||
#batch of steps to batch of rollouts
|
||||
@@ -145,16 +154,17 @@ class Runner(object):
|
||||
mb_masks = mb_masks.flatten()
|
||||
return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
|
||||
|
||||
def learn(policy, env, seed, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100):
|
||||
def learn(policy, env, seed, nsteps=5, nstack=4, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100):
|
||||
tf.reset_default_graph()
|
||||
set_global_seeds(seed)
|
||||
|
||||
nenvs = env.num_envs
|
||||
ob_space = env.observation_space
|
||||
ac_space = env.action_space
|
||||
model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
|
||||
num_procs = len(env.remotes) # HACK
|
||||
model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, nstack=nstack, num_procs=num_procs, ent_coef=ent_coef, vf_coef=vf_coef,
|
||||
max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule)
|
||||
runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
|
||||
runner = Runner(env, model, nsteps=nsteps, nstack=nstack, gamma=gamma)
|
||||
|
||||
nbatch = nenvs*nsteps
|
||||
tstart = time.time()
|
||||
@@ -173,3 +183,6 @@ def learn(policy, env, seed, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5, e
|
||||
logger.record_tabular("explained_variance", float(ev))
|
||||
logger.dump_tabular()
|
||||
env.close()
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
@@ -1,48 +1,36 @@
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm
|
||||
from baselines.common.distributions import make_pdtype
|
||||
|
||||
def nature_cnn(unscaled_images):
|
||||
"""
|
||||
CNN from Nature paper.
|
||||
"""
|
||||
scaled_images = tf.cast(unscaled_images, tf.float32) / 255.
|
||||
activ = tf.nn.relu
|
||||
h = activ(conv(scaled_images, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2)))
|
||||
h2 = activ(conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2)))
|
||||
h3 = activ(conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2)))
|
||||
h3 = conv_to_fc(h3)
|
||||
return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2)))
|
||||
from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm, sample
|
||||
|
||||
class LnLstmPolicy(object):
|
||||
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
|
||||
nenv = nbatch // nsteps
|
||||
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=256, reuse=False):
|
||||
nbatch = nenv*nsteps
|
||||
nh, nw, nc = ob_space.shape
|
||||
ob_shape = (nbatch, nh, nw, nc)
|
||||
ob_shape = (nbatch, nh, nw, nc*nstack)
|
||||
nact = ac_space.n
|
||||
X = tf.placeholder(tf.uint8, ob_shape) #obs
|
||||
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
|
||||
S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
|
||||
with tf.variable_scope("model", reuse=reuse):
|
||||
h = nature_cnn(X)
|
||||
xs = batch_to_seq(h, nenv, nsteps)
|
||||
h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
|
||||
h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
|
||||
h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
|
||||
h3 = conv_to_fc(h3)
|
||||
h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
|
||||
xs = batch_to_seq(h4, nenv, nsteps)
|
||||
ms = batch_to_seq(M, nenv, nsteps)
|
||||
h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
|
||||
h5 = seq_to_batch(h5)
|
||||
pi = fc(h5, 'pi', nact)
|
||||
vf = fc(h5, 'v', 1)
|
||||
|
||||
self.pdtype = make_pdtype(ac_space)
|
||||
self.pd = self.pdtype.pdfromflat(pi)
|
||||
pi = fc(h5, 'pi', nact, act=lambda x:x)
|
||||
vf = fc(h5, 'v', 1, act=lambda x:x)
|
||||
|
||||
v0 = vf[:, 0]
|
||||
a0 = self.pd.sample()
|
||||
neglogp0 = self.pd.neglogp(a0)
|
||||
a0 = sample(pi)
|
||||
self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
|
||||
|
||||
def step(ob, state, mask):
|
||||
return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})
|
||||
a, v, s = sess.run([a0, v0, snew], {X:ob, S:state, M:mask})
|
||||
return a, v, s
|
||||
|
||||
def value(ob, state, mask):
|
||||
return sess.run(v0, {X:ob, S:state, M:mask})
|
||||
@@ -57,34 +45,34 @@ class LnLstmPolicy(object):
|
||||
|
||||
class LstmPolicy(object):
|
||||
|
||||
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
|
||||
nenv = nbatch // nsteps
|
||||
|
||||
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=256, reuse=False):
|
||||
nbatch = nenv*nsteps
|
||||
nh, nw, nc = ob_space.shape
|
||||
ob_shape = (nbatch, nh, nw, nc)
|
||||
ob_shape = (nbatch, nh, nw, nc*nstack)
|
||||
nact = ac_space.n
|
||||
X = tf.placeholder(tf.uint8, ob_shape) #obs
|
||||
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
|
||||
S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
|
||||
with tf.variable_scope("model", reuse=reuse):
|
||||
h = nature_cnn(X)
|
||||
xs = batch_to_seq(h, nenv, nsteps)
|
||||
h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
|
||||
h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
|
||||
h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
|
||||
h3 = conv_to_fc(h3)
|
||||
h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
|
||||
xs = batch_to_seq(h4, nenv, nsteps)
|
||||
ms = batch_to_seq(M, nenv, nsteps)
|
||||
h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
|
||||
h5 = seq_to_batch(h5)
|
||||
pi = fc(h5, 'pi', nact)
|
||||
vf = fc(h5, 'v', 1)
|
||||
|
||||
self.pdtype = make_pdtype(ac_space)
|
||||
self.pd = self.pdtype.pdfromflat(pi)
|
||||
pi = fc(h5, 'pi', nact, act=lambda x:x)
|
||||
vf = fc(h5, 'v', 1, act=lambda x:x)
|
||||
|
||||
v0 = vf[:, 0]
|
||||
a0 = self.pd.sample()
|
||||
neglogp0 = self.pd.neglogp(a0)
|
||||
a0 = sample(pi)
|
||||
self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
|
||||
|
||||
def step(ob, state, mask):
|
||||
return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})
|
||||
a, v, s = sess.run([a0, v0, snew], {X:ob, S:state, M:mask})
|
||||
return a, v, s
|
||||
|
||||
def value(ob, state, mask):
|
||||
return sess.run(v0, {X:ob, S:state, M:mask})
|
||||
@@ -99,67 +87,31 @@ class LstmPolicy(object):
|
||||
|
||||
class CnnPolicy(object):
|
||||
|
||||
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
|
||||
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
|
||||
nbatch = nenv*nsteps
|
||||
nh, nw, nc = ob_space.shape
|
||||
ob_shape = (nbatch, nh, nw, nc)
|
||||
ob_shape = (nbatch, nh, nw, nc*nstack)
|
||||
nact = ac_space.n
|
||||
X = tf.placeholder(tf.uint8, ob_shape) #obs
|
||||
with tf.variable_scope("model", reuse=reuse):
|
||||
h = nature_cnn(X)
|
||||
pi = fc(h, 'pi', nact, init_scale=0.01)
|
||||
vf = fc(h, 'v', 1)[:,0]
|
||||
h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
|
||||
h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
|
||||
h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
|
||||
h3 = conv_to_fc(h3)
|
||||
h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
|
||||
pi = fc(h4, 'pi', nact, act=lambda x:x)
|
||||
vf = fc(h4, 'v', 1, act=lambda x:x)
|
||||
|
||||
self.pdtype = make_pdtype(ac_space)
|
||||
self.pd = self.pdtype.pdfromflat(pi)
|
||||
|
||||
a0 = self.pd.sample()
|
||||
neglogp0 = self.pd.neglogp(a0)
|
||||
self.initial_state = None
|
||||
v0 = vf[:, 0]
|
||||
a0 = sample(pi)
|
||||
self.initial_state = [] #not stateful
|
||||
|
||||
def step(ob, *_args, **_kwargs):
|
||||
a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
|
||||
return a, v, self.initial_state, neglogp
|
||||
a, v = sess.run([a0, v0], {X:ob})
|
||||
return a, v, [] #dummy state
|
||||
|
||||
def value(ob, *_args, **_kwargs):
|
||||
return sess.run(vf, {X:ob})
|
||||
|
||||
self.X = X
|
||||
self.pi = pi
|
||||
self.vf = vf
|
||||
self.step = step
|
||||
self.value = value
|
||||
|
||||
class MlpPolicy(object):
|
||||
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
|
||||
ob_shape = (nbatch,) + ob_space.shape
|
||||
actdim = ac_space.shape[0]
|
||||
X = tf.placeholder(tf.float32, ob_shape, name='Ob') #obs
|
||||
with tf.variable_scope("model", reuse=reuse):
|
||||
activ = tf.tanh
|
||||
h1 = activ(fc(X, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
|
||||
h2 = activ(fc(h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
|
||||
pi = fc(h2, 'pi', actdim, init_scale=0.01)
|
||||
h1 = activ(fc(X, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
|
||||
h2 = activ(fc(h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
|
||||
vf = fc(h2, 'vf', 1)[:,0]
|
||||
logstd = tf.get_variable(name="logstd", shape=[1, actdim],
|
||||
initializer=tf.zeros_initializer())
|
||||
|
||||
pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)
|
||||
|
||||
self.pdtype = make_pdtype(ac_space)
|
||||
self.pd = self.pdtype.pdfromflat(pdparam)
|
||||
|
||||
a0 = self.pd.sample()
|
||||
neglogp0 = self.pd.neglogp(a0)
|
||||
self.initial_state = None
|
||||
|
||||
def step(ob, *_args, **_kwargs):
|
||||
a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
|
||||
return a, v, self.initial_state, neglogp
|
||||
|
||||
def value(ob, *_args, **_kwargs):
|
||||
return sess.run(vf, {X:ob})
|
||||
return sess.run(v0, {X:ob})
|
||||
|
||||
self.X = X
|
||||
self.pi = pi
|
||||
|
@@ -1,30 +1,45 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
#!/usr/bin/env python
|
||||
import os, logging, gym
|
||||
from baselines import logger
|
||||
from baselines.common.cmd_util import make_atari_env, atari_arg_parser
|
||||
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
|
||||
from baselines.common import set_global_seeds
|
||||
from baselines import bench
|
||||
from baselines.a2c.a2c import learn
|
||||
from baselines.ppo2.policies import CnnPolicy, LstmPolicy, LnLstmPolicy
|
||||
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
|
||||
from baselines.common.atari_wrappers import make_atari, wrap_deepmind
|
||||
from baselines.a2c.policies import CnnPolicy, LstmPolicy, LnLstmPolicy
|
||||
|
||||
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env):
|
||||
def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu):
|
||||
def make_env(rank):
|
||||
def _thunk():
|
||||
env = make_atari(env_id)
|
||||
env.seed(seed + rank)
|
||||
env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
|
||||
gym.logger.setLevel(logging.WARN)
|
||||
return wrap_deepmind(env)
|
||||
return _thunk
|
||||
set_global_seeds(seed)
|
||||
env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
|
||||
if policy == 'cnn':
|
||||
policy_fn = CnnPolicy
|
||||
elif policy == 'lstm':
|
||||
policy_fn = LstmPolicy
|
||||
elif policy == 'lnlstm':
|
||||
policy_fn = LnLstmPolicy
|
||||
env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
|
||||
learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule)
|
||||
env.close()
|
||||
|
||||
def main():
|
||||
parser = atari_arg_parser()
|
||||
import argparse
|
||||
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
|
||||
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
|
||||
parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm'], default='cnn')
|
||||
parser.add_argument('--lrschedule', help='Learning rate schedule', choices=['constant', 'linear'], default='constant')
|
||||
parser.add_argument('--num-timesteps', type=int, default=int(10e6))
|
||||
args = parser.parse_args()
|
||||
logger.configure()
|
||||
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed,
|
||||
policy=args.policy, lrschedule=args.lrschedule, num_env=16)
|
||||
policy=args.policy, lrschedule=args.lrschedule, num_cpu=16)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
@@ -39,31 +39,23 @@ def ortho_init(scale=1.0):
|
||||
return (scale * q[:shape[0], :shape[1]]).astype(np.float32)
|
||||
return _ortho_init
|
||||
|
||||
def conv(x, scope, *, nf, rf, stride, pad='VALID', init_scale=1.0, data_format='NHWC'):
|
||||
if data_format == 'NHWC':
|
||||
channel_ax = 3
|
||||
strides = [1, stride, stride, 1]
|
||||
bshape = [1, 1, 1, nf]
|
||||
elif data_format == 'NCHW':
|
||||
channel_ax = 1
|
||||
strides = [1, 1, stride, stride]
|
||||
bshape = [1, nf, 1, 1]
|
||||
else:
|
||||
raise NotImplementedError
|
||||
nin = x.get_shape()[channel_ax].value
|
||||
wshape = [rf, rf, nin, nf]
|
||||
def conv(x, scope, nf, rf, stride, pad='VALID', act=tf.nn.relu, init_scale=1.0):
|
||||
with tf.variable_scope(scope):
|
||||
w = tf.get_variable("w", wshape, initializer=ortho_init(init_scale))
|
||||
b = tf.get_variable("b", [1, nf, 1, 1], initializer=tf.constant_initializer(0.0))
|
||||
if data_format == 'NHWC': b = tf.reshape(b, bshape)
|
||||
return b + tf.nn.conv2d(x, w, strides=strides, padding=pad, data_format=data_format)
|
||||
nin = x.get_shape()[3].value
|
||||
w = tf.get_variable("w", [rf, rf, nin, nf], initializer=ortho_init(init_scale))
|
||||
b = tf.get_variable("b", [nf], initializer=tf.constant_initializer(0.0))
|
||||
z = tf.nn.conv2d(x, w, strides=[1, stride, stride, 1], padding=pad)+b
|
||||
h = act(z)
|
||||
return h
|
||||
|
||||
def fc(x, scope, nh, *, init_scale=1.0, init_bias=0.0):
|
||||
def fc(x, scope, nh, act=tf.nn.relu, init_scale=1.0):
|
||||
with tf.variable_scope(scope):
|
||||
nin = x.get_shape()[1].value
|
||||
w = tf.get_variable("w", [nin, nh], initializer=ortho_init(init_scale))
|
||||
b = tf.get_variable("b", [nh], initializer=tf.constant_initializer(init_bias))
|
||||
return tf.matmul(x, w)+b
|
||||
b = tf.get_variable("b", [nh], initializer=tf.constant_initializer(0.0))
|
||||
z = tf.matmul(x, w)+b
|
||||
h = act(z)
|
||||
return h
|
||||
|
||||
def batch_to_seq(h, nbatch, nsteps, flat=False):
|
||||
if flat:
|
||||
@@ -170,34 +162,9 @@ def constant(p):
|
||||
def linear(p):
|
||||
return 1-p
|
||||
|
||||
def middle_drop(p):
|
||||
eps = 0.75
|
||||
if 1-p<eps:
|
||||
return eps*0.1
|
||||
return 1-p
|
||||
|
||||
def double_linear_con(p):
|
||||
p *= 2
|
||||
eps = 0.125
|
||||
if 1-p<eps:
|
||||
return eps
|
||||
return 1-p
|
||||
|
||||
def double_middle_drop(p):
|
||||
eps1 = 0.75
|
||||
eps2 = 0.25
|
||||
if 1-p<eps1:
|
||||
if 1-p<eps2:
|
||||
return eps2*0.5
|
||||
return eps1*0.1
|
||||
return 1-p
|
||||
|
||||
schedules = {
|
||||
'linear':linear,
|
||||
'constant':constant,
|
||||
'double_linear_con': double_linear_con,
|
||||
'middle_drop': middle_drop,
|
||||
'double_middle_drop': double_middle_drop
|
||||
'constant':constant
|
||||
}
|
||||
|
||||
class Scheduler(object):
|
||||
@@ -271,7 +238,7 @@ def check_shape(ts,shapes):
|
||||
def avg_norm(t):
|
||||
return tf.reduce_mean(tf.sqrt(tf.reduce_sum(tf.square(t), axis=-1)))
|
||||
|
||||
def gradient_add(g1, g2, param):
|
||||
def myadd(g1, g2, param):
|
||||
print([g1, g2, param.name])
|
||||
assert (not (g1 is None and g2 is None)), param.name
|
||||
if g1 is None:
|
||||
@@ -281,7 +248,7 @@ def gradient_add(g1, g2, param):
|
||||
else:
|
||||
return g1 + g2
|
||||
|
||||
def q_explained_variance(qpred, q):
|
||||
def my_explained_variance(qpred, q):
|
||||
_, vary = tf.nn.moments(q, axes=[0, 1])
|
||||
_, varpred = tf.nn.moments(q - qpred, axes=[0, 1])
|
||||
check_shape([vary, varpred], [[]] * 2)
|
||||
|
@@ -1,4 +0,0 @@
|
||||
# ACER
|
||||
|
||||
- Original paper: https://arxiv.org/abs/1611.01224
|
||||
- `python -m baselines.acer.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options.
|
@@ -1,349 +0,0 @@
|
||||
import time
|
||||
import joblib
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from baselines import logger
|
||||
|
||||
from baselines.common import set_global_seeds
|
||||
|
||||
from baselines.a2c.utils import batch_to_seq, seq_to_batch
|
||||
from baselines.a2c.utils import Scheduler, make_path, find_trainable_variables
|
||||
from baselines.a2c.utils import cat_entropy_softmax
|
||||
from baselines.a2c.utils import EpisodeStats
|
||||
from baselines.a2c.utils import get_by_index, check_shape, avg_norm, gradient_add, q_explained_variance
|
||||
from baselines.acer.buffer import Buffer
|
||||
|
||||
# remove last step
|
||||
def strip(var, nenvs, nsteps, flat = False):
|
||||
vars = batch_to_seq(var, nenvs, nsteps + 1, flat)
|
||||
return seq_to_batch(vars[:-1], flat)
|
||||
|
||||
def q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma):
|
||||
"""
|
||||
Calculates q_retrace targets
|
||||
|
||||
:param R: Rewards
|
||||
:param D: Dones
|
||||
:param q_i: Q values for actions taken
|
||||
:param v: V values
|
||||
:param rho_i: Importance weight for each action
|
||||
:return: Q_retrace values
|
||||
"""
|
||||
rho_bar = batch_to_seq(tf.minimum(1.0, rho_i), nenvs, nsteps, True) # list of len steps, shape [nenvs]
|
||||
rs = batch_to_seq(R, nenvs, nsteps, True) # list of len steps, shape [nenvs]
|
||||
ds = batch_to_seq(D, nenvs, nsteps, True) # list of len steps, shape [nenvs]
|
||||
q_is = batch_to_seq(q_i, nenvs, nsteps, True)
|
||||
vs = batch_to_seq(v, nenvs, nsteps + 1, True)
|
||||
v_final = vs[-1]
|
||||
qret = v_final
|
||||
qrets = []
|
||||
for i in range(nsteps - 1, -1, -1):
|
||||
check_shape([qret, ds[i], rs[i], rho_bar[i], q_is[i], vs[i]], [[nenvs]] * 6)
|
||||
qret = rs[i] + gamma * qret * (1.0 - ds[i])
|
||||
qrets.append(qret)
|
||||
qret = (rho_bar[i] * (qret - q_is[i])) + vs[i]
|
||||
qrets = qrets[::-1]
|
||||
qret = seq_to_batch(qrets, flat=True)
|
||||
return qret
|
||||
|
||||
# For ACER with PPO clipping instead of trust region
|
||||
# def clip(ratio, eps_clip):
|
||||
# # assume 0 <= eps_clip <= 1
|
||||
# return tf.minimum(1 + eps_clip, tf.maximum(1 - eps_clip, ratio))
|
||||
|
||||
class Model(object):
|
||||
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs,
|
||||
ent_coef, q_coef, gamma, max_grad_norm, lr,
|
||||
rprop_alpha, rprop_epsilon, total_timesteps, lrschedule,
|
||||
c, trust_region, alpha, delta):
|
||||
config = tf.ConfigProto(allow_soft_placement=True,
|
||||
intra_op_parallelism_threads=num_procs,
|
||||
inter_op_parallelism_threads=num_procs)
|
||||
sess = tf.Session(config=config)
|
||||
nact = ac_space.n
|
||||
nbatch = nenvs * nsteps
|
||||
|
||||
A = tf.placeholder(tf.int32, [nbatch]) # actions
|
||||
D = tf.placeholder(tf.float32, [nbatch]) # dones
|
||||
R = tf.placeholder(tf.float32, [nbatch]) # rewards, not returns
|
||||
MU = tf.placeholder(tf.float32, [nbatch, nact]) # mu's
|
||||
LR = tf.placeholder(tf.float32, [])
|
||||
eps = 1e-6
|
||||
|
||||
step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
|
||||
train_model = policy(sess, ob_space, ac_space, nenvs, nsteps + 1, nstack, reuse=True)
|
||||
|
||||
params = find_trainable_variables("model")
|
||||
print("Params {}".format(len(params)))
|
||||
for var in params:
|
||||
print(var)
|
||||
|
||||
# create polyak averaged model
|
||||
ema = tf.train.ExponentialMovingAverage(alpha)
|
||||
ema_apply_op = ema.apply(params)
|
||||
|
||||
def custom_getter(getter, *args, **kwargs):
|
||||
v = ema.average(getter(*args, **kwargs))
|
||||
print(v.name)
|
||||
return v
|
||||
|
||||
with tf.variable_scope("", custom_getter=custom_getter, reuse=True):
|
||||
polyak_model = policy(sess, ob_space, ac_space, nenvs, nsteps + 1, nstack, reuse=True)
|
||||
|
||||
# Notation: (var) = batch variable, (var)s = seqeuence variable, (var)_i = variable index by action at step i
|
||||
v = tf.reduce_sum(train_model.pi * train_model.q, axis = -1) # shape is [nenvs * (nsteps + 1)]
|
||||
|
||||
# strip off last step
|
||||
f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [train_model.pi, polyak_model.pi, train_model.q])
|
||||
# Get pi and q values for actions taken
|
||||
f_i = get_by_index(f, A)
|
||||
q_i = get_by_index(q, A)
|
||||
|
||||
# Compute ratios for importance truncation
|
||||
rho = f / (MU + eps)
|
||||
rho_i = get_by_index(rho, A)
|
||||
|
||||
# Calculate Q_retrace targets
|
||||
qret = q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma)
|
||||
|
||||
# Calculate losses
|
||||
# Entropy
|
||||
entropy = tf.reduce_mean(cat_entropy_softmax(f))
|
||||
|
||||
# Policy Graident loss, with truncated importance sampling & bias correction
|
||||
v = strip(v, nenvs, nsteps, True)
|
||||
check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4)
|
||||
check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 2)
|
||||
|
||||
# Truncated importance sampling
|
||||
adv = qret - v
|
||||
logf = tf.log(f_i + eps)
|
||||
gain_f = logf * tf.stop_gradient(adv * tf.minimum(c, rho_i)) # [nenvs * nsteps]
|
||||
loss_f = -tf.reduce_mean(gain_f)
|
||||
|
||||
# Bias correction for the truncation
|
||||
adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1])) # [nenvs * nsteps, nact]
|
||||
logf_bc = tf.log(f + eps) # / (f_old + eps)
|
||||
check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]]*2)
|
||||
gain_bc = tf.reduce_sum(logf_bc * tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f), axis = 1) #IMP: This is sum, as expectation wrt f
|
||||
loss_bc= -tf.reduce_mean(gain_bc)
|
||||
|
||||
loss_policy = loss_f + loss_bc
|
||||
|
||||
# Value/Q function loss, and explained variance
|
||||
check_shape([qret, q_i], [[nenvs * nsteps]]*2)
|
||||
ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]), tf.reshape(qret, [nenvs, nsteps]))
|
||||
loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i)*0.5)
|
||||
|
||||
# Net loss
|
||||
check_shape([loss_policy, loss_q, entropy], [[]] * 3)
|
||||
loss = loss_policy + q_coef * loss_q - ent_coef * entropy
|
||||
|
||||
if trust_region:
|
||||
g = tf.gradients(- (loss_policy - ent_coef * entropy) * nsteps * nenvs, f) #[nenvs * nsteps, nact]
|
||||
# k = tf.gradients(KL(f_pol || f), f)
|
||||
k = - f_pol / (f + eps) #[nenvs * nsteps, nact] # Directly computed gradient of KL divergence wrt f
|
||||
k_dot_g = tf.reduce_sum(k * g, axis=-1)
|
||||
adj = tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) / (tf.reduce_sum(tf.square(k), axis=-1) + eps)) #[nenvs * nsteps]
|
||||
|
||||
# Calculate stats (before doing adjustment) for logging.
|
||||
avg_norm_k = avg_norm(k)
|
||||
avg_norm_g = avg_norm(g)
|
||||
avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g))
|
||||
avg_norm_adj = tf.reduce_mean(tf.abs(adj))
|
||||
|
||||
g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k
|
||||
grads_f = -g/(nenvs*nsteps) # These are turst region adjusted gradients wrt f ie statistics of policy pi
|
||||
grads_policy = tf.gradients(f, params, grads_f)
|
||||
grads_q = tf.gradients(loss_q * q_coef, params)
|
||||
grads = [gradient_add(g1, g2, param) for (g1, g2, param) in zip(grads_policy, grads_q, params)]
|
||||
|
||||
avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs)
|
||||
norm_grads_q = tf.global_norm(grads_q)
|
||||
norm_grads_policy = tf.global_norm(grads_policy)
|
||||
else:
|
||||
grads = tf.gradients(loss, params)
|
||||
|
||||
if max_grad_norm is not None:
|
||||
grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm)
|
||||
grads = list(zip(grads, params))
|
||||
trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=rprop_alpha, epsilon=rprop_epsilon)
|
||||
_opt_op = trainer.apply_gradients(grads)
|
||||
|
||||
# so when you call _train, you first do the gradient step, then you apply ema
|
||||
with tf.control_dependencies([_opt_op]):
|
||||
_train = tf.group(ema_apply_op)
|
||||
|
||||
lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)
|
||||
|
||||
# Ops/Summaries to run, and their names for logging
|
||||
run_ops = [_train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev, norm_grads]
|
||||
names_ops = ['loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance',
|
||||
'norm_grads']
|
||||
if trust_region:
|
||||
run_ops = run_ops + [norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, avg_norm_k_dot_g,
|
||||
avg_norm_adj]
|
||||
names_ops = names_ops + ['norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g',
|
||||
'avg_norm_k_dot_g', 'avg_norm_adj']
|
||||
|
||||
def train(obs, actions, rewards, dones, mus, states, masks, steps):
|
||||
cur_lr = lr.value_steps(steps)
|
||||
td_map = {train_model.X: obs, polyak_model.X: obs, A: actions, R: rewards, D: dones, MU: mus, LR: cur_lr}
|
||||
if states != []:
|
||||
td_map[train_model.S] = states
|
||||
td_map[train_model.M] = masks
|
||||
td_map[polyak_model.S] = states
|
||||
td_map[polyak_model.M] = masks
|
||||
return names_ops, sess.run(run_ops, td_map)[1:] # strip off _train
|
||||
|
||||
def save(save_path):
|
||||
ps = sess.run(params)
|
||||
make_path(osp.dirname(save_path))
|
||||
joblib.dump(ps, save_path)
|
||||
|
||||
self.train = train
|
||||
self.save = save
|
||||
self.train_model = train_model
|
||||
self.step_model = step_model
|
||||
self.step = step_model.step
|
||||
self.initial_state = step_model.initial_state
|
||||
tf.global_variables_initializer().run(session=sess)
|
||||
|
||||
class Runner(object):
|
||||
def __init__(self, env, model, nsteps, nstack):
|
||||
self.env = env
|
||||
self.nstack = nstack
|
||||
self.model = model
|
||||
nh, nw, nc = env.observation_space.shape
|
||||
self.nc = nc # nc = 1 for atari, but just in case
|
||||
self.nenv = nenv = env.num_envs
|
||||
self.nact = env.action_space.n
|
||||
self.nbatch = nenv * nsteps
|
||||
self.batch_ob_shape = (nenv*(nsteps+1), nh, nw, nc*nstack)
|
||||
self.obs = np.zeros((nenv, nh, nw, nc * nstack), dtype=np.uint8)
|
||||
obs = env.reset()
|
||||
self.update_obs(obs)
|
||||
self.nsteps = nsteps
|
||||
self.states = model.initial_state
|
||||
self.dones = [False for _ in range(nenv)]
|
||||
|
||||
def update_obs(self, obs, dones=None):
|
||||
if dones is not None:
|
||||
self.obs *= (1 - dones.astype(np.uint8))[:, None, None, None]
|
||||
self.obs = np.roll(self.obs, shift=-self.nc, axis=3)
|
||||
self.obs[:, :, :, -self.nc:] = obs[:, :, :, :]
|
||||
|
||||
def run(self):
|
||||
enc_obs = np.split(self.obs, self.nstack, axis=3) # so now list of obs steps
|
||||
mb_obs, mb_actions, mb_mus, mb_dones, mb_rewards = [], [], [], [], []
|
||||
for _ in range(self.nsteps):
|
||||
actions, mus, states = self.model.step(self.obs, state=self.states, mask=self.dones)
|
||||
mb_obs.append(np.copy(self.obs))
|
||||
mb_actions.append(actions)
|
||||
mb_mus.append(mus)
|
||||
mb_dones.append(self.dones)
|
||||
obs, rewards, dones, _ = self.env.step(actions)
|
||||
# states information for statefull models like LSTM
|
||||
self.states = states
|
||||
self.dones = dones
|
||||
self.update_obs(obs, dones)
|
||||
mb_rewards.append(rewards)
|
||||
enc_obs.append(obs)
|
||||
mb_obs.append(np.copy(self.obs))
|
||||
mb_dones.append(self.dones)
|
||||
|
||||
enc_obs = np.asarray(enc_obs, dtype=np.uint8).swapaxes(1, 0)
|
||||
mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0)
|
||||
mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
|
||||
mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
|
||||
mb_mus = np.asarray(mb_mus, dtype=np.float32).swapaxes(1, 0)
|
||||
|
||||
mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
|
||||
|
||||
mb_masks = mb_dones # Used for statefull models like LSTM's to mask state when done
|
||||
mb_dones = mb_dones[:, 1:] # Used for calculating returns. The dones array is now aligned with rewards
|
||||
|
||||
# shapes are now [nenv, nsteps, []]
|
||||
# When pulling from buffer, arrays will now be reshaped in place, preventing a deep copy.
|
||||
|
||||
return enc_obs, mb_obs, mb_actions, mb_rewards, mb_mus, mb_dones, mb_masks
|
||||
|
||||
class Acer():
|
||||
def __init__(self, runner, model, buffer, log_interval):
|
||||
self.runner = runner
|
||||
self.model = model
|
||||
self.buffer = buffer
|
||||
self.log_interval = log_interval
|
||||
self.tstart = None
|
||||
self.episode_stats = EpisodeStats(runner.nsteps, runner.nenv)
|
||||
self.steps = None
|
||||
|
||||
def call(self, on_policy):
|
||||
runner, model, buffer, steps = self.runner, self.model, self.buffer, self.steps
|
||||
if on_policy:
|
||||
enc_obs, obs, actions, rewards, mus, dones, masks = runner.run()
|
||||
self.episode_stats.feed(rewards, dones)
|
||||
if buffer is not None:
|
||||
buffer.put(enc_obs, actions, rewards, mus, dones, masks)
|
||||
else:
|
||||
# get obs, actions, rewards, mus, dones from buffer.
|
||||
obs, actions, rewards, mus, dones, masks = buffer.get()
|
||||
|
||||
# reshape stuff correctly
|
||||
obs = obs.reshape(runner.batch_ob_shape)
|
||||
actions = actions.reshape([runner.nbatch])
|
||||
rewards = rewards.reshape([runner.nbatch])
|
||||
mus = mus.reshape([runner.nbatch, runner.nact])
|
||||
dones = dones.reshape([runner.nbatch])
|
||||
masks = masks.reshape([runner.batch_ob_shape[0]])
|
||||
|
||||
names_ops, values_ops = model.train(obs, actions, rewards, dones, mus, model.initial_state, masks, steps)
|
||||
|
||||
if on_policy and (int(steps/runner.nbatch) % self.log_interval == 0):
|
||||
logger.record_tabular("total_timesteps", steps)
|
||||
logger.record_tabular("fps", int(steps/(time.time() - self.tstart)))
|
||||
# IMP: In EpisodicLife env, during training, we get done=True at each loss of life, not just at the terminal state.
|
||||
# Thus, this is mean until end of life, not end of episode.
|
||||
# For true episode rewards, see the monitor files in the log folder.
|
||||
logger.record_tabular("mean_episode_length", self.episode_stats.mean_length())
|
||||
logger.record_tabular("mean_episode_reward", self.episode_stats.mean_reward())
|
||||
for name, val in zip(names_ops, values_ops):
|
||||
logger.record_tabular(name, float(val))
|
||||
logger.dump_tabular()
|
||||
|
||||
|
||||
def learn(policy, env, seed, nsteps=20, nstack=4, total_timesteps=int(80e6), q_coef=0.5, ent_coef=0.01,
|
||||
max_grad_norm=10, lr=7e-4, lrschedule='linear', rprop_epsilon=1e-5, rprop_alpha=0.99, gamma=0.99,
|
||||
log_interval=100, buffer_size=50000, replay_ratio=4, replay_start=10000, c=10.0,
|
||||
trust_region=True, alpha=0.99, delta=1):
|
||||
print("Running Acer Simple")
|
||||
print(locals())
|
||||
tf.reset_default_graph()
|
||||
set_global_seeds(seed)
|
||||
|
||||
nenvs = env.num_envs
|
||||
ob_space = env.observation_space
|
||||
ac_space = env.action_space
|
||||
num_procs = len(env.remotes) # HACK
|
||||
model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, nstack=nstack,
|
||||
num_procs=num_procs, ent_coef=ent_coef, q_coef=q_coef, gamma=gamma,
|
||||
max_grad_norm=max_grad_norm, lr=lr, rprop_alpha=rprop_alpha, rprop_epsilon=rprop_epsilon,
|
||||
total_timesteps=total_timesteps, lrschedule=lrschedule, c=c,
|
||||
trust_region=trust_region, alpha=alpha, delta=delta)
|
||||
|
||||
runner = Runner(env=env, model=model, nsteps=nsteps, nstack=nstack)
|
||||
if replay_ratio > 0:
|
||||
buffer = Buffer(env=env, nsteps=nsteps, nstack=nstack, size=buffer_size)
|
||||
else:
|
||||
buffer = None
|
||||
nbatch = nenvs*nsteps
|
||||
acer = Acer(runner, model, buffer, log_interval)
|
||||
acer.tstart = time.time()
|
||||
for acer.steps in range(0, total_timesteps, nbatch): #nbatch samples, 1 on_policy call and multiple off-policy calls
|
||||
acer.call(on_policy=True)
|
||||
if replay_ratio > 0 and buffer.has_atleast(replay_start):
|
||||
n = np.random.poisson(replay_ratio)
|
||||
for _ in range(n):
|
||||
acer.call(on_policy=False) # no simulation steps in this
|
||||
|
||||
env.close()
|
@@ -1,103 +0,0 @@
|
||||
import numpy as np
|
||||
|
||||
class Buffer(object):
|
||||
# gets obs, actions, rewards, mu's, (states, masks), dones
|
||||
def __init__(self, env, nsteps, nstack, size=50000):
|
||||
self.nenv = env.num_envs
|
||||
self.nsteps = nsteps
|
||||
self.nh, self.nw, self.nc = env.observation_space.shape
|
||||
self.nstack = nstack
|
||||
self.nbatch = self.nenv * self.nsteps
|
||||
self.size = size // (self.nsteps) # Each loc contains nenv * nsteps frames, thus total buffer is nenv * size frames
|
||||
|
||||
# Memory
|
||||
self.enc_obs = None
|
||||
self.actions = None
|
||||
self.rewards = None
|
||||
self.mus = None
|
||||
self.dones = None
|
||||
self.masks = None
|
||||
|
||||
# Size indexes
|
||||
self.next_idx = 0
|
||||
self.num_in_buffer = 0
|
||||
|
||||
def has_atleast(self, frames):
|
||||
# Frames per env, so total (nenv * frames) Frames needed
|
||||
# Each buffer loc has nenv * nsteps frames
|
||||
return self.num_in_buffer >= (frames // self.nsteps)
|
||||
|
||||
def can_sample(self):
|
||||
return self.num_in_buffer > 0
|
||||
|
||||
# Generate stacked frames
|
||||
def decode(self, enc_obs, dones):
|
||||
# enc_obs has shape [nenvs, nsteps + nstack, nh, nw, nc]
|
||||
# dones has shape [nenvs, nsteps, nh, nw, nc]
|
||||
# returns stacked obs of shape [nenv, (nsteps + 1), nh, nw, nstack*nc]
|
||||
nstack, nenv, nsteps, nh, nw, nc = self.nstack, self.nenv, self.nsteps, self.nh, self.nw, self.nc
|
||||
y = np.empty([nsteps + nstack - 1, nenv, 1, 1, 1], dtype=np.float32)
|
||||
obs = np.zeros([nstack, nsteps + nstack, nenv, nh, nw, nc], dtype=np.uint8)
|
||||
x = np.reshape(enc_obs, [nenv, nsteps + nstack, nh, nw, nc]).swapaxes(1,
|
||||
0) # [nsteps + nstack, nenv, nh, nw, nc]
|
||||
y[3:] = np.reshape(1.0 - dones, [nenv, nsteps, 1, 1, 1]).swapaxes(1, 0) # keep
|
||||
y[:3] = 1.0
|
||||
# y = np.reshape(1 - dones, [nenvs, nsteps, 1, 1, 1])
|
||||
for i in range(nstack):
|
||||
obs[-(i + 1), i:] = x
|
||||
# obs[:,i:,:,:,-(i+1),:] = x
|
||||
x = x[:-1] * y
|
||||
y = y[1:]
|
||||
return np.reshape(obs[:, 3:].transpose((2, 1, 3, 4, 0, 5)), [nenv, (nsteps + 1), nh, nw, nstack * nc])
|
||||
|
||||
def put(self, enc_obs, actions, rewards, mus, dones, masks):
|
||||
# enc_obs [nenv, (nsteps + nstack), nh, nw, nc]
|
||||
# actions, rewards, dones [nenv, nsteps]
|
||||
# mus [nenv, nsteps, nact]
|
||||
|
||||
if self.enc_obs is None:
|
||||
self.enc_obs = np.empty([self.size] + list(enc_obs.shape), dtype=np.uint8)
|
||||
self.actions = np.empty([self.size] + list(actions.shape), dtype=np.int32)
|
||||
self.rewards = np.empty([self.size] + list(rewards.shape), dtype=np.float32)
|
||||
self.mus = np.empty([self.size] + list(mus.shape), dtype=np.float32)
|
||||
self.dones = np.empty([self.size] + list(dones.shape), dtype=np.bool)
|
||||
self.masks = np.empty([self.size] + list(masks.shape), dtype=np.bool)
|
||||
|
||||
self.enc_obs[self.next_idx] = enc_obs
|
||||
self.actions[self.next_idx] = actions
|
||||
self.rewards[self.next_idx] = rewards
|
||||
self.mus[self.next_idx] = mus
|
||||
self.dones[self.next_idx] = dones
|
||||
self.masks[self.next_idx] = masks
|
||||
|
||||
self.next_idx = (self.next_idx + 1) % self.size
|
||||
self.num_in_buffer = min(self.size, self.num_in_buffer + 1)
|
||||
|
||||
def take(self, x, idx, envx):
|
||||
nenv = self.nenv
|
||||
out = np.empty([nenv] + list(x.shape[2:]), dtype=x.dtype)
|
||||
for i in range(nenv):
|
||||
out[i] = x[idx[i], envx[i]]
|
||||
return out
|
||||
|
||||
def get(self):
|
||||
# returns
|
||||
# obs [nenv, (nsteps + 1), nh, nw, nstack*nc]
|
||||
# actions, rewards, dones [nenv, nsteps]
|
||||
# mus [nenv, nsteps, nact]
|
||||
nenv = self.nenv
|
||||
assert self.can_sample()
|
||||
|
||||
# Sample exactly one id per env. If you sample across envs, then higher correlation in samples from same env.
|
||||
idx = np.random.randint(0, self.num_in_buffer, nenv)
|
||||
envx = np.arange(nenv)
|
||||
|
||||
take = lambda x: self.take(x, idx, envx) # for i in range(nenv)], axis = 0)
|
||||
dones = take(self.dones)
|
||||
enc_obs = take(self.enc_obs)
|
||||
obs = self.decode(enc_obs, dones)
|
||||
actions = take(self.actions)
|
||||
rewards = take(self.rewards)
|
||||
mus = take(self.mus)
|
||||
masks = take(self.masks)
|
||||
return obs, actions, rewards, mus, dones, masks
|
@@ -1,79 +0,0 @@
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from baselines.ppo2.policies import nature_cnn
|
||||
from baselines.a2c.utils import fc, batch_to_seq, seq_to_batch, lstm, sample
|
||||
|
||||
|
||||
class AcerCnnPolicy(object):
|
||||
|
||||
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
|
||||
nbatch = nenv * nsteps
|
||||
nh, nw, nc = ob_space.shape
|
||||
ob_shape = (nbatch, nh, nw, nc * nstack)
|
||||
nact = ac_space.n
|
||||
X = tf.placeholder(tf.uint8, ob_shape) # obs
|
||||
with tf.variable_scope("model", reuse=reuse):
|
||||
h = nature_cnn(X)
|
||||
pi_logits = fc(h, 'pi', nact, init_scale=0.01)
|
||||
pi = tf.nn.softmax(pi_logits)
|
||||
q = fc(h, 'q', nact)
|
||||
|
||||
a = sample(pi_logits) # could change this to use self.pi instead
|
||||
self.initial_state = [] # not stateful
|
||||
self.X = X
|
||||
self.pi = pi # actual policy params now
|
||||
self.q = q
|
||||
|
||||
def step(ob, *args, **kwargs):
|
||||
# returns actions, mus, states
|
||||
a0, pi0 = sess.run([a, pi], {X: ob})
|
||||
return a0, pi0, [] # dummy state
|
||||
|
||||
def out(ob, *args, **kwargs):
|
||||
pi0, q0 = sess.run([pi, q], {X: ob})
|
||||
return pi0, q0
|
||||
|
||||
def act(ob, *args, **kwargs):
|
||||
return sess.run(a, {X: ob})
|
||||
|
||||
self.step = step
|
||||
self.out = out
|
||||
self.act = act
|
||||
|
||||
class AcerLstmPolicy(object):
|
||||
|
||||
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256):
|
||||
nbatch = nenv * nsteps
|
||||
nh, nw, nc = ob_space.shape
|
||||
ob_shape = (nbatch, nh, nw, nc * nstack)
|
||||
nact = ac_space.n
|
||||
X = tf.placeholder(tf.uint8, ob_shape) # obs
|
||||
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
|
||||
S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
|
||||
with tf.variable_scope("model", reuse=reuse):
|
||||
h = nature_cnn(X)
|
||||
|
||||
# lstm
|
||||
xs = batch_to_seq(h, nenv, nsteps)
|
||||
ms = batch_to_seq(M, nenv, nsteps)
|
||||
h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
|
||||
h5 = seq_to_batch(h5)
|
||||
|
||||
pi_logits = fc(h5, 'pi', nact, init_scale=0.01)
|
||||
pi = tf.nn.softmax(pi_logits)
|
||||
q = fc(h5, 'q', nact)
|
||||
|
||||
a = sample(pi_logits) # could change this to use self.pi instead
|
||||
self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
|
||||
self.X = X
|
||||
self.M = M
|
||||
self.S = S
|
||||
self.pi = pi # actual policy params now
|
||||
self.q = q
|
||||
|
||||
def step(ob, state, mask, *args, **kwargs):
|
||||
# returns actions, mus, states
|
||||
a0, pi0, s = sess.run([a, pi, snew], {X: ob, S: state, M: mask})
|
||||
return a0, pi0, s
|
||||
|
||||
self.step = step
|
@@ -1,30 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
from baselines import logger
|
||||
from baselines.acer.acer_simple import learn
|
||||
from baselines.acer.policies import AcerCnnPolicy, AcerLstmPolicy
|
||||
from baselines.common.cmd_util import make_atari_env, atari_arg_parser
|
||||
|
||||
def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu):
|
||||
env = make_atari_env(env_id, num_cpu, seed)
|
||||
if policy == 'cnn':
|
||||
policy_fn = AcerCnnPolicy
|
||||
elif policy == 'lstm':
|
||||
policy_fn = AcerLstmPolicy
|
||||
else:
|
||||
print("Policy {} not implemented".format(policy))
|
||||
return
|
||||
learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule)
|
||||
env.close()
|
||||
|
||||
def main():
|
||||
parser = atari_arg_parser()
|
||||
parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm'], default='cnn')
|
||||
parser.add_argument('--lrschedule', help='Learning rate schedule', choices=['constant', 'linear'], default='constant')
|
||||
parser.add_argument('--logdir', help ='Directory for logging')
|
||||
args = parser.parse_args()
|
||||
logger.configure(args.logdir)
|
||||
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed,
|
||||
policy=args.policy, lrschedule=args.lrschedule, num_cpu=16)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
@@ -1,10 +1,10 @@
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from baselines import logger
|
||||
import baselines.common as common
|
||||
from baselines import common
|
||||
from baselines.common import tf_util as U
|
||||
from baselines.acktr import kfac
|
||||
from baselines.common.filters import ZFilter
|
||||
from baselines.acktr.filters import ZFilter
|
||||
|
||||
def pathlength(path):
|
||||
return path["reward"].shape[0]# Loss function that we'll differentiate to get the policy gradient
|
||||
@@ -70,7 +70,7 @@ def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps,
|
||||
coord = tf.train.Coordinator()
|
||||
for qr in [q_runner, vf.q_runner]:
|
||||
assert (qr != None)
|
||||
enqueue_threads.extend(qr.create_threads(tf.get_default_session(), coord=coord, start=True))
|
||||
enqueue_threads.extend(qr.create_threads(U.get_session(), coord=coord, start=True))
|
||||
|
||||
i = 0
|
||||
timesteps_so_far = 0
|
||||
@@ -122,10 +122,10 @@ def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps,
|
||||
kl = policy.compute_kl(ob_no, oldac_dist)
|
||||
if kl > desired_kl * 2:
|
||||
logger.log("kl too high")
|
||||
tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval()
|
||||
U.eval(tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)))
|
||||
elif kl < desired_kl / 2:
|
||||
logger.log("kl too low")
|
||||
tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval()
|
||||
U.eval(tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)))
|
||||
else:
|
||||
logger.log("kl just right!")
|
||||
|
||||
|
@@ -7,17 +7,16 @@ from baselines import logger
|
||||
|
||||
from baselines.common import set_global_seeds, explained_variance
|
||||
|
||||
from baselines.a2c.a2c import Runner
|
||||
from baselines.a2c.utils import discount_with_dones
|
||||
from baselines.a2c.utils import Scheduler, find_trainable_variables
|
||||
from baselines.a2c.utils import cat_entropy, mse
|
||||
from baselines.acktr.utils import discount_with_dones
|
||||
from baselines.acktr.utils import Scheduler, find_trainable_variables
|
||||
from baselines.acktr.utils import cat_entropy, mse
|
||||
from baselines.acktr import kfac
|
||||
|
||||
|
||||
class Model(object):
|
||||
|
||||
def __init__(self, policy, ob_space, ac_space, nenvs,total_timesteps, nprocs=32, nsteps=20,
|
||||
ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
|
||||
nstack=4, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
|
||||
kfac_clip=0.001, lrschedule='linear'):
|
||||
config = tf.ConfigProto(allow_soft_placement=True,
|
||||
intra_op_parallelism_threads=nprocs,
|
||||
@@ -32,8 +31,8 @@ class Model(object):
|
||||
PG_LR = tf.placeholder(tf.float32, [])
|
||||
VF_LR = tf.placeholder(tf.float32, [])
|
||||
|
||||
self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
|
||||
self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True)
|
||||
self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
|
||||
self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)
|
||||
|
||||
logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
|
||||
self.logits = logits = train_model.pi
|
||||
@@ -72,7 +71,7 @@ class Model(object):
|
||||
cur_lr = self.lr.value()
|
||||
|
||||
td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, PG_LR:cur_lr}
|
||||
if states is not None:
|
||||
if states != []:
|
||||
td_map[train_model.S] = states
|
||||
td_map[train_model.M] = masks
|
||||
|
||||
@@ -105,8 +104,70 @@ class Model(object):
|
||||
self.initial_state = step_model.initial_state
|
||||
tf.global_variables_initializer().run(session=sess)
|
||||
|
||||
class Runner(object):
|
||||
|
||||
def __init__(self, env, model, nsteps, nstack, gamma):
|
||||
self.env = env
|
||||
self.model = model
|
||||
nh, nw, nc = env.observation_space.shape
|
||||
nenv = env.num_envs
|
||||
self.batch_ob_shape = (nenv*nsteps, nh, nw, nc*nstack)
|
||||
self.obs = np.zeros((nenv, nh, nw, nc*nstack), dtype=np.uint8)
|
||||
obs = env.reset()
|
||||
self.update_obs(obs)
|
||||
self.gamma = gamma
|
||||
self.nsteps = nsteps
|
||||
self.states = model.initial_state
|
||||
self.dones = [False for _ in range(nenv)]
|
||||
|
||||
def update_obs(self, obs):
|
||||
self.obs = np.roll(self.obs, shift=-1, axis=3)
|
||||
self.obs[:, :, :, -1] = obs[:, :, :, 0]
|
||||
|
||||
def run(self):
|
||||
mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
|
||||
mb_states = self.states
|
||||
for n in range(self.nsteps):
|
||||
actions, values, states = self.model.step(self.obs, self.states, self.dones)
|
||||
mb_obs.append(np.copy(self.obs))
|
||||
mb_actions.append(actions)
|
||||
mb_values.append(values)
|
||||
mb_dones.append(self.dones)
|
||||
obs, rewards, dones, _ = self.env.step(actions)
|
||||
self.states = states
|
||||
self.dones = dones
|
||||
for n, done in enumerate(dones):
|
||||
if done:
|
||||
self.obs[n] = self.obs[n]*0
|
||||
self.update_obs(obs)
|
||||
mb_rewards.append(rewards)
|
||||
mb_dones.append(self.dones)
|
||||
#batch of steps to batch of rollouts
|
||||
mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
|
||||
mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
|
||||
mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
|
||||
mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
|
||||
mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
|
||||
mb_masks = mb_dones[:, :-1]
|
||||
mb_dones = mb_dones[:, 1:]
|
||||
last_values = self.model.value(self.obs, self.states, self.dones).tolist()
|
||||
#discount/bootstrap off value fn
|
||||
for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
|
||||
rewards = rewards.tolist()
|
||||
dones = dones.tolist()
|
||||
if dones[-1] == 0:
|
||||
rewards = discount_with_dones(rewards+[value], dones+[0], self.gamma)[:-1]
|
||||
else:
|
||||
rewards = discount_with_dones(rewards, dones, self.gamma)
|
||||
mb_rewards[n] = rewards
|
||||
mb_rewards = mb_rewards.flatten()
|
||||
mb_actions = mb_actions.flatten()
|
||||
mb_values = mb_values.flatten()
|
||||
mb_masks = mb_masks.flatten()
|
||||
return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
|
||||
|
||||
def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20,
|
||||
ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
|
||||
nstack=4, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
|
||||
kfac_clip=0.001, save_interval=None, lrschedule='linear'):
|
||||
tf.reset_default_graph()
|
||||
set_global_seeds(seed)
|
||||
@@ -115,7 +176,7 @@ def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval
|
||||
ob_space = env.observation_space
|
||||
ac_space = env.action_space
|
||||
make_model = lambda : Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs, nsteps
|
||||
=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, vf_fisher_coef=
|
||||
=nsteps, nstack=nstack, ent_coef=ent_coef, vf_coef=vf_coef, vf_fisher_coef=
|
||||
vf_fisher_coef, lr=lr, max_grad_norm=max_grad_norm, kfac_clip=kfac_clip,
|
||||
lrschedule=lrschedule)
|
||||
if save_interval and logger.get_dir():
|
||||
@@ -124,7 +185,7 @@ def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval
|
||||
fh.write(cloudpickle.dumps(make_model))
|
||||
model = make_model()
|
||||
|
||||
runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
|
||||
runner = Runner(env, model, nsteps=nsteps, nstack=nstack, gamma=gamma)
|
||||
nbatch = nenvs*nsteps
|
||||
tstart = time.time()
|
||||
coord = tf.train.Coordinator()
|
||||
|
@@ -1,4 +1,4 @@
|
||||
from .running_stat import RunningStat
|
||||
from baselines.acktr.running_stat import RunningStat
|
||||
from collections import deque
|
||||
import numpy as np
|
||||
|
||||
|
@@ -228,7 +228,7 @@ class KfacOptimizer():
|
||||
Ow = bpropFactor.get_shape()[2]
|
||||
if Oh == 1 and Ow == 1 and self._channel_fac:
|
||||
# factorization along the channels
|
||||
# assume independence between input channels and spatial
|
||||
# assume independence bewteen input channels and spatial
|
||||
# 2K-1 x 2K-1 covariance matrix and C x C covariance matrix
|
||||
# factorization along the channels do not
|
||||
# support homogeneous coordinate, assnBias
|
||||
|
@@ -1,55 +1,93 @@
|
||||
import tensorflow as tf
|
||||
import numpy as np
|
||||
|
||||
|
||||
def gmatmul(a, b, transpose_a=False, transpose_b=False, reduce_dim=None):
|
||||
assert reduce_dim is not None
|
||||
if reduce_dim == None:
|
||||
# general batch matmul
|
||||
if len(a.get_shape()) == 3 and len(b.get_shape()) == 3:
|
||||
return tf.batch_matmul(a, b, adj_x=transpose_a, adj_y=transpose_b)
|
||||
elif len(a.get_shape()) == 3 and len(b.get_shape()) == 2:
|
||||
if transpose_b:
|
||||
N = b.get_shape()[0].value
|
||||
else:
|
||||
N = b.get_shape()[1].value
|
||||
B = a.get_shape()[0].value
|
||||
if transpose_a:
|
||||
K = a.get_shape()[1].value
|
||||
a = tf.reshape(tf.transpose(a, [0, 2, 1]), [-1, K])
|
||||
else:
|
||||
K = a.get_shape()[-1].value
|
||||
a = tf.reshape(a, [-1, K])
|
||||
result = tf.matmul(a, b, transpose_b=transpose_b)
|
||||
result = tf.reshape(result, [B, -1, N])
|
||||
return result
|
||||
elif len(a.get_shape()) == 2 and len(b.get_shape()) == 3:
|
||||
if transpose_a:
|
||||
M = a.get_shape()[1].value
|
||||
else:
|
||||
M = a.get_shape()[0].value
|
||||
B = b.get_shape()[0].value
|
||||
if transpose_b:
|
||||
K = b.get_shape()[-1].value
|
||||
b = tf.transpose(tf.reshape(b, [-1, K]), [1, 0])
|
||||
else:
|
||||
K = b.get_shape()[1].value
|
||||
b = tf.transpose(tf.reshape(
|
||||
tf.transpose(b, [0, 2, 1]), [-1, K]), [1, 0])
|
||||
result = tf.matmul(a, b, transpose_a=transpose_a)
|
||||
result = tf.transpose(tf.reshape(result, [M, B, -1]), [1, 0, 2])
|
||||
return result
|
||||
else:
|
||||
return tf.matmul(a, b, transpose_a=transpose_a, transpose_b=transpose_b)
|
||||
else:
|
||||
# weird batch matmul
|
||||
if len(a.get_shape()) == 2 and len(b.get_shape()) > 2:
|
||||
# reshape reduce_dim to the left most dim in b
|
||||
b_shape = b.get_shape()
|
||||
if reduce_dim != 0:
|
||||
b_dims = list(range(len(b_shape)))
|
||||
b_dims.remove(reduce_dim)
|
||||
b_dims.insert(0, reduce_dim)
|
||||
b = tf.transpose(b, b_dims)
|
||||
b_t_shape = b.get_shape()
|
||||
b = tf.reshape(b, [int(b_shape[reduce_dim]), -1])
|
||||
result = tf.matmul(a, b, transpose_a=transpose_a,
|
||||
transpose_b=transpose_b)
|
||||
result = tf.reshape(result, b_t_shape)
|
||||
if reduce_dim != 0:
|
||||
b_dims = list(range(len(b_shape)))
|
||||
b_dims.remove(0)
|
||||
b_dims.insert(reduce_dim, 0)
|
||||
result = tf.transpose(result, b_dims)
|
||||
return result
|
||||
|
||||
# weird batch matmul
|
||||
if len(a.get_shape()) == 2 and len(b.get_shape()) > 2:
|
||||
# reshape reduce_dim to the left most dim in b
|
||||
b_shape = b.get_shape()
|
||||
if reduce_dim != 0:
|
||||
b_dims = list(range(len(b_shape)))
|
||||
b_dims.remove(reduce_dim)
|
||||
b_dims.insert(0, reduce_dim)
|
||||
b = tf.transpose(b, b_dims)
|
||||
b_t_shape = b.get_shape()
|
||||
b = tf.reshape(b, [int(b_shape[reduce_dim]), -1])
|
||||
result = tf.matmul(a, b, transpose_a=transpose_a,
|
||||
transpose_b=transpose_b)
|
||||
result = tf.reshape(result, b_t_shape)
|
||||
if reduce_dim != 0:
|
||||
b_dims = list(range(len(b_shape)))
|
||||
b_dims.remove(0)
|
||||
b_dims.insert(reduce_dim, 0)
|
||||
result = tf.transpose(result, b_dims)
|
||||
return result
|
||||
elif len(a.get_shape()) > 2 and len(b.get_shape()) == 2:
|
||||
# reshape reduce_dim to the right most dim in a
|
||||
a_shape = a.get_shape()
|
||||
outter_dim = len(a_shape) - 1
|
||||
reduce_dim = len(a_shape) - reduce_dim - 1
|
||||
if reduce_dim != outter_dim:
|
||||
a_dims = list(range(len(a_shape)))
|
||||
a_dims.remove(reduce_dim)
|
||||
a_dims.insert(outter_dim, reduce_dim)
|
||||
a = tf.transpose(a, a_dims)
|
||||
a_t_shape = a.get_shape()
|
||||
a = tf.reshape(a, [-1, int(a_shape[reduce_dim])])
|
||||
result = tf.matmul(a, b, transpose_a=transpose_a,
|
||||
transpose_b=transpose_b)
|
||||
result = tf.reshape(result, a_t_shape)
|
||||
if reduce_dim != outter_dim:
|
||||
a_dims = list(range(len(a_shape)))
|
||||
a_dims.remove(outter_dim)
|
||||
a_dims.insert(reduce_dim, outter_dim)
|
||||
result = tf.transpose(result, a_dims)
|
||||
return result
|
||||
|
||||
elif len(a.get_shape()) > 2 and len(b.get_shape()) == 2:
|
||||
# reshape reduce_dim to the right most dim in a
|
||||
a_shape = a.get_shape()
|
||||
outter_dim = len(a_shape) - 1
|
||||
reduce_dim = len(a_shape) - reduce_dim - 1
|
||||
if reduce_dim != outter_dim:
|
||||
a_dims = list(range(len(a_shape)))
|
||||
a_dims.remove(reduce_dim)
|
||||
a_dims.insert(outter_dim, reduce_dim)
|
||||
a = tf.transpose(a, a_dims)
|
||||
a_t_shape = a.get_shape()
|
||||
a = tf.reshape(a, [-1, int(a_shape[reduce_dim])])
|
||||
result = tf.matmul(a, b, transpose_a=transpose_a,
|
||||
transpose_b=transpose_b)
|
||||
result = tf.reshape(result, a_t_shape)
|
||||
if reduce_dim != outter_dim:
|
||||
a_dims = list(range(len(a_shape)))
|
||||
a_dims.remove(outter_dim)
|
||||
a_dims.insert(reduce_dim, outter_dim)
|
||||
result = tf.transpose(result, a_dims)
|
||||
return result
|
||||
elif len(a.get_shape()) == 2 and len(b.get_shape()) == 2:
|
||||
return tf.matmul(a, b, transpose_a=transpose_a, transpose_b=transpose_b)
|
||||
|
||||
elif len(a.get_shape()) == 2 and len(b.get_shape()) == 2:
|
||||
return tf.matmul(a, b, transpose_a=transpose_a, transpose_b=transpose_b)
|
||||
|
||||
assert False, 'something went wrong'
|
||||
assert False, 'something went wrong'
|
||||
|
||||
|
||||
def clipoutNeg(vec, threshold=1e-6):
|
||||
|
@@ -1,8 +1,43 @@
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from baselines.acktr.utils import dense, kl_div
|
||||
from baselines.acktr.utils import conv, fc, dense, conv_to_fc, sample, kl_div
|
||||
import baselines.common.tf_util as U
|
||||
|
||||
class CnnPolicy(object):
|
||||
|
||||
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
|
||||
nbatch = nenv*nsteps
|
||||
nh, nw, nc = ob_space.shape
|
||||
ob_shape = (nbatch, nh, nw, nc*nstack)
|
||||
nact = ac_space.n
|
||||
X = tf.placeholder(tf.uint8, ob_shape) #obs
|
||||
with tf.variable_scope("model", reuse=reuse):
|
||||
h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
|
||||
h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
|
||||
h3 = conv(h2, 'c3', nf=32, rf=3, stride=1, init_scale=np.sqrt(2))
|
||||
h3 = conv_to_fc(h3)
|
||||
h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
|
||||
pi = fc(h4, 'pi', nact, act=lambda x:x)
|
||||
vf = fc(h4, 'v', 1, act=lambda x:x)
|
||||
|
||||
v0 = vf[:, 0]
|
||||
a0 = sample(pi)
|
||||
self.initial_state = [] #not stateful
|
||||
|
||||
def step(ob, *_args, **_kwargs):
|
||||
a, v = sess.run([a0, v0], {X:ob})
|
||||
return a, v, [] #dummy state
|
||||
|
||||
def value(ob, *_args, **_kwargs):
|
||||
return sess.run(v0, {X:ob})
|
||||
|
||||
self.X = X
|
||||
self.pi = pi
|
||||
self.vf = vf
|
||||
self.step = step
|
||||
self.value = value
|
||||
|
||||
|
||||
class GaussianMlpPolicy(object):
|
||||
def __init__(self, ob_dim, ac_dim):
|
||||
# Here we'll construct a bunch of expressions, which will be used in two places:
|
||||
@@ -25,12 +60,12 @@ class GaussianMlpPolicy(object):
|
||||
std_na = tf.tile(std_1a, [tf.shape(mean_na)[0], 1])
|
||||
ac_dist = tf.concat([tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(std_na, [-1, ac_dim])], 1)
|
||||
sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:,ac_dim:])) * ac_dist[:,ac_dim:] + ac_dist[:,:ac_dim] # This is the sampled action we'll perform.
|
||||
logprobsampled_n = - tf.reduce_sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * tf.reduce_sum(tf.square(ac_dist[:,:ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of sampled action
|
||||
logprob_n = - tf.reduce_sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * tf.reduce_sum(tf.square(ac_dist[:,:ac_dim] - oldac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy)
|
||||
kl = tf.reduce_mean(kl_div(oldac_dist, ac_dist, ac_dim))
|
||||
#kl = .5 * tf.reduce_mean(tf.square(logprob_n - oldlogprob_n)) # Approximation of KL divergence between old policy used to generate actions, and new policy used to compute logprob_n
|
||||
surr = - tf.reduce_mean(adv_n * logprob_n) # Loss function that we'll differentiate to get the policy gradient
|
||||
surr_sampled = - tf.reduce_mean(logprob_n) # Sampled loss of the policy
|
||||
logprobsampled_n = - U.sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * U.sum(tf.square(ac_dist[:,:ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of sampled action
|
||||
logprob_n = - U.sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * U.sum(tf.square(ac_dist[:,:ac_dim] - oldac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy)
|
||||
kl = U.mean(kl_div(oldac_dist, ac_dist, ac_dim))
|
||||
#kl = .5 * U.mean(tf.square(logprob_n - oldlogprob_n)) # Approximation of KL divergence between old policy used to generate actions, and new policy used to compute logprob_n
|
||||
surr = - U.mean(adv_n * logprob_n) # Loss function that we'll differentiate to get the policy gradient
|
||||
surr_sampled = - U.mean(logprob_n) # Sampled loss of the policy
|
||||
self._act = U.function([ob_no], [sampled_ac_na, ac_dist, logprobsampled_n]) # Generate a new action and its logprob
|
||||
#self.compute_kl = U.function([ob_no, oldac_na, oldlogprob_n], kl) # Compute (approximate) KL divergence between old policy and new policy
|
||||
self.compute_kl = U.function([ob_no, oldac_dist], kl)
|
||||
|
@@ -1,21 +1,38 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
#!/usr/bin/env python
|
||||
import os, logging, gym
|
||||
from baselines import logger
|
||||
from baselines.common import set_global_seeds
|
||||
from baselines import bench
|
||||
from baselines.acktr.acktr_disc import learn
|
||||
from baselines.common.cmd_util import make_atari_env, atari_arg_parser
|
||||
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
|
||||
from baselines.ppo2.policies import CnnPolicy
|
||||
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
|
||||
from baselines.common.atari_wrappers import make_atari, wrap_deepmind
|
||||
from baselines.acktr.policies import CnnPolicy
|
||||
|
||||
def train(env_id, num_timesteps, seed, num_cpu):
|
||||
env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
|
||||
def make_env(rank):
|
||||
def _thunk():
|
||||
env = make_atari(env_id)
|
||||
env.seed(seed + rank)
|
||||
env = bench.Monitor(env, logger.get_dir() and logger.get_dir())
|
||||
gym.logger.setLevel(logging.WARN)
|
||||
return wrap_deepmind(env)
|
||||
return _thunk
|
||||
set_global_seeds(seed)
|
||||
env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
|
||||
policy_fn = CnnPolicy
|
||||
learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu)
|
||||
env.close()
|
||||
|
||||
def main():
|
||||
args = atari_arg_parser().parse_args()
|
||||
import argparse
|
||||
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
|
||||
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
|
||||
parser.add_argument('--num-timesteps', type=int, default=int(10e6))
|
||||
args = parser.parse_args()
|
||||
logger.configure()
|
||||
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, num_cpu=32)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
@@ -1,14 +1,22 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
#!/usr/bin/env python
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import tensorflow as tf
|
||||
import gym
|
||||
from baselines import logger
|
||||
from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser
|
||||
from baselines.common import set_global_seeds
|
||||
from baselines import bench
|
||||
from baselines.acktr.acktr_cont import learn
|
||||
from baselines.acktr.policies import GaussianMlpPolicy
|
||||
from baselines.acktr.value_functions import NeuralNetValueFunction
|
||||
|
||||
def train(env_id, num_timesteps, seed):
|
||||
env = make_mujoco_env(env_id, seed)
|
||||
env=gym.make(env_id)
|
||||
env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
|
||||
set_global_seeds(seed)
|
||||
env.seed(seed)
|
||||
gym.logger.setLevel(logging.WARN)
|
||||
|
||||
with tf.Session(config=tf.ConfigProto()):
|
||||
ob_dim = env.observation_space.shape[0]
|
||||
@@ -25,10 +33,11 @@ def train(env_id, num_timesteps, seed):
|
||||
|
||||
env.close()
|
||||
|
||||
def main():
|
||||
args = mujoco_arg_parser().parse_args()
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description='Run Mujoco benchmark.')
|
||||
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
|
||||
parser.add_argument('--env', help='environment ID', type=str, default="Reacher-v1")
|
||||
parser.add_argument('--num-timesteps', type=int, default=int(1e6))
|
||||
args = parser.parse_args()
|
||||
logger.configure()
|
||||
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
@@ -1,8 +1,69 @@
|
||||
import os
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
import baselines.common.tf_util as U
|
||||
from collections import deque
|
||||
|
||||
def sample(logits):
    noise = tf.random_uniform(tf.shape(logits))
    return tf.argmax(logits - tf.log(-tf.log(noise)), 1)
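`sample` above draws from a categorical distribution with the Gumbel-max trick: perturbing the logits with `-log(-log(u))` noise and taking the argmax is equivalent to sampling from the softmax. A small NumPy sketch (illustrative only, not part of the diff) showing the equivalence empirically:

```
import numpy as np

rng = np.random.default_rng(0)
logits = np.array([1.0, 0.0, -1.0])
n = 200_000

# Gumbel-max: perturb logits with -log(-log(u)) noise, take the argmax.
u = rng.uniform(size=(n, logits.size))
samples = np.argmax(logits - np.log(-np.log(u)), axis=1)

freqs = np.bincount(samples, minlength=logits.size) / n
softmax = np.exp(logits) / np.exp(logits).sum()
print(np.round(freqs, 3), np.round(softmax, 3))  # the two should roughly agree
```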
def std(x):
|
||||
mean = tf.reduce_mean(x)
|
||||
var = tf.reduce_mean(tf.square(x-mean))
|
||||
return tf.sqrt(var)
|
||||
|
||||
def cat_entropy(logits):
|
||||
a0 = logits - tf.reduce_max(logits, 1, keep_dims=True)
|
||||
ea0 = tf.exp(a0)
|
||||
z0 = tf.reduce_sum(ea0, 1, keep_dims=True)
|
||||
p0 = ea0 / z0
|
||||
return tf.reduce_sum(p0 * (tf.log(z0) - a0), 1)
|
||||
|
||||
def cat_entropy_softmax(p0):
|
||||
return - tf.reduce_sum(p0 * tf.log(p0 + 1e-6), axis = 1)
|
||||
|
||||
def mse(pred, target):
|
||||
return tf.square(pred-target)/2.
|
||||
|
||||
def ortho_init(scale=1.0):
|
||||
def _ortho_init(shape, dtype, partition_info=None):
|
||||
#lasagne ortho init for tf
|
||||
shape = tuple(shape)
|
||||
if len(shape) == 2:
|
||||
flat_shape = shape
|
||||
elif len(shape) == 4: # assumes NHWC
|
||||
flat_shape = (np.prod(shape[:-1]), shape[-1])
|
||||
else:
|
||||
raise NotImplementedError
|
||||
a = np.random.normal(0.0, 1.0, flat_shape)
|
||||
u, _, v = np.linalg.svd(a, full_matrices=False)
|
||||
q = u if u.shape == flat_shape else v # pick the one with the correct shape
|
||||
q = q.reshape(shape)
|
||||
return (scale * q[:shape[0], :shape[1]]).astype(np.float32)
|
||||
return _ortho_init
|
||||
|
||||
def conv(x, scope, nf, rf, stride, pad='VALID', act=tf.nn.relu, init_scale=1.0):
|
||||
with tf.variable_scope(scope):
|
||||
nin = x.get_shape()[3].value
|
||||
w = tf.get_variable("w", [rf, rf, nin, nf], initializer=ortho_init(init_scale))
|
||||
b = tf.get_variable("b", [nf], initializer=tf.constant_initializer(0.0))
|
||||
z = tf.nn.conv2d(x, w, strides=[1, stride, stride, 1], padding=pad)+b
|
||||
h = act(z)
|
||||
return h
|
||||
|
||||
def fc(x, scope, nh, act=tf.nn.relu, init_scale=1.0):
|
||||
with tf.variable_scope(scope):
|
||||
nin = x.get_shape()[1].value
|
||||
w = tf.get_variable("w", [nin, nh], initializer=ortho_init(init_scale))
|
||||
b = tf.get_variable("b", [nh], initializer=tf.constant_initializer(0.0))
|
||||
z = tf.matmul(x, w)+b
|
||||
h = act(z)
|
||||
return h
|
||||
|
||||
def dense(x, size, name, weight_init=None, bias_init=0, weight_loss_dict=None, reuse=None):
|
||||
with tf.variable_scope(name, reuse=reuse):
|
||||
assert (len(tf.get_variable_scope().name.split('/')) == 2)
|
||||
assert (len(U.scope_name().split('/')) == 2)
|
||||
|
||||
w = tf.get_variable("w", [x.get_shape()[1], size], initializer=weight_init)
|
||||
b = tf.get_variable("b", [size], initializer=tf.constant_initializer(bias_init))
|
||||
@@ -14,10 +75,15 @@ def dense(x, size, name, weight_init=None, bias_init=0, weight_loss_dict=None, r
|
||||
weight_loss_dict[w] = weight_decay_fc
|
||||
weight_loss_dict[b] = 0.0
|
||||
|
||||
tf.add_to_collection(tf.get_variable_scope().name.split('/')[0] + '_' + 'losses', weight_decay)
|
||||
tf.add_to_collection(U.scope_name().split('/')[0] + '_' + 'losses', weight_decay)
|
||||
|
||||
return tf.nn.bias_add(tf.matmul(x, w), b)
|
||||
|
||||
def conv_to_fc(x):
|
||||
nh = np.prod([v.value for v in x.get_shape()[1:]])
|
||||
x = tf.reshape(x, [-1, nh])
|
||||
return x
|
||||
|
||||
def kl_div(action_dist1, action_dist2, action_size):
|
||||
mean1, std1 = action_dist1[:, :action_size], action_dist1[:, action_size:]
|
||||
mean2, std2 = action_dist2[:, :action_size], action_dist2[:, action_size:]
|
||||
@@ -26,3 +92,109 @@ def kl_div(action_dist1, action_dist2, action_size):
|
||||
denominator = 2 * tf.square(std2) + 1e-8
|
||||
return tf.reduce_sum(
|
||||
numerator/denominator + tf.log(std2) - tf.log(std1),reduction_indices=-1)
|
||||
|
||||
def discount_with_dones(rewards, dones, gamma):
    discounted = []
    r = 0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        r = reward + gamma*r*(1.-done) # fixed off by one bug
        discounted.append(r)
    return discounted[::-1]
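As a quick worked example: with `rewards = [1, 1, 1]`, `dones = [0, 0, 1]` and `gamma = 0.99`, the backward pass yields returns `[2.9701, 1.99, 1.0]`; the terminal flag on the last step stops the bootstrap from leaking across episode boundaries. A usage check, assuming the function above is in scope:

```
returns = discount_with_dones([1.0, 1.0, 1.0], [0, 0, 1], gamma=0.99)
print(returns)  # [2.9701, 1.99, 1.0]
```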
def find_trainable_variables(key):
|
||||
with tf.variable_scope(key):
|
||||
return tf.trainable_variables()
|
||||
|
||||
def make_path(f):
|
||||
return os.makedirs(f, exist_ok=True)
|
||||
|
||||
def constant(p):
|
||||
return 1
|
||||
|
||||
def linear(p):
|
||||
return 1-p
|
||||
|
||||
|
||||
def middle_drop(p):
|
||||
eps = 0.75
|
||||
if 1-p<eps:
|
||||
return eps*0.1
|
||||
return 1-p
|
||||
|
||||
def double_linear_con(p):
|
||||
p *= 2
|
||||
eps = 0.125
|
||||
if 1-p<eps:
|
||||
return eps
|
||||
return 1-p
|
||||
|
||||
|
||||
def double_middle_drop(p):
|
||||
eps1 = 0.75
|
||||
eps2 = 0.25
|
||||
if 1-p<eps1:
|
||||
if 1-p<eps2:
|
||||
return eps2*0.5
|
||||
return eps1*0.1
|
||||
return 1-p
|
||||
|
||||
|
||||
schedules = {
    'linear':linear,
    'constant':constant,
    'double_linear_con':double_linear_con,
    'middle_drop':middle_drop,
    'double_middle_drop':double_middle_drop
}

class Scheduler(object):

    def __init__(self, v, nvalues, schedule):
        self.n = 0.
        self.v = v
        self.nvalues = nvalues
        self.schedule = schedules[schedule]

    def value(self):
        current_value = self.v*self.schedule(self.n/self.nvalues)
        self.n += 1.
        return current_value

    def value_steps(self, steps):
        return self.v*self.schedule(steps/self.nvalues)
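A short usage sketch of the scheduler (hypothetical values, assuming the class above is in scope): with the `'linear'` schedule the returned value decays from `v` toward zero over `nvalues` calls, which is how the learning rate is annealed over the training run.

```
lr_schedule = Scheduler(v=7e-4, nvalues=1000, schedule='linear')
lrs = [lr_schedule.value() for _ in range(1000)]
print(lrs[0], lrs[500], lrs[-1])  # ~7e-4, ~3.5e-4, ~7e-7
```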
class EpisodeStats:
|
||||
def __init__(self, nsteps, nenvs):
|
||||
self.episode_rewards = []
|
||||
for i in range(nenvs):
|
||||
self.episode_rewards.append([])
|
||||
self.lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths
|
||||
self.rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards
|
||||
self.nsteps = nsteps
|
||||
self.nenvs = nenvs
|
||||
|
||||
def feed(self, rewards, masks):
|
||||
rewards = np.reshape(rewards, [self.nenvs, self.nsteps])
|
||||
masks = np.reshape(masks, [self.nenvs, self.nsteps])
|
||||
for i in range(0, self.nenvs):
|
||||
for j in range(0, self.nsteps):
|
||||
self.episode_rewards[i].append(rewards[i][j])
|
||||
if masks[i][j]:
|
||||
l = len(self.episode_rewards[i])
|
||||
s = sum(self.episode_rewards[i])
|
||||
self.lenbuffer.append(l)
|
||||
self.rewbuffer.append(s)
|
||||
self.episode_rewards[i] = []
|
||||
|
||||
def mean_length(self):
|
||||
if self.lenbuffer:
|
||||
return np.mean(self.lenbuffer)
|
||||
else:
|
||||
return 0 # on the first params dump, no episodes are finished
|
||||
|
||||
def mean_reward(self):
|
||||
if self.rewbuffer:
|
||||
return np.mean(self.rewbuffer)
|
||||
else:
|
||||
return 0
|
||||
|
@@ -1,6 +1,6 @@
from baselines import logger
import numpy as np
import baselines.common as common
from baselines import common
from baselines.common import tf_util as U
import tensorflow as tf
from baselines.acktr import kfac

@@ -16,8 +16,8 @@ class NeuralNetValueFunction(object):
vpred_n = dense(h2, 1, "hfinal", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)[:,0]
sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
wd_loss = tf.get_collection("vf_losses", None)
loss = tf.reduce_mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
loss_sampled = tf.reduce_mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))
loss = U.mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
loss_sampled = U.mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))
self._predict = U.function([X], vpred_n)
optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9, \
clip_kl=0.3, epsilon=0.1, stats_decay=0.95, \

@@ -1,2 +1,2 @@
from baselines.bench.benchmarks import *
from baselines.bench.monitor import *
from baselines.bench.monitor import *
@@ -1,24 +1,15 @@
|
||||
import re
|
||||
import os.path as osp
|
||||
import os
|
||||
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
_atari7 = ['BeamRider', 'Breakout', 'Enduro', 'Pong', 'Qbert', 'Seaquest', 'SpaceInvaders']
|
||||
_atariexpl7 = ['Freeway', 'Gravitar', 'MontezumaRevenge', 'Pitfall', 'PrivateEye', 'Solaris', 'Venture']
|
||||
|
||||
_BENCHMARKS = []
|
||||
|
||||
remove_version_re = re.compile(r'-v\d+$')
|
||||
|
||||
def register_benchmark(benchmark):
|
||||
for b in _BENCHMARKS:
|
||||
if b['name'] == benchmark['name']:
|
||||
raise ValueError('Benchmark with name %s already registered!' % b['name'])
|
||||
|
||||
# automatically add a description if it is not present
|
||||
if 'tasks' in benchmark:
|
||||
for t in benchmark['tasks']:
|
||||
if 'desc' not in t:
|
||||
t['desc'] = remove_version_re.sub('', t['env_id'])
|
||||
_BENCHMARKS.append(benchmark)
|
||||
|
||||
|
||||
@@ -51,34 +42,36 @@ _ATARI_SUFFIX = 'NoFrameskip-v4'
|
||||
register_benchmark({
|
||||
'name': 'Atari50M',
|
||||
'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 50M timesteps',
|
||||
'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(50e6)} for _game in _atari7]
|
||||
'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(50e6)} for _game in _atari7]
|
||||
})
|
||||
|
||||
register_benchmark({
|
||||
'name': 'Atari10M',
|
||||
'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 10M timesteps',
|
||||
'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atari7]
|
||||
'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atari7]
|
||||
})
|
||||
|
||||
register_benchmark({
|
||||
'name': 'Atari1Hr',
|
||||
'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 1 hour of walltime',
|
||||
'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_seconds': 60 * 60} for _game in _atari7]
|
||||
'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_seconds': 60 * 60} for _game in _atari7]
|
||||
})
|
||||
|
||||
register_benchmark({
|
||||
'name': 'AtariExploration10M',
|
||||
'description': '7 Atari games emphasizing exploration, with pixel observations, 10M timesteps',
|
||||
'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atariexpl7]
|
||||
'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atariexpl7]
|
||||
})
|
||||
|
||||
|
||||
|
||||
|
||||
# MuJoCo
|
||||
|
||||
_mujocosmall = [
|
||||
'InvertedDoublePendulum-v2', 'InvertedPendulum-v2',
|
||||
'HalfCheetah-v2', 'Hopper-v2', 'Walker2d-v2',
|
||||
'Reacher-v2', 'Swimmer-v2']
|
||||
'InvertedDoublePendulum-v1', 'InvertedPendulum-v1',
|
||||
'HalfCheetah-v1', 'Hopper-v1', 'Walker2d-v1',
|
||||
'Reacher-v1', 'Swimmer-v1']
|
||||
register_benchmark({
|
||||
'name': 'Mujoco1M',
|
||||
'description': 'Some small 2D MuJoCo tasks, run for 1M timesteps',
|
||||
@@ -135,6 +128,5 @@ _atari50 = [ # actually 47
|
||||
register_benchmark({
|
||||
'name': 'Atari50_10M',
|
||||
'description': '47 Atari games from Mnih et al. (2013), with pixel observations, 10M timesteps',
|
||||
'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atari50]
|
||||
'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 3, 'num_timesteps': int(10e6)} for _game in _atari50]
|
||||
})
|
||||
|
||||
|
@@ -7,13 +7,12 @@ from glob import glob
|
||||
import csv
|
||||
import os.path as osp
|
||||
import json
|
||||
import numpy as np
|
||||
|
||||
class Monitor(Wrapper):
|
||||
EXT = "monitor.csv"
|
||||
f = None
|
||||
|
||||
def __init__(self, env, filename, allow_early_resets=False, reset_keywords=(), info_keywords=()):
|
||||
def __init__(self, env, filename, allow_early_resets=False, reset_keywords=()):
|
||||
Wrapper.__init__(self, env=env)
|
||||
self.tstart = time.time()
|
||||
if filename is None:
|
||||
@@ -26,23 +25,21 @@ class Monitor(Wrapper):
|
||||
else:
|
||||
filename = filename + "." + Monitor.EXT
|
||||
self.f = open(filename, "wt")
|
||||
self.f.write('#%s\n'%json.dumps({"t_start": self.tstart, 'env_id' : env.spec and env.spec.id}))
|
||||
self.logger = csv.DictWriter(self.f, fieldnames=('r', 'l', 't')+reset_keywords+info_keywords)
|
||||
self.f.write('#%s\n'%json.dumps({"t_start": self.tstart, "gym_version": gym.__version__,
|
||||
"env_id": env.spec.id if env.spec else 'Unknown'}))
|
||||
self.logger = csv.DictWriter(self.f, fieldnames=('r', 'l', 't')+reset_keywords)
|
||||
self.logger.writeheader()
|
||||
self.f.flush()
|
||||
|
||||
self.reset_keywords = reset_keywords
|
||||
self.info_keywords = info_keywords
|
||||
self.allow_early_resets = allow_early_resets
|
||||
self.rewards = None
|
||||
self.needs_reset = True
|
||||
self.episode_rewards = []
|
||||
self.episode_lengths = []
|
||||
self.episode_times = []
|
||||
self.total_steps = 0
|
||||
self.current_reset_info = {} # extra info about the current episode, that was passed in during reset()
|
||||
|
||||
def reset(self, **kwargs):
|
||||
def _reset(self, **kwargs):
|
||||
if not self.allow_early_resets and not self.needs_reset:
|
||||
raise RuntimeError("Tried to reset an environment before done. If you want to allow early resets, wrap your env with Monitor(env, path, allow_early_resets=True)")
|
||||
self.rewards = []
|
||||
@@ -54,7 +51,7 @@ class Monitor(Wrapper):
|
||||
self.current_reset_info[k] = v
|
||||
return self.env.reset(**kwargs)
|
||||
|
||||
def step(self, action):
|
||||
def _step(self, action):
|
||||
if self.needs_reset:
|
||||
raise RuntimeError("Tried to step environment that needs reset")
|
||||
ob, rew, done, info = self.env.step(action)
|
||||
@@ -64,15 +61,12 @@ class Monitor(Wrapper):
|
||||
eprew = sum(self.rewards)
|
||||
eplen = len(self.rewards)
|
||||
epinfo = {"r": round(eprew, 6), "l": eplen, "t": round(time.time() - self.tstart, 6)}
|
||||
for k in self.info_keywords:
|
||||
epinfo[k] = info[k]
|
||||
self.episode_rewards.append(eprew)
|
||||
self.episode_lengths.append(eplen)
|
||||
self.episode_times.append(time.time() - self.tstart)
|
||||
epinfo.update(self.current_reset_info)
|
||||
if self.logger:
|
||||
self.logger.writerow(epinfo)
|
||||
self.f.flush()
|
||||
self.episode_rewards.append(eprew)
|
||||
self.episode_lengths.append(eplen)
|
||||
info['episode'] = epinfo
|
||||
self.total_steps += 1
|
||||
return (ob, rew, done, info)
|
||||
@@ -90,9 +84,6 @@ class Monitor(Wrapper):
|
||||
def get_episode_lengths(self):
|
||||
return self.episode_lengths
|
||||
|
||||
def get_episode_times(self):
|
||||
return self.episode_times
|
||||
|
||||
class LoadMonitorResultsError(Exception):
|
||||
pass
|
||||
|
||||
@@ -101,9 +92,7 @@ def get_monitor_files(dir):
|
||||
|
||||
def load_results(dir):
|
||||
import pandas
|
||||
monitor_files = (
|
||||
glob(osp.join(dir, "*monitor.json")) +
|
||||
glob(osp.join(dir, "*monitor.csv"))) # get both csv and (old) json files
|
||||
monitor_files = glob(osp.join(dir, "*monitor.*")) # get both csv and (old) json files
|
||||
if not monitor_files:
|
||||
raise LoadMonitorResultsError("no monitor files of the form *%s found in %s" % (Monitor.EXT, dir))
|
||||
dfs = []
|
||||
@@ -125,37 +114,10 @@ def load_results(dir):
|
||||
episode = json.loads(line)
|
||||
episodes.append(episode)
|
||||
df = pandas.DataFrame(episodes)
|
||||
else:
|
||||
assert 0, 'unreachable'
|
||||
df['t'] += header['t_start']
|
||||
df['t'] += header['t_start']
|
||||
dfs.append(df)
|
||||
df = pandas.concat(dfs)
|
||||
df.sort_values('t', inplace=True)
|
||||
df.reset_index(inplace=True)
|
||||
df['t'] -= min(header['t_start'] for header in headers)
|
||||
df.headers = headers # HACK to preserve backwards compatibility
|
||||
return df
|
||||
|
||||
def test_monitor():
|
||||
env = gym.make("CartPole-v1")
|
||||
env.seed(0)
|
||||
mon_file = "/tmp/baselines-test-%s.monitor.csv" % uuid.uuid4()
|
||||
menv = Monitor(env, mon_file)
|
||||
menv.reset()
|
||||
for _ in range(1000):
|
||||
_, _, done, _ = menv.step(0)
|
||||
if done:
|
||||
menv.reset()
|
||||
|
||||
f = open(mon_file, 'rt')
|
||||
|
||||
firstline = f.readline()
|
||||
assert firstline.startswith('#')
|
||||
metadata = json.loads(firstline[1:])
|
||||
assert metadata['env_id'] == "CartPole-v1"
|
||||
assert set(metadata.keys()) == {'env_id', 'gym_version', 't_start'}, "Incorrect keys in monitor metadata"
|
||||
|
||||
last_logline = pandas.read_csv(f, index_col=None)
|
||||
assert set(last_logline.keys()) == {'l', 't', 'r'}, "Incorrect keys in monitor logline"
|
||||
f.close()
|
||||
os.remove(mon_file)
|
||||
return df
|
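For reference, a minimal usage sketch of the `Monitor` wrapper and `load_results` as they appear on this branch (paths and environment are illustrative, and the older gym API used here is assumed):

```
import os
import gym
from baselines.bench import Monitor
from baselines.bench.monitor import load_results

os.makedirs("/tmp/example-run", exist_ok=True)
env = Monitor(gym.make("CartPole-v1"), "/tmp/example-run/0", allow_early_resets=True)
ob = env.reset()
for _ in range(100):
    ob, rew, done, info = env.step(env.action_space.sample())
    if done:
        ob = env.reset()
env.close()

# load_results scans the directory for *monitor.* files and returns a pandas
# DataFrame with one row per finished episode (columns r, l, t).
df = load_results("/tmp/example-run")
print(df.head())
```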
@@ -3,7 +3,6 @@ from collections import deque
|
||||
import gym
|
||||
from gym import spaces
|
||||
import cv2
|
||||
cv2.ocl.setUseOpenCL(False)
|
||||
|
||||
class NoopResetEnv(gym.Wrapper):
|
||||
def __init__(self, env, noop_max=30):
|
||||
@@ -13,10 +12,14 @@ class NoopResetEnv(gym.Wrapper):
|
||||
gym.Wrapper.__init__(self, env)
|
||||
self.noop_max = noop_max
|
||||
self.override_num_noops = None
|
||||
self.noop_action = 0
|
||||
assert env.unwrapped.get_action_meanings()[0] == 'NOOP'
|
||||
if isinstance(env.action_space, gym.spaces.MultiBinary):
|
||||
self.noop_action = np.zeros(self.env.action_space.n, dtype=np.int64)
|
||||
else:
|
||||
# used for atari environments
|
||||
self.noop_action = 0
|
||||
assert env.unwrapped.get_action_meanings()[0] == 'NOOP'
|
||||
|
||||
def reset(self, **kwargs):
|
||||
def _reset(self, **kwargs):
|
||||
""" Do no-op action for a number of steps in [1, noop_max]."""
|
||||
self.env.reset(**kwargs)
|
||||
if self.override_num_noops is not None:
|
||||
@@ -31,9 +34,6 @@ class NoopResetEnv(gym.Wrapper):
|
||||
obs = self.env.reset(**kwargs)
|
||||
return obs
|
||||
|
||||
def step(self, ac):
|
||||
return self.env.step(ac)
|
||||
|
||||
class FireResetEnv(gym.Wrapper):
|
||||
def __init__(self, env):
|
||||
"""Take action on reset for environments that are fixed until firing."""
|
||||
@@ -41,7 +41,7 @@ class FireResetEnv(gym.Wrapper):
|
||||
assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
|
||||
assert len(env.unwrapped.get_action_meanings()) >= 3
|
||||
|
||||
def reset(self, **kwargs):
|
||||
def _reset(self, **kwargs):
|
||||
self.env.reset(**kwargs)
|
||||
obs, _, done, _ = self.env.step(1)
|
||||
if done:
|
||||
@@ -51,9 +51,6 @@ class FireResetEnv(gym.Wrapper):
|
||||
self.env.reset(**kwargs)
|
||||
return obs
|
||||
|
||||
def step(self, ac):
|
||||
return self.env.step(ac)
|
||||
|
||||
class EpisodicLifeEnv(gym.Wrapper):
|
||||
def __init__(self, env):
|
||||
"""Make end-of-life == end-of-episode, but only reset on true game over.
|
||||
@@ -63,21 +60,21 @@ class EpisodicLifeEnv(gym.Wrapper):
|
||||
self.lives = 0
|
||||
self.was_real_done = True
|
||||
|
||||
def step(self, action):
|
||||
def _step(self, action):
|
||||
obs, reward, done, info = self.env.step(action)
|
||||
self.was_real_done = done
|
||||
# check current lives, make loss of life terminal,
|
||||
# then update lives to handle bonus lives
|
||||
lives = self.env.unwrapped.ale.lives()
|
||||
if lives < self.lives and lives > 0:
|
||||
# for Qbert sometimes we stay in lives == 0 condtion for a few frames
|
||||
# for Qbert somtimes we stay in lives == 0 condtion for a few frames
|
||||
# so its important to keep lives > 0, so that we only reset once
|
||||
# the environment advertises done.
|
||||
done = True
|
||||
self.lives = lives
|
||||
return obs, reward, done, info
|
||||
|
||||
def reset(self, **kwargs):
|
||||
def _reset(self, **kwargs):
|
||||
"""Reset only when lives are exhausted.
|
||||
This way all states are still reachable even though lives are episodic,
|
||||
and the learner need not know about any of this behind-the-scenes.
|
||||
@@ -95,13 +92,10 @@ class MaxAndSkipEnv(gym.Wrapper):
|
||||
"""Return only every `skip`-th frame"""
|
||||
gym.Wrapper.__init__(self, env)
|
||||
# most recent raw observations (for max pooling across time steps)
|
||||
self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8)
|
||||
self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype='uint8')
|
||||
self._skip = skip
|
||||
|
||||
def reset(self):
|
||||
return self.env.reset()
|
||||
|
||||
def step(self, action):
|
||||
def _step(self, action):
|
||||
"""Repeat action, sum reward, and max over last observations."""
|
||||
total_reward = 0.0
|
||||
done = None
|
||||
@@ -118,14 +112,8 @@ class MaxAndSkipEnv(gym.Wrapper):
|
||||
|
||||
return max_frame, total_reward, done, info
|
||||
|
||||
def reset(self, **kwargs):
|
||||
return self.env.reset(**kwargs)
|
||||
|
||||
class ClipRewardEnv(gym.RewardWrapper):
|
||||
def __init__(self, env):
|
||||
gym.RewardWrapper.__init__(self, env)
|
||||
|
||||
def reward(self, reward):
|
||||
def _reward(self, reward):
|
||||
"""Bin reward to {+1, 0, -1} by its sign."""
|
||||
return np.sign(reward)
|
||||
|
||||
@@ -135,10 +123,9 @@ class WarpFrame(gym.ObservationWrapper):
|
||||
gym.ObservationWrapper.__init__(self, env)
|
||||
self.width = 84
|
||||
self.height = 84
|
||||
self.observation_space = spaces.Box(low=0, high=255,
|
||||
shape=(self.height, self.width, 1), dtype=np.uint8)
|
||||
self.observation_space = spaces.Box(low=0, high=255, shape=(self.height, self.width, 1))
|
||||
|
||||
def observation(self, frame):
|
||||
def _observation(self, frame):
|
||||
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
|
||||
frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA)
|
||||
return frame[:, :, None]
|
||||
@@ -157,15 +144,15 @@ class FrameStack(gym.Wrapper):
|
||||
self.k = k
|
||||
self.frames = deque([], maxlen=k)
|
||||
shp = env.observation_space.shape
|
||||
self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=np.uint8)
|
||||
self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k))
|
||||
|
||||
def reset(self):
|
||||
def _reset(self):
|
||||
ob = self.env.reset()
|
||||
for _ in range(self.k):
|
||||
self.frames.append(ob)
|
||||
return self._get_ob()
|
||||
|
||||
def step(self, action):
|
||||
def _step(self, action):
|
||||
ob, reward, done, info = self.env.step(action)
|
||||
self.frames.append(ob)
|
||||
return self._get_ob(), reward, done, info
|
||||
@@ -175,10 +162,7 @@ class FrameStack(gym.Wrapper):
|
||||
return LazyFrames(list(self.frames))
|
||||
|
||||
class ScaledFloatFrame(gym.ObservationWrapper):
|
||||
def __init__(self, env):
|
||||
gym.ObservationWrapper.__init__(self, env)
|
||||
|
||||
def observation(self, observation):
|
||||
def _observation(self, observation):
|
||||
# careful! This undoes the memory optimization, use
|
||||
# with smaller replay buffers only.
|
||||
return np.array(observation).astype(np.float32) / 255.0
|
||||
@@ -191,28 +175,15 @@ class LazyFrames(object):
|
||||
|
||||
This object should only be converted to numpy array before being passed to the model.
|
||||
|
||||
You'd not believe how complex the previous solution was."""
|
||||
You'd not belive how complex the previous solution was."""
|
||||
self._frames = frames
|
||||
self._out = None
|
||||
|
||||
def _force(self):
|
||||
if self._out is None:
|
||||
self._out = np.concatenate(self._frames, axis=2)
|
||||
self._frames = None
|
||||
return self._out
|
||||
|
||||
def __array__(self, dtype=None):
|
||||
out = self._force()
|
||||
out = np.concatenate(self._frames, axis=2)
|
||||
if dtype is not None:
|
||||
out = out.astype(dtype)
|
||||
return out
|
||||
|
||||
def __len__(self):
|
||||
return len(self._force())
|
||||
|
||||
def __getitem__(self, i):
|
||||
return self._force()[i]
|
||||
|
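`LazyFrames` defers the concatenation so stacked observations kept in a replay buffer share the underlying frame arrays; converting with `np.array` (or indexing) materializes them. A tiny sketch of the intended usage, assuming the class above is in scope:

```
import numpy as np

frames = [np.zeros((84, 84, 1), dtype=np.uint8) for _ in range(4)]
stacked = LazyFrames(frames)   # cheap: keeps references only
obs = np.array(stacked)        # materializes an (84, 84, 4) array when the model needs it
print(obs.shape)
```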
||||
def make_atari(env_id):
|
||||
env = gym.make(env_id)
|
||||
assert 'NoFrameskip' in env.spec.id
|
||||
|
baselines/common/azure_utils.py (new file, 154 lines)
@@ -0,0 +1,154 @@
|
||||
import os
|
||||
import tempfile
|
||||
import zipfile
|
||||
|
||||
from azure.common import AzureMissingResourceHttpError
|
||||
try:
|
||||
from azure.storage.blob import BlobService
|
||||
except ImportError:
|
||||
from azure.storage.blob import BlockBlobService as BlobService
|
||||
from shutil import unpack_archive
|
||||
from threading import Event
|
||||
|
||||
# TODOS: use Azure snapshots instead of hacky backups
|
||||
|
||||
def fixed_list_blobs(service, *args, **kwargs):
|
||||
"""By defualt list_containers only returns a subset of results.
|
||||
|
||||
This function attempts to fix this.
|
||||
"""
|
||||
res = []
|
||||
next_marker = None
|
||||
while next_marker is None or len(next_marker) > 0:
|
||||
kwargs['marker'] = next_marker
|
||||
gen = service.list_blobs(*args, **kwargs)
|
||||
for b in gen:
|
||||
res.append(b.name)
|
||||
next_marker = gen.next_marker
|
||||
return res
|
||||
|
||||
|
||||
def make_archive(source_path, dest_path):
|
||||
if source_path.endswith(os.path.sep):
|
||||
source_path = source_path.rstrip(os.path.sep)
|
||||
prefix_path = os.path.dirname(source_path)
|
||||
with zipfile.ZipFile(dest_path, "w", compression=zipfile.ZIP_STORED) as zf:
|
||||
if os.path.isdir(source_path):
|
||||
for dirname, _subdirs, files in os.walk(source_path):
|
||||
zf.write(dirname, os.path.relpath(dirname, prefix_path))
|
||||
for filename in files:
|
||||
filepath = os.path.join(dirname, filename)
|
||||
zf.write(filepath, os.path.relpath(filepath, prefix_path))
|
||||
else:
|
||||
zf.write(source_path, os.path.relpath(source_path, prefix_path))
|
||||
|
||||
|
||||
class Container(object):
|
||||
services = {}
|
||||
|
||||
def __init__(self, account_name, account_key, container_name, maybe_create=False):
|
||||
self._account_name = account_name
|
||||
self._container_name = container_name
|
||||
if account_name not in Container.services:
|
||||
Container.services[account_name] = BlobService(account_name, account_key)
|
||||
self._service = Container.services[account_name]
|
||||
if maybe_create:
|
||||
self._service.create_container(self._container_name, fail_on_exist=False)
|
||||
|
||||
def put(self, source_path, blob_name, callback=None):
|
||||
"""Upload a file or directory from `source_path` to azure blob `blob_name`.
|
||||
|
||||
Upload progress can be traced by an optional callback.
|
||||
"""
|
||||
upload_done = Event()
|
||||
|
||||
def progress_callback(current, total):
|
||||
if callback:
|
||||
callback(current, total)
|
||||
if current >= total:
|
||||
upload_done.set()
|
||||
|
||||
# Attempt to make backup if an existing version is already available
|
||||
try:
|
||||
x_ms_copy_source = "https://{}.blob.core.windows.net/{}/{}".format(
|
||||
self._account_name,
|
||||
self._container_name,
|
||||
blob_name
|
||||
)
|
||||
self._service.copy_blob(
|
||||
container_name=self._container_name,
|
||||
blob_name=blob_name + ".backup",
|
||||
x_ms_copy_source=x_ms_copy_source
|
||||
)
|
||||
except AzureMissingResourceHttpError:
|
||||
pass
|
||||
|
||||
with tempfile.TemporaryDirectory() as td:
|
||||
arcpath = os.path.join(td, "archive.zip")
|
||||
make_archive(source_path, arcpath)
|
||||
self._service.put_block_blob_from_path(
|
||||
container_name=self._container_name,
|
||||
blob_name=blob_name,
|
||||
file_path=arcpath,
|
||||
max_connections=4,
|
||||
progress_callback=progress_callback,
|
||||
max_retries=10)
|
||||
upload_done.wait()
|
||||
|
||||
def get(self, dest_path, blob_name, callback=None):
|
||||
"""Download a file or directory to `dest_path` to azure blob `blob_name`.
|
||||
|
||||
Warning! If directory is downloaded the `dest_path` is the parent directory.
|
||||
|
||||
Upload progress can be traced by an optional callback.
|
||||
"""
|
||||
download_done = Event()
|
||||
|
||||
def progress_callback(current, total):
|
||||
if callback:
|
||||
callback(current, total)
|
||||
if current >= total:
|
||||
download_done.set()
|
||||
|
||||
with tempfile.TemporaryDirectory() as td:
|
||||
arcpath = os.path.join(td, "archive.zip")
|
||||
for backup_blob_name in [blob_name, blob_name + '.backup']:
|
||||
try:
|
||||
properties = self._service.get_blob_properties(
|
||||
blob_name=backup_blob_name,
|
||||
container_name=self._container_name
|
||||
)
|
||||
if hasattr(properties, 'properties'):
|
||||
# Annoyingly, Azure has changed the API and this now returns a blob
|
||||
# instead of its properties with up-to-date azure package.
|
||||
blob_size = properties.properties.content_length
|
||||
else:
|
||||
blob_size = properties['content-length']
|
||||
if int(blob_size) > 0:
|
||||
self._service.get_blob_to_path(
|
||||
container_name=self._container_name,
|
||||
blob_name=backup_blob_name,
|
||||
file_path=arcpath,
|
||||
max_connections=4,
|
||||
progress_callback=progress_callback)
|
||||
unpack_archive(arcpath, dest_path)
|
||||
download_done.wait()
|
||||
return True
|
||||
except AzureMissingResourceHttpError:
|
||||
pass
|
||||
return False
|
||||
|
||||
def list(self, prefix=None):
|
||||
"""List all blobs in the container."""
|
||||
return fixed_list_blobs(self._service, self._container_name, prefix=prefix)
|
||||
|
||||
def exists(self, blob_name):
|
||||
"""Returns true if `blob_name` exists in container."""
|
||||
try:
|
||||
self._service.get_blob_properties(
|
||||
blob_name=blob_name,
|
||||
container_name=self._container_name
|
||||
)
|
||||
return True
|
||||
except AzureMissingResourceHttpError:
|
||||
return False
|
@@ -1,88 +0,0 @@
|
||||
"""
|
||||
Helpers for scripts like run_atari.py.
|
||||
"""
|
||||
|
||||
import os
|
||||
import gym
|
||||
from gym.wrappers import FlattenDictWrapper
|
||||
from baselines import logger
|
||||
from baselines.bench import Monitor
|
||||
from baselines.common import set_global_seeds
|
||||
from baselines.common.atari_wrappers import make_atari, wrap_deepmind
|
||||
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
|
||||
from mpi4py import MPI
|
||||
|
||||
def make_atari_env(env_id, num_env, seed, wrapper_kwargs=None, start_index=0):
|
||||
"""
|
||||
Create a wrapped, monitored SubprocVecEnv for Atari.
|
||||
"""
|
||||
if wrapper_kwargs is None: wrapper_kwargs = {}
|
||||
def make_env(rank): # pylint: disable=C0111
|
||||
def _thunk():
|
||||
env = make_atari(env_id)
|
||||
env.seed(seed + rank)
|
||||
env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
|
||||
return wrap_deepmind(env, **wrapper_kwargs)
|
||||
return _thunk
|
||||
set_global_seeds(seed)
|
||||
return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)])
|
||||
|
||||
def make_mujoco_env(env_id, seed):
|
||||
"""
|
||||
Create a wrapped, monitored gym.Env for MuJoCo.
|
||||
"""
|
||||
set_global_seeds(seed)
|
||||
env = gym.make(env_id)
|
||||
env = Monitor(env, logger.get_dir())
|
||||
env.seed(seed)
|
||||
return env
|
||||
|
||||
def make_robotics_env(env_id, seed, rank=0):
|
||||
"""
|
||||
Create a wrapped, monitored gym.Env for MuJoCo.
|
||||
"""
|
||||
set_global_seeds(seed)
|
||||
env = gym.make(env_id)
|
||||
env = FlattenDictWrapper(env, ['observation', 'desired_goal'])
|
||||
env = Monitor(
|
||||
env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
|
||||
info_keywords=('is_success',))
|
||||
env.seed(seed)
|
||||
return env
|
||||
|
||||
def arg_parser():
|
||||
"""
|
||||
Create an empty argparse.ArgumentParser.
|
||||
"""
|
||||
import argparse
|
||||
return argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
|
||||
def atari_arg_parser():
|
||||
"""
|
||||
Create an argparse.ArgumentParser for run_atari.py.
|
||||
"""
|
||||
parser = arg_parser()
|
||||
parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
|
||||
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
|
||||
parser.add_argument('--num-timesteps', type=int, default=int(10e6))
|
||||
return parser
|
||||
|
||||
def mujoco_arg_parser():
|
||||
"""
|
||||
Create an argparse.ArgumentParser for run_mujoco.py.
|
||||
"""
|
||||
parser = arg_parser()
|
||||
parser.add_argument('--env', help='environment ID', type=str, default='Reacher-v2')
|
||||
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
|
||||
parser.add_argument('--num-timesteps', type=int, default=int(1e6))
|
||||
return parser
|
||||
|
||||
def robotics_arg_parser():
|
||||
"""
|
||||
Create an argparse.ArgumentParser for run_mujoco.py.
|
||||
"""
|
||||
parser = arg_parser()
|
||||
parser.add_argument('--env', help='environment ID', type=str, default='FetchReach-v0')
|
||||
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
|
||||
parser.add_argument('--num-timesteps', type=int, default=int(1e6))
|
||||
return parser
|
@@ -16,12 +16,7 @@ def fmt_item(x, l):
if isinstance(x, np.ndarray):
assert x.ndim==0
x = x.item()
if isinstance(x, (float, np.float32, np.float64)):
v = abs(x)
if (v < 1e-4 or v > 1e+4) and v > 0:
rep = "%7.2e" % x
else:
rep = "%7.5f" % x
if isinstance(x, float): rep = "%g"%x
else: rep = str(x)
return " "*(l - len(rep)) + rep
@@ -57,12 +57,14 @@ class CategoricalPdType(PdType):
|
||||
|
||||
|
||||
class MultiCategoricalPdType(PdType):
|
||||
def __init__(self, nvec):
|
||||
self.ncats = nvec
|
||||
def __init__(self, low, high):
|
||||
self.low = low
|
||||
self.high = high
|
||||
self.ncats = high - low + 1
|
||||
def pdclass(self):
|
||||
return MultiCategoricalPd
|
||||
def pdfromflat(self, flat):
|
||||
return MultiCategoricalPd(self.ncats, flat)
|
||||
return MultiCategoricalPd(self.low, self.high, flat)
|
||||
def param_shape(self):
|
||||
return [sum(self.ncats)]
|
||||
def sample_shape(self):
|
||||
@@ -123,7 +125,7 @@ class CategoricalPd(Pd):
|
||||
def flatparam(self):
|
||||
return self.logits
|
||||
def mode(self):
|
||||
return tf.argmax(self.logits, axis=-1)
|
||||
return U.argmax(self.logits, axis=-1)
|
||||
def neglogp(self, x):
|
||||
# return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x)
|
||||
# Note: we can't use sparse_softmax_cross_entropy_with_logits because
|
||||
@@ -133,20 +135,20 @@ class CategoricalPd(Pd):
|
||||
logits=self.logits,
|
||||
labels=one_hot_actions)
|
||||
def kl(self, other):
|
||||
a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keep_dims=True)
|
||||
a1 = other.logits - tf.reduce_max(other.logits, axis=-1, keep_dims=True)
|
||||
a0 = self.logits - U.max(self.logits, axis=-1, keepdims=True)
|
||||
a1 = other.logits - U.max(other.logits, axis=-1, keepdims=True)
|
||||
ea0 = tf.exp(a0)
|
||||
ea1 = tf.exp(a1)
|
||||
z0 = tf.reduce_sum(ea0, axis=-1, keep_dims=True)
|
||||
z1 = tf.reduce_sum(ea1, axis=-1, keep_dims=True)
|
||||
z0 = U.sum(ea0, axis=-1, keepdims=True)
|
||||
z1 = U.sum(ea1, axis=-1, keepdims=True)
|
||||
p0 = ea0 / z0
|
||||
return tf.reduce_sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=-1)
|
||||
return U.sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=-1)
|
||||
def entropy(self):
|
||||
a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keep_dims=True)
|
||||
a0 = self.logits - U.max(self.logits, axis=-1, keepdims=True)
|
||||
ea0 = tf.exp(a0)
|
||||
z0 = tf.reduce_sum(ea0, axis=-1, keep_dims=True)
|
||||
z0 = U.sum(ea0, axis=-1, keepdims=True)
|
||||
p0 = ea0 / z0
|
||||
return tf.reduce_sum(p0 * (tf.log(z0) - a0), axis=-1)
|
||||
return U.sum(p0 * (tf.log(z0) - a0), axis=-1)
|
||||
def sample(self):
|
||||
u = tf.random_uniform(tf.shape(self.logits))
|
||||
return tf.argmax(self.logits - tf.log(-tf.log(u)), axis=-1)
|
||||
@@ -155,21 +157,24 @@ class CategoricalPd(Pd):
|
||||
return cls(flat)
|
||||
|
||||
class MultiCategoricalPd(Pd):
|
||||
def __init__(self, nvec, flat):
|
||||
def __init__(self, low, high, flat):
|
||||
self.flat = flat
|
||||
self.categoricals = list(map(CategoricalPd, tf.split(flat, nvec, axis=-1)))
|
||||
self.low = tf.constant(low, dtype=tf.int32)
|
||||
self.categoricals = list(map(CategoricalPd, tf.split(flat, high - low + 1, axis=len(flat.get_shape()) - 1)))
|
||||
def flatparam(self):
|
||||
return self.flat
|
||||
def mode(self):
|
||||
return tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1), tf.int32)
|
||||
return self.low + tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1), tf.int32)
|
||||
def neglogp(self, x):
|
||||
return tf.add_n([p.neglogp(px) for p, px in zip(self.categoricals, tf.unstack(x, axis=-1))])
|
||||
return tf.add_n([p.neglogp(px) for p, px in zip(self.categoricals, tf.unstack(x - self.low, axis=len(x.get_shape()) - 1))])
|
||||
def kl(self, other):
|
||||
return tf.add_n([p.kl(q) for p, q in zip(self.categoricals, other.categoricals)])
|
||||
return tf.add_n([
|
||||
p.kl(q) for p, q in zip(self.categoricals, other.categoricals)
|
||||
])
|
||||
def entropy(self):
|
||||
return tf.add_n([p.entropy() for p in self.categoricals])
|
||||
def sample(self):
|
||||
return tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1), tf.int32)
|
||||
return self.low + tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1), tf.int32)
|
||||
@classmethod
|
||||
def fromflat(cls, flat):
|
||||
raise NotImplementedError
|
||||
@@ -186,14 +191,14 @@ class DiagGaussianPd(Pd):
|
||||
def mode(self):
|
||||
return self.mean
|
||||
def neglogp(self, x):
|
||||
return 0.5 * tf.reduce_sum(tf.square((x - self.mean) / self.std), axis=-1) \
|
||||
return 0.5 * U.sum(tf.square((x - self.mean) / self.std), axis=-1) \
|
||||
+ 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[-1]) \
|
||||
+ tf.reduce_sum(self.logstd, axis=-1)
|
||||
+ U.sum(self.logstd, axis=-1)
|
||||
def kl(self, other):
|
||||
assert isinstance(other, DiagGaussianPd)
|
||||
return tf.reduce_sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / (2.0 * tf.square(other.std)) - 0.5, axis=-1)
|
||||
return U.sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / (2.0 * tf.square(other.std)) - 0.5, axis=-1)
|
||||
def entropy(self):
|
||||
return tf.reduce_sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), axis=-1)
|
||||
return U.sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), axis=-1)
|
||||
def sample(self):
|
||||
return self.mean + self.std * tf.random_normal(tf.shape(self.mean))
|
||||
@classmethod
|
||||
@@ -209,11 +214,11 @@ class BernoulliPd(Pd):
|
||||
def mode(self):
|
||||
return tf.round(self.ps)
|
||||
def neglogp(self, x):
|
||||
return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=-1)
|
||||
return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=-1)
|
||||
def kl(self, other):
|
||||
return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=-1) - tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1)
|
||||
return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=-1) - U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1)
|
||||
def entropy(self):
|
||||
return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1)
|
||||
return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1)
|
||||
def sample(self):
|
||||
u = tf.random_uniform(tf.shape(self.ps))
|
||||
return tf.to_float(math_ops.less(u, self.ps))
|
||||
@@ -229,7 +234,7 @@ def make_pdtype(ac_space):
|
||||
elif isinstance(ac_space, spaces.Discrete):
|
||||
return CategoricalPdType(ac_space.n)
|
||||
elif isinstance(ac_space, spaces.MultiDiscrete):
|
||||
return MultiCategoricalPdType(ac_space.nvec)
|
||||
return MultiCategoricalPdType(ac_space.low, ac_space.high)
|
||||
elif isinstance(ac_space, spaces.MultiBinary):
|
||||
return BernoulliPdType(ac_space.n)
|
||||
else:
|
||||
@@ -254,11 +259,6 @@ def test_probtypes():
|
||||
categorical = CategoricalPdType(pdparam_categorical.size) #pylint: disable=E1101
|
||||
validate_probtype(categorical, pdparam_categorical)
|
||||
|
||||
nvec = [1,2,3]
|
||||
pdparam_multicategorical = np.array([-.2, .3, .5, .1, 1, -.1])
|
||||
multicategorical = MultiCategoricalPdType(nvec) #pylint: disable=E1101
|
||||
validate_probtype(multicategorical, pdparam_multicategorical)
|
||||
|
||||
pdparam_bernoulli = np.array([-.2, .3, .5])
|
||||
bernoulli = BernoulliPdType(pdparam_bernoulli.size) #pylint: disable=E1101
|
||||
validate_probtype(bernoulli, pdparam_bernoulli)
|
||||
@@ -270,10 +270,10 @@ def validate_probtype(probtype, pdparam):
|
||||
Mval = np.repeat(pdparam[None, :], N, axis=0)
|
||||
M = probtype.param_placeholder([N])
|
||||
X = probtype.sample_placeholder([N])
|
||||
pd = probtype.pdfromflat(M)
|
||||
pd = probtype.pdclass()(M)
|
||||
calcloglik = U.function([X, M], pd.logp(X))
|
||||
calcent = U.function([M], pd.entropy())
|
||||
Xval = tf.get_default_session().run(pd.sample(), feed_dict={M:Mval})
|
||||
Xval = U.eval(pd.sample(), feed_dict={M:Mval})
|
||||
logliks = calcloglik(Xval, Mval)
|
||||
entval_ll = - logliks.mean() #pylint: disable=E1101
|
||||
entval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101
|
||||
@@ -282,7 +282,7 @@ def validate_probtype(probtype, pdparam):
|
||||
|
||||
# Check to see if kldiv[p,q] = - ent[p] - E_p[log q]
|
||||
M2 = probtype.param_placeholder([N])
|
||||
pd2 = probtype.pdfromflat(M2)
|
||||
pd2 = probtype.pdclass()(M2)
|
||||
q = pdparam + np.random.randn(pdparam.size) * 0.1
|
||||
Mval2 = np.repeat(q[None, :], N, axis=0)
|
||||
calckl = U.function([M, M2], pd.kl(pd2))
|
||||
@@ -291,5 +291,3 @@ def validate_probtype(probtype, pdparam):
|
||||
klval_ll = - entval - logliks.mean() #pylint: disable=E1101
|
||||
klval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101
|
||||
assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr # within 3 sigmas
|
||||
print('ok on', probtype, pdparam)
|
||||
|
||||
|
@@ -224,7 +224,6 @@ def relatively_safe_pickle_dump(obj, path, compression=False):
# Using gzip here would be simpler, but the size is limited to 2GB
with tempfile.NamedTemporaryFile() as uncompressed_file:
pickle.dump(obj, uncompressed_file)
uncompressed_file.file.flush()
with zipfile.ZipFile(temp_storage, "w", compression=zipfile.ZIP_DEFLATED) as myzip:
myzip.write(uncompressed_file.name, "data")
else:
@@ -53,7 +53,7 @@ class MpiAdam(object):
def test_MpiAdam():
np.random.seed(0)
tf.set_random_seed(0)

a = tf.Variable(np.random.randn(3).astype('float32'))
b = tf.Variable(np.random.randn(2,5).astype('float32'))
loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b))
@@ -2,41 +2,29 @@ from mpi4py import MPI
|
||||
import numpy as np
|
||||
from baselines.common import zipsame
|
||||
|
||||
def mpi_mean(x, axis=0, comm=None, keepdims=False):
|
||||
x = np.asarray(x)
|
||||
assert x.ndim > 0
|
||||
if comm is None: comm = MPI.COMM_WORLD
|
||||
xsum = x.sum(axis=axis, keepdims=keepdims)
|
||||
n = xsum.size
|
||||
localsum = np.zeros(n+1, x.dtype)
|
||||
localsum[:n] = xsum.ravel()
|
||||
localsum[n] = x.shape[axis]
|
||||
globalsum = np.zeros_like(localsum)
|
||||
comm.Allreduce(localsum, globalsum, op=MPI.SUM)
|
||||
return globalsum[:n].reshape(xsum.shape) / globalsum[n], globalsum[n]
|
||||
|
||||
def mpi_moments(x, axis=0, comm=None, keepdims=False):
|
||||
x = np.asarray(x)
|
||||
assert x.ndim > 0
|
||||
mean, count = mpi_mean(x, axis=axis, comm=comm, keepdims=True)
|
||||
sqdiffs = np.square(x - mean)
|
||||
meansqdiff, count1 = mpi_mean(sqdiffs, axis=axis, comm=comm, keepdims=True)
|
||||
assert count1 == count
|
||||
std = np.sqrt(meansqdiff)
|
||||
if not keepdims:
|
||||
newshape = mean.shape[:axis] + mean.shape[axis+1:]
|
||||
mean = mean.reshape(newshape)
|
||||
std = std.reshape(newshape)
|
||||
def mpi_moments(x, axis=0):
|
||||
x = np.asarray(x, dtype='float64')
|
||||
newshape = list(x.shape)
|
||||
newshape.pop(axis)
|
||||
n = np.prod(newshape,dtype=int)
|
||||
totalvec = np.zeros(n*2+1, 'float64')
|
||||
addvec = np.concatenate([x.sum(axis=axis).ravel(),
|
||||
np.square(x).sum(axis=axis).ravel(),
|
||||
np.array([x.shape[axis]],dtype='float64')])
|
||||
MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM)
|
||||
sum = totalvec[:n]
|
||||
sumsq = totalvec[n:2*n]
|
||||
count = totalvec[2*n]
|
||||
if count == 0:
|
||||
mean = np.empty(newshape); mean[:] = np.nan
|
||||
std = np.empty(newshape); std[:] = np.nan
|
||||
else:
|
||||
mean = sum/count
|
||||
std = np.sqrt(np.maximum(sumsq/count - np.square(mean),0))
|
||||
return mean, std, count
|
||||
|
||||
|
||||
def test_runningmeanstd():
|
||||
import subprocess
|
||||
subprocess.check_call(['mpirun', '-np', '3',
|
||||
'python','-c',
|
||||
'from baselines.common.mpi_moments import _helper_runningmeanstd; _helper_runningmeanstd()'])
|
||||
|
||||
def _helper_runningmeanstd():
|
||||
comm = MPI.COMM_WORLD
|
||||
np.random.seed(0)
|
||||
for (triple,axis) in [
|
||||
@@ -57,3 +45,6 @@ def _helper_runningmeanstd():
|
||||
assert np.allclose(a1, a2)
|
||||
print("ok!")
|
||||
|
||||
if __name__ == "__main__":
|
||||
#mpirun -np 3 python <script>
|
||||
test_runningmeanstd()
|
@@ -57,7 +57,7 @@ def test_runningmeanstd():
|
||||
rms.update(x1)
|
||||
rms.update(x2)
|
||||
rms.update(x3)
|
||||
ms2 = [rms.mean.eval(), rms.std.eval()]
|
||||
ms2 = U.eval([rms.mean, rms.std])
|
||||
|
||||
assert np.allclose(ms1, ms2)
|
||||
|
||||
@@ -94,11 +94,11 @@ def test_dist():
|
||||
|
||||
assert checkallclose(
|
||||
bigvec.mean(axis=0),
|
||||
rms.mean.eval(),
|
||||
U.eval(rms.mean)
|
||||
)
|
||||
assert checkallclose(
|
||||
bigvec.std(axis=0),
|
||||
rms.std.eval(),
|
||||
U.eval(rms.std)
|
||||
)
|
||||
|
||||
|
||||
|
@@ -1,46 +0,0 @@
|
||||
import numpy as np
|
||||
class RunningMeanStd(object):
|
||||
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
|
||||
def __init__(self, epsilon=1e-4, shape=()):
|
||||
self.mean = np.zeros(shape, 'float64')
|
||||
self.var = np.ones(shape, 'float64')
|
||||
self.count = epsilon
|
||||
|
||||
def update(self, x):
|
||||
batch_mean = np.mean(x, axis=0)
|
||||
batch_var = np.var(x, axis=0)
|
||||
batch_count = x.shape[0]
|
||||
self.update_from_moments(batch_mean, batch_var, batch_count)
|
||||
|
||||
def update_from_moments(self, batch_mean, batch_var, batch_count):
|
||||
delta = batch_mean - self.mean
|
||||
tot_count = self.count + batch_count
|
||||
|
||||
new_mean = self.mean + delta * batch_count / tot_count
|
||||
m_a = self.var * (self.count)
|
||||
m_b = batch_var * (batch_count)
|
||||
M2 = m_a + m_b + np.square(delta) * self.count * batch_count / (self.count + batch_count)
|
||||
new_var = M2 / (self.count + batch_count)
|
||||
|
||||
new_count = batch_count + self.count
|
||||
|
||||
self.mean = new_mean
|
||||
self.var = new_var
|
||||
self.count = new_count
|
||||
|
||||
def test_runningmeanstd():
|
||||
for (x1, x2, x3) in [
|
||||
(np.random.randn(3), np.random.randn(4), np.random.randn(5)),
|
||||
(np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)),
|
||||
]:
|
||||
|
||||
rms = RunningMeanStd(epsilon=0.0, shape=x1.shape[1:])
|
||||
|
||||
x = np.concatenate([x1, x2, x3], axis=0)
|
||||
ms1 = [x.mean(axis=0), x.var(axis=0)]
|
||||
rms.update(x1)
|
||||
rms.update(x2)
|
||||
rms.update(x3)
|
||||
ms2 = [rms.mean, rms.var]
|
||||
|
||||
assert np.allclose(ms1, ms2)
|
@@ -3,39 +3,67 @@ import tensorflow as tf
|
||||
from baselines.common.tf_util import (
|
||||
function,
|
||||
initialize,
|
||||
set_value,
|
||||
single_threaded_session
|
||||
)
|
||||
|
||||
|
||||
def test_set_value():
|
||||
a = tf.Variable(42.)
|
||||
with single_threaded_session():
|
||||
set_value(a, 5)
|
||||
assert a.eval() == 5
|
||||
g = tf.get_default_graph()
|
||||
g.finalize()
|
||||
set_value(a, 6)
|
||||
assert a.eval() == 6
|
||||
|
||||
# test the test
|
||||
try:
|
||||
assert a.eval() == 7
|
||||
except AssertionError:
|
||||
pass
|
||||
else:
|
||||
assert False, "assertion should have failed"
|
||||
|
||||
|
||||
def test_function():
|
||||
with tf.Graph().as_default():
|
||||
x = tf.placeholder(tf.int32, (), name="x")
|
||||
y = tf.placeholder(tf.int32, (), name="y")
|
||||
z = 3 * x + 2 * y
|
||||
lin = function([x, y], z, givens={y: 0})
|
||||
tf.reset_default_graph()
|
||||
x = tf.placeholder(tf.int32, (), name="x")
|
||||
y = tf.placeholder(tf.int32, (), name="y")
|
||||
z = 3 * x + 2 * y
|
||||
lin = function([x, y], z, givens={y: 0})
|
||||
|
||||
with single_threaded_session():
|
||||
initialize()
|
||||
with single_threaded_session():
|
||||
initialize()
|
||||
|
||||
assert lin(2) == 6
|
||||
assert lin(2, 2) == 10
|
||||
assert lin(2) == 6
|
||||
assert lin(x=3) == 9
|
||||
assert lin(2, 2) == 10
|
||||
assert lin(x=2, y=3) == 12
|
||||
|
||||
|
||||
def test_multikwargs():
|
||||
with tf.Graph().as_default():
|
||||
x = tf.placeholder(tf.int32, (), name="x")
|
||||
with tf.variable_scope("other"):
|
||||
x2 = tf.placeholder(tf.int32, (), name="x")
|
||||
z = 3 * x + 2 * x2
|
||||
tf.reset_default_graph()
|
||||
x = tf.placeholder(tf.int32, (), name="x")
|
||||
with tf.variable_scope("other"):
|
||||
x2 = tf.placeholder(tf.int32, (), name="x")
|
||||
z = 3 * x + 2 * x2
|
||||
|
||||
lin = function([x, x2], z, givens={x2: 0})
|
||||
with single_threaded_session():
|
||||
initialize()
|
||||
assert lin(2) == 6
|
||||
assert lin(2, 2) == 10
|
||||
expt_caught = False
|
||||
lin = function([x, x2], z, givens={x2: 0})
|
||||
with single_threaded_session():
|
||||
initialize()
|
||||
assert lin(2) == 6
|
||||
assert lin(2, 2) == 10
|
||||
expt_caught = False
|
||||
try:
|
||||
lin(x=2)
|
||||
except AssertionError:
|
||||
expt_caught = True
|
||||
assert expt_caught
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
test_set_value()
|
||||
test_function()
|
||||
test_multikwargs()
|
||||
|
@@ -1,10 +1,45 @@
|
||||
import numpy as np
|
||||
import tensorflow as tf # pylint: ignore-module
|
||||
import builtins
|
||||
import functools
|
||||
import copy
|
||||
import os
|
||||
import functools
|
||||
import collections
|
||||
import multiprocessing
|
||||
|
||||
# ================================================================
|
||||
# Make consistent with numpy
|
||||
# ================================================================
|
||||
|
||||
clip = tf.clip_by_value
|
||||
|
||||
def sum(x, axis=None, keepdims=False):
|
||||
axis = None if axis is None else [axis]
|
||||
return tf.reduce_sum(x, axis=axis, keep_dims=keepdims)
|
||||
|
||||
def mean(x, axis=None, keepdims=False):
|
||||
axis = None if axis is None else [axis]
|
||||
return tf.reduce_mean(x, axis=axis, keep_dims=keepdims)
|
||||
|
||||
def var(x, axis=None, keepdims=False):
|
||||
meanx = mean(x, axis=axis, keepdims=keepdims)
|
||||
return mean(tf.square(x - meanx), axis=axis, keepdims=keepdims)
|
||||
|
||||
def std(x, axis=None, keepdims=False):
|
||||
return tf.sqrt(var(x, axis=axis, keepdims=keepdims))
|
||||
|
||||
def max(x, axis=None, keepdims=False):
|
||||
axis = None if axis is None else [axis]
|
||||
return tf.reduce_max(x, axis=axis, keep_dims=keepdims)
|
||||
|
||||
def min(x, axis=None, keepdims=False):
|
||||
axis = None if axis is None else [axis]
|
||||
return tf.reduce_min(x, axis=axis, keep_dims=keepdims)
|
||||
|
||||
def concatenate(arrs, axis=0):
|
||||
return tf.concat(axis=axis, values=arrs)
|
||||
|
||||
def argmax(x, axis=None):
|
||||
return tf.argmax(x, axis=axis)
|
||||
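A brief usage sketch of the numpy-style wrappers above; it assumes this file is importable as baselines.common.tf_util and uses the eval/single_threaded_session helpers defined further down:

import numpy as np
import tensorflow as tf
import baselines.common.tf_util as U

x_np = np.random.randn(4, 3).astype('float32')
x = tf.constant(x_np)
with U.single_threaded_session():
    # axis/keepdims behave like their numpy counterparts
    assert np.allclose(U.eval(U.mean(x, axis=0)), x_np.mean(axis=0), atol=1e-5)
    assert np.allclose(U.eval(U.std(x, axis=1, keepdims=True)),
                       x_np.std(axis=1, keepdims=True), atol=1e-5)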
|
||||
def switch(condition, then_expression, else_expression):
|
||||
"""Switches between two operations depending on a scalar value (int or bool).
|
||||
@@ -27,11 +62,105 @@ def switch(condition, then_expression, else_expression):
|
||||
# Extras
|
||||
# ================================================================
|
||||
|
||||
def l2loss(params):
|
||||
if len(params) == 0:
|
||||
return tf.constant(0.0)
|
||||
else:
|
||||
return tf.add_n([sum(tf.square(p)) for p in params])
|
||||
|
||||
def lrelu(x, leak=0.2):
|
||||
f1 = 0.5 * (1 + leak)
|
||||
f2 = 0.5 * (1 - leak)
|
||||
return f1 * x + f2 * abs(x)
|
||||
|
||||
def categorical_sample_logits(X):
|
||||
# https://github.com/tensorflow/tensorflow/issues/456
|
||||
U = tf.random_uniform(tf.shape(X))
|
||||
return argmax(X - tf.log(-tf.log(U)), axis=1)
|
||||
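categorical_sample_logits relies on the Gumbel-max trick: adding -log(-log(U)) noise to the logits and taking the argmax draws a sample from softmax(logits). A numpy sketch of the same idea (illustrative only):

import numpy as np

np.random.seed(0)
logits = np.array([1.0, 2.0, 0.5])
probs = np.exp(logits) / np.exp(logits).sum()
u = np.random.uniform(size=(100000, logits.size))
samples = np.argmax(logits - np.log(-np.log(u)), axis=1)            # Gumbel-max sampling
freqs = np.bincount(samples, minlength=logits.size) / samples.size
assert np.allclose(freqs, probs, atol=0.01)                         # frequencies match softmax probs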
|
||||
# ================================================================
|
||||
# Inputs
|
||||
# ================================================================
|
||||
|
||||
def is_placeholder(x):
|
||||
return type(x) is tf.Tensor and len(x.op.inputs) == 0
|
||||
|
||||
class TfInput(object):
|
||||
def __init__(self, name="(unnamed)"):
|
||||
"""Generalized Tensorflow placeholder. The main differences are:
|
||||
- possibly uses multiple placeholders internally and returns multiple values
|
||||
- can apply light postprocessing to the value fed to the placeholder.
|
||||
"""
|
||||
self.name = name
|
||||
|
||||
def get(self):
|
||||
"""Return the tf variable(s) representing the possibly postprocessed value
|
||||
of placeholder(s).
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def make_feed_dict(self, data):
|
||||
"""Given data input it to the placeholder(s)."""
|
||||
raise NotImplementedError()
|
||||
|
||||
class PlacholderTfInput(TfInput):
|
||||
def __init__(self, placeholder):
|
||||
"""Wrapper for regular tensorflow placeholder."""
|
||||
super().__init__(placeholder.name)
|
||||
self._placeholder = placeholder
|
||||
|
||||
def get(self):
|
||||
return self._placeholder
|
||||
|
||||
def make_feed_dict(self, data):
|
||||
return {self._placeholder: data}
|
||||
|
||||
class BatchInput(PlacholderTfInput):
|
||||
def __init__(self, shape, dtype=tf.float32, name=None):
|
||||
"""Creates a placeholder for a batch of tensors of a given shape and dtype
|
||||
|
||||
Parameters
|
||||
----------
|
||||
shape: [int]
|
||||
shape of a single element of the batch
|
||||
dtype: tf.dtype
|
||||
number representation used for tensor contents
|
||||
name: str
|
||||
name of the underlying placeholder
|
||||
"""
|
||||
super().__init__(tf.placeholder(dtype, [None] + list(shape), name=name))
|
||||
|
||||
class Uint8Input(PlacholderTfInput):
|
||||
def __init__(self, shape, name=None):
|
||||
"""Takes input in uint8 format which is cast to float32 and divided by 255
|
||||
before passing it to the model.
|
||||
|
||||
On GPU this ensures lower data transfer times.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
shape: [int]
|
||||
shape of the tensor.
|
||||
name: str
|
||||
name of the underlying placeholder
|
||||
"""
|
||||
|
||||
super().__init__(tf.placeholder(tf.uint8, [None] + list(shape), name=name))
|
||||
self._shape = shape
|
||||
self._output = tf.cast(super().get(), tf.float32) / 255.0
|
||||
|
||||
def get(self):
|
||||
return self._output
|
||||
|
||||
def ensure_tf_input(thing):
|
||||
"""Takes either tf.placeholder of TfInput and outputs equivalent TfInput"""
|
||||
if isinstance(thing, TfInput):
|
||||
return thing
|
||||
elif is_placeholder(thing):
|
||||
return PlacholderTfInput(thing)
|
||||
else:
|
||||
raise ValueError("Must be a placeholder or TfInput")
|
||||
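A hedged usage sketch of the input wrappers above (again assuming the baselines.common.tf_util module path): Uint8Input keeps frames in uint8 across the feed_dict and hands the model float32 values scaled to [0, 1].

import numpy as np
import tensorflow as tf
import baselines.common.tf_util as U

obs_input = U.Uint8Input([84, 84, 4], name="obs")
frames = np.zeros((2, 84, 84, 4), dtype=np.uint8)         # a fake batch of stacked frames
with U.single_threaded_session():
    scaled = tf.get_default_session().run(obs_input.get(),
                                          feed_dict=obs_input.make_feed_dict(frames))
assert scaled.dtype == np.float32 and scaled.max() <= 1.0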
|
||||
# ================================================================
|
||||
# Mathematical utils
|
||||
# ================================================================
|
||||
@@ -44,42 +173,78 @@ def huber_loss(x, delta=1.0):
|
||||
delta * (tf.abs(x) - 0.5 * delta)
|
||||
)
|
||||
|
||||
# ================================================================
|
||||
# Optimizer utils
|
||||
# ================================================================
|
||||
|
||||
def minimize_and_clip(optimizer, objective, var_list, clip_val=10):
|
||||
"""Minimized `objective` using `optimizer` w.r.t. variables in
|
||||
`var_list` while ensuring the norm of the gradients for each
|
||||
variable is clipped to `clip_val`
|
||||
"""
|
||||
gradients = optimizer.compute_gradients(objective, var_list=var_list)
|
||||
for i, (grad, var) in enumerate(gradients):
|
||||
if grad is not None:
|
||||
gradients[i] = (tf.clip_by_norm(grad, clip_val), var)
|
||||
return optimizer.apply_gradients(gradients)
|
||||
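Usage sketch for minimize_and_clip, with an illustrative loss and optimizer:

import tensorflow as tf
import baselines.common.tf_util as U

w = tf.Variable(tf.ones([10]))
loss = tf.reduce_sum(tf.square(w))
# per-variable gradient norms are clipped to clip_val before the update is applied
train_op = U.minimize_and_clip(tf.train.AdamOptimizer(1e-3), loss,
                               var_list=[w], clip_val=10)
with U.single_threaded_session():
    U.initialize()
    tf.get_default_session().run(train_op)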
|
||||
# ================================================================
|
||||
# Global session
|
||||
# ================================================================
|
||||
|
||||
def make_session(num_cpu=None, make_default=False):
|
||||
def get_session():
|
||||
"""Returns recently made Tensorflow session"""
|
||||
return tf.get_default_session()
|
||||
|
||||
def make_session(num_cpu):
|
||||
"""Returns a session that will use <num_cpu> CPU's only"""
|
||||
if num_cpu is None:
|
||||
num_cpu = int(os.getenv('RCALL_NUM_CPU', multiprocessing.cpu_count()))
|
||||
tf_config = tf.ConfigProto(
|
||||
inter_op_parallelism_threads=num_cpu,
|
||||
intra_op_parallelism_threads=num_cpu)
|
||||
tf_config.gpu_options.allocator_type = 'BFC'
|
||||
if make_default:
|
||||
return tf.InteractiveSession(config=tf_config)
|
||||
else:
|
||||
return tf.Session(config=tf_config)
|
||||
return tf.Session(config=tf_config)
|
||||
|
||||
def single_threaded_session():
|
||||
"""Returns a session which will only use a single CPU"""
|
||||
return make_session(num_cpu=1)
|
||||
|
||||
def in_session(f):
|
||||
@functools.wraps(f)
|
||||
def newfunc(*args, **kwargs):
|
||||
with tf.Session():
|
||||
f(*args, **kwargs)
|
||||
return newfunc
|
||||
return make_session(1)
|
||||
|
||||
ALREADY_INITIALIZED = set()
|
||||
|
||||
def initialize():
|
||||
"""Initialize all the uninitialized variables in the global scope."""
|
||||
new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED
|
||||
tf.get_default_session().run(tf.variables_initializer(new_variables))
|
||||
get_session().run(tf.variables_initializer(new_variables))
|
||||
ALREADY_INITIALIZED.update(new_variables)
|
||||
|
||||
def eval(expr, feed_dict=None):
|
||||
if feed_dict is None:
|
||||
feed_dict = {}
|
||||
return get_session().run(expr, feed_dict=feed_dict)
|
||||
|
||||
VALUE_SETTERS = collections.OrderedDict()
|
||||
|
||||
def set_value(v, val):
|
||||
global VALUE_SETTERS
|
||||
if v in VALUE_SETTERS:
|
||||
set_op, set_endpoint = VALUE_SETTERS[v]
|
||||
else:
|
||||
set_endpoint = tf.placeholder(v.dtype)
|
||||
set_op = v.assign(set_endpoint)
|
||||
VALUE_SETTERS[v] = (set_op, set_endpoint)
|
||||
get_session().run(set_op, feed_dict={set_endpoint: val})
|
||||
|
||||
# ================================================================
|
||||
# Saving variables
|
||||
# ================================================================
|
||||
|
||||
def load_state(fname):
|
||||
saver = tf.train.Saver()
|
||||
saver.restore(get_session(), fname)
|
||||
|
||||
def save_state(fname):
|
||||
os.makedirs(os.path.dirname(fname), exist_ok=True)
|
||||
saver = tf.train.Saver()
|
||||
saver.save(get_session(), fname)
|
||||
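Usage sketch for the checkpoint helpers above; the path is illustrative only:

import tensorflow as tf
import baselines.common.tf_util as U

v = tf.Variable(3.0, name="v")
with U.single_threaded_session():
    U.initialize()
    U.save_state("/tmp/tf_util_demo/model")   # writes checkpoint files next to this prefix
    U.load_state("/tmp/tf_util_demo/model")   # restores every saved variable into the session
    assert v.eval() == 3.0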
|
||||
# ================================================================
|
||||
# Model components
|
||||
# ================================================================
|
||||
@@ -120,6 +285,36 @@ def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME",
|
||||
|
||||
return tf.nn.conv2d(x, w, stride_shape, pad) + b
|
||||
|
||||
def dense(x, size, name, weight_init=None, bias=True):
|
||||
w = tf.get_variable(name + "/w", [x.get_shape()[1], size], initializer=weight_init)
|
||||
ret = tf.matmul(x, w)
|
||||
if bias:
|
||||
b = tf.get_variable(name + "/b", [size], initializer=tf.zeros_initializer())
|
||||
return ret + b
|
||||
else:
|
||||
return ret
|
||||
|
||||
def wndense(x, size, name, init_scale=1.0):
|
||||
v = tf.get_variable(name + "/V", [int(x.get_shape()[1]), size],
|
||||
initializer=tf.random_normal_initializer(0, 0.05))
|
||||
g = tf.get_variable(name + "/g", [size], initializer=tf.constant_initializer(init_scale))
|
||||
b = tf.get_variable(name + "/b", [size], initializer=tf.constant_initializer(0.0))
|
||||
|
||||
# use weight normalization (Salimans & Kingma, 2016)
|
||||
x = tf.matmul(x, v)
|
||||
scaler = g / tf.sqrt(sum(tf.square(v), axis=0, keepdims=True))
|
||||
return tf.reshape(scaler, [1, size]) * x + tf.reshape(b, [1, size])
|
||||
|
||||
def densenobias(x, size, name, weight_init=None):
|
||||
return dense(x, size, name, weight_init=weight_init, bias=False)
|
||||
|
||||
def dropout(x, pkeep, phase=None, mask=None):
|
||||
mask = tf.floor(pkeep + tf.random_uniform(tf.shape(x))) if mask is None else mask
|
||||
if phase is None:
|
||||
return mask * x
|
||||
else:
|
||||
return switch(phase, mask * x, pkeep * x)
|
||||
|
||||
# ================================================================
|
||||
# Theano-like Function
|
||||
# ================================================================
|
||||
@@ -149,7 +344,7 @@ def function(inputs, outputs, updates=None, givens=None):
|
||||
|
||||
Parameters
|
||||
----------
|
||||
inputs: [tf.placeholder, tf.constant, or object with make_feed_dict method]
|
||||
inputs: [tf.placeholder or TfInput]
|
||||
list of input arguments
|
||||
outputs: [tf.Variable] or tf.Variable
|
||||
list of outputs or a single output to be returned from function. Returned
|
||||
@@ -164,36 +359,183 @@ def function(inputs, outputs, updates=None, givens=None):
|
||||
f = _Function(inputs, [outputs], updates, givens=givens)
|
||||
return lambda *args, **kwargs: f(*args, **kwargs)[0]
|
||||
|
||||
|
||||
class _Function(object):
|
||||
def __init__(self, inputs, outputs, updates, givens):
|
||||
def __init__(self, inputs, outputs, updates, givens, check_nan=False):
|
||||
for inpt in inputs:
|
||||
if not hasattr(inpt, 'make_feed_dict') and not (type(inpt) is tf.Tensor and len(inpt.op.inputs) == 0):
|
||||
assert False, "inputs should all be placeholders, constants, or have a make_feed_dict method"
|
||||
if not issubclass(type(inpt), TfInput):
|
||||
assert len(inpt.op.inputs) == 0, "inputs should all be placeholders or instances of baselines.common.TfInput"
|
||||
self.inputs = inputs
|
||||
updates = updates or []
|
||||
self.update_group = tf.group(*updates)
|
||||
self.outputs_update = list(outputs) + [self.update_group]
|
||||
self.givens = {} if givens is None else givens
|
||||
self.check_nan = check_nan
|
||||
|
||||
def _feed_input(self, feed_dict, inpt, value):
|
||||
if hasattr(inpt, 'make_feed_dict'):
|
||||
if issubclass(type(inpt), TfInput):
|
||||
feed_dict.update(inpt.make_feed_dict(value))
|
||||
else:
|
||||
elif is_placeholder(inpt):
|
||||
feed_dict[inpt] = value
|
||||
|
||||
def __call__(self, *args):
|
||||
def __call__(self, *args, **kwargs):
|
||||
assert len(args) <= len(self.inputs), "Too many arguments provided"
|
||||
feed_dict = {}
|
||||
# Update the args
|
||||
for inpt, value in zip(self.inputs, args):
|
||||
self._feed_input(feed_dict, inpt, value)
|
||||
# Update the kwargs
|
||||
kwargs_passed_inpt_names = set()
|
||||
for inpt in self.inputs[len(args):]:
|
||||
inpt_name = inpt.name.split(':')[0]
|
||||
inpt_name = inpt_name.split('/')[-1]
|
||||
assert inpt_name not in kwargs_passed_inpt_names, \
|
||||
"this function has two arguments with the same name \"{}\", so kwargs cannot be used.".format(inpt_name)
|
||||
if inpt_name in kwargs:
|
||||
kwargs_passed_inpt_names.add(inpt_name)
|
||||
self._feed_input(feed_dict, inpt, kwargs.pop(inpt_name))
|
||||
else:
|
||||
assert inpt in self.givens, "Missing argument " + inpt_name
|
||||
assert len(kwargs) == 0, "Function got extra arguments " + str(list(kwargs.keys()))
|
||||
# Update feed dict with givens.
|
||||
for inpt in self.givens:
|
||||
feed_dict[inpt] = feed_dict.get(inpt, self.givens[inpt])
|
||||
results = tf.get_default_session().run(self.outputs_update, feed_dict=feed_dict)[:-1]
|
||||
results = get_session().run(self.outputs_update, feed_dict=feed_dict)[:-1]
|
||||
if self.check_nan:
|
||||
if any(np.isnan(r).any() for r in results):
|
||||
raise RuntimeError("Nan detected")
|
||||
return results
|
||||
|
||||
def mem_friendly_function(nondata_inputs, data_inputs, outputs, batch_size):
|
||||
if isinstance(outputs, list):
|
||||
return _MemFriendlyFunction(nondata_inputs, data_inputs, outputs, batch_size)
|
||||
else:
|
||||
f = _MemFriendlyFunction(nondata_inputs, data_inputs, [outputs], batch_size)
|
||||
return lambda *inputs: f(*inputs)[0]
|
||||
|
||||
class _MemFriendlyFunction(object):
|
||||
def __init__(self, nondata_inputs, data_inputs, outputs, batch_size):
|
||||
self.nondata_inputs = nondata_inputs
|
||||
self.data_inputs = data_inputs
|
||||
self.outputs = list(outputs)
|
||||
self.batch_size = batch_size
|
||||
|
||||
def __call__(self, *inputvals):
|
||||
assert len(inputvals) == len(self.nondata_inputs) + len(self.data_inputs)
|
||||
nondata_vals = inputvals[0:len(self.nondata_inputs)]
|
||||
data_vals = inputvals[len(self.nondata_inputs):]
|
||||
feed_dict = dict(zip(self.nondata_inputs, nondata_vals))
|
||||
n = data_vals[0].shape[0]
|
||||
for v in data_vals[1:]:
|
||||
assert v.shape[0] == n
|
||||
for i_start in range(0, n, self.batch_size):
|
||||
slice_vals = [v[i_start:builtins.min(i_start + self.batch_size, n)] for v in data_vals]
|
||||
for (var, val) in zip(self.data_inputs, slice_vals):
|
||||
feed_dict[var] = val
|
||||
results = tf.get_default_session().run(self.outputs, feed_dict=feed_dict)
|
||||
if i_start == 0:
|
||||
sum_results = results
|
||||
else:
|
||||
for i in range(len(results)):
|
||||
sum_results[i] = sum_results[i] + results[i]
|
||||
for i in range(len(results)):
|
||||
sum_results[i] = sum_results[i] / n
|
||||
return sum_results
|
||||
|
||||
# ================================================================
|
||||
# Modules
|
||||
# ================================================================
|
||||
|
||||
class Module(object):
|
||||
def __init__(self, name):
|
||||
self.name = name
|
||||
self.first_time = True
|
||||
self.scope = None
|
||||
self.cache = {}
|
||||
|
||||
def __call__(self, *args):
|
||||
if args in self.cache:
|
||||
print("(%s) retrieving value from cache" % (self.name,))
|
||||
return self.cache[args]
|
||||
with tf.variable_scope(self.name, reuse=not self.first_time):
|
||||
scope = tf.get_variable_scope().name
|
||||
if self.first_time:
|
||||
self.scope = scope
|
||||
print("(%s) running function for the first time" % (self.name,))
|
||||
else:
|
||||
assert self.scope == scope, "Tried calling function with a different scope"
|
||||
print("(%s) running function on new inputs" % (self.name,))
|
||||
self.first_time = False
|
||||
out = self._call(*args)
|
||||
self.cache[args] = out
|
||||
return out
|
||||
|
||||
def _call(self, *args):
|
||||
raise NotImplementedError
|
||||
|
||||
@property
|
||||
def trainable_variables(self):
|
||||
assert self.scope is not None, "need to call module once before getting variables"
|
||||
return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
|
||||
|
||||
@property
|
||||
def variables(self):
|
||||
assert self.scope is not None, "need to call module once before getting variables"
|
||||
return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)
|
||||
|
||||
def module(name):
|
||||
@functools.wraps
|
||||
def wrapper(f):
|
||||
class WrapperModule(Module):
|
||||
def _call(self, *args):
|
||||
return f(*args)
|
||||
return WrapperModule(name)
|
||||
return wrapper
|
||||
|
||||
# ================================================================
|
||||
# Graph traversal
|
||||
# ================================================================
|
||||
|
||||
VARIABLES = {}
|
||||
|
||||
def get_parents(node):
|
||||
return node.op.inputs
|
||||
|
||||
def topsorted(outputs):
|
||||
"""
|
||||
Topological sort via non-recursive depth-first search
|
||||
"""
|
||||
assert isinstance(outputs, (list, tuple))
|
||||
marks = {}
|
||||
out = []
|
||||
stack = [] # pylint: disable=W0621
|
||||
# i: node
|
||||
# jidx = number of parents visited so far from that node
|
||||
# marks: state of each node, which is one of
|
||||
# 0: haven't visited
|
||||
# 1: have visited, but not done visiting parents
|
||||
# 2: done visiting parents
|
||||
for x in outputs:
|
||||
stack.append((x, 0))
|
||||
while stack:
|
||||
(i, jidx) = stack.pop()
|
||||
if jidx == 0:
|
||||
m = marks.get(i, 0)
|
||||
if m == 0:
|
||||
marks[i] = 1
|
||||
elif m == 1:
|
||||
raise ValueError("not a dag")
|
||||
else:
|
||||
continue
|
||||
ps = get_parents(i)
|
||||
if jidx == len(ps):
|
||||
marks[i] = 2
|
||||
out.append(i)
|
||||
else:
|
||||
stack.append((i, jidx + 1))
|
||||
j = ps[jidx]
|
||||
stack.append((j, 0))
|
||||
return out
|
||||
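A small sketch of topsorted on a toy graph: every tensor must come after all of its inputs. Positions are compared by object id to avoid relying on Tensor equality semantics.

import tensorflow as tf
import baselines.common.tf_util as U

a = tf.constant(1.0)
b = tf.constant(2.0)
c = a + b        # depends on a and b
d = c * a        # depends on c and a
order = U.topsorted([d])
pos = {id(t): i for i, t in enumerate(order)}
assert pos[id(a)] < pos[id(c)] < pos[id(d)]
assert pos[id(b)] < pos[id(c)]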
|
||||
# ================================================================
|
||||
# Flat vectors
|
||||
# ================================================================
|
||||
@@ -235,14 +577,88 @@ class SetFromFlat(object):
|
||||
self.op = tf.group(*assigns)
|
||||
|
||||
def __call__(self, theta):
|
||||
tf.get_default_session().run(self.op, feed_dict={self.theta: theta})
|
||||
get_session().run(self.op, feed_dict={self.theta: theta})
|
||||
|
||||
class GetFlat(object):
|
||||
def __init__(self, var_list):
|
||||
self.op = tf.concat(axis=0, values=[tf.reshape(v, [numel(v)]) for v in var_list])
|
||||
|
||||
def __call__(self):
|
||||
return tf.get_default_session().run(self.op)
|
||||
return get_session().run(self.op)
|
||||
|
||||
# ================================================================
|
||||
# Misc
|
||||
# ================================================================
|
||||
|
||||
def fancy_slice_2d(X, inds0, inds1):
|
||||
"""
|
||||
like numpy X[inds0, inds1]
|
||||
XXX this implementation is bad
|
||||
"""
|
||||
inds0 = tf.cast(inds0, tf.int64)
|
||||
inds1 = tf.cast(inds1, tf.int64)
|
||||
shape = tf.cast(tf.shape(X), tf.int64)
|
||||
ncols = shape[1]
|
||||
Xflat = tf.reshape(X, [-1])
|
||||
return tf.gather(Xflat, inds0 * ncols + inds1)
|
||||
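fancy_slice_2d mirrors numpy's X[inds0, inds1] (one element per index pair) by flattening and gathering; a quick equivalence check:

import numpy as np
import tensorflow as tf
import baselines.common.tf_util as U

X_np = np.arange(12).reshape(3, 4).astype('float32')
inds0, inds1 = [0, 2, 1], [3, 0, 2]
with U.single_threaded_session():
    out = U.eval(U.fancy_slice_2d(tf.constant(X_np), inds0, inds1))
assert np.allclose(out, X_np[inds0, inds1])   # -> [3., 8., 6.]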
|
||||
# ================================================================
|
||||
# Scopes
|
||||
# ================================================================
|
||||
|
||||
def scope_vars(scope, trainable_only=False):
|
||||
"""
|
||||
Get variables inside a scope
|
||||
The scope can be specified as a string
|
||||
|
||||
Parameters
|
||||
----------
|
||||
scope: str or VariableScope
|
||||
scope in which the variables reside.
|
||||
trainable_only: bool
|
||||
whether or not to return only the variables that were marked as trainable.
|
||||
|
||||
Returns
|
||||
-------
|
||||
vars: [tf.Variable]
|
||||
list of variables in `scope`.
|
||||
"""
|
||||
return tf.get_collection(
|
||||
tf.GraphKeys.TRAINABLE_VARIABLES if trainable_only else tf.GraphKeys.GLOBAL_VARIABLES,
|
||||
scope=scope if isinstance(scope, str) else scope.name
|
||||
)
|
||||
|
||||
def scope_name():
|
||||
"""Returns the name of current scope as a string, e.g. deepq/q_func"""
|
||||
return tf.get_variable_scope().name
|
||||
|
||||
def absolute_scope_name(relative_scope_name):
|
||||
"""Appends parent scope name to `relative_scope_name`"""
|
||||
return scope_name() + "/" + relative_scope_name
|
||||
|
||||
def lengths_to_mask(lengths_b, max_length):
|
||||
"""
|
||||
Turns a vector of lengths into a boolean mask
|
||||
|
||||
Args:
|
||||
lengths_b: an integer vector of lengths
|
||||
max_length: maximum length to fill the mask
|
||||
|
||||
Returns:
|
||||
a boolean array of shape (batch_size, max_length)
|
||||
row[i] consists of True repeated lengths_b[i] times, followed by False
|
||||
"""
|
||||
lengths_b = tf.convert_to_tensor(lengths_b)
|
||||
assert lengths_b.get_shape().ndims == 1
|
||||
mask_bt = tf.expand_dims(tf.range(max_length), 0) < tf.expand_dims(lengths_b, 1)
|
||||
return mask_bt
|
||||
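For concreteness, the mask produced for lengths [1, 3] with max_length 4:

import numpy as np
import tensorflow as tf
import baselines.common.tf_util as U

with U.single_threaded_session():
    mask = U.eval(U.lengths_to_mask([1, 3], 4))
assert np.array_equal(mask, [[True, False, False, False],
                             [True, True,  True,  False]])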
|
||||
def in_session(f):
|
||||
@functools.wraps(f)
|
||||
def newfunc(*args, **kwargs):
|
||||
with tf.Session():
|
||||
f(*args, **kwargs)
|
||||
return newfunc
|
||||
|
||||
_PLACEHOLDER_CACHE = {} # name -> (placeholder, dtype, shape)
|
||||
|
||||
@@ -262,19 +678,9 @@ def get_placeholder_cached(name):
|
||||
def flattenallbut0(x):
|
||||
return tf.reshape(x, [-1, intprod(x.get_shape().as_list()[1:])])
|
||||
|
||||
|
||||
# ================================================================
|
||||
# Diagnostics
|
||||
# ================================================================
|
||||
|
||||
def display_var_info(vars):
|
||||
from baselines import logger
|
||||
count_params = 0
|
||||
for v in vars:
|
||||
name = v.name
|
||||
if "/Adam" in name or "beta1_power" in name or "beta2_power" in name: continue
|
||||
count_params += np.prod(v.shape.as_list())
|
||||
if "/b:" in name: continue # Wx+b, bias is not interesting to look at => count params, but not print
|
||||
logger.info(" %s%s%s" % (name, " "*(55-len(name)), str(v.shape)))
|
||||
logger.info("Total model parameters: %0.1f million" % (count_params*1e-6))
|
||||
|
||||
def reset():
|
||||
global _PLACEHOLDER_CACHE
|
||||
global VARIABLES
|
||||
_PLACEHOLDER_CACHE = {}
|
||||
VARIABLES = {}
|
||||
tf.reset_default_graph()
|
||||
|
@@ -1,119 +1,19 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from baselines import logger
|
||||
class VecEnv(object):
|
||||
"""
|
||||
Vectorized environment base class
|
||||
"""
|
||||
def step(self, vac):
|
||||
"""
|
||||
Apply sequence of actions to sequence of environments
|
||||
actions -> (observations, rewards, news)
|
||||
|
||||
class AlreadySteppingError(Exception):
|
||||
"""
|
||||
Raised when an asynchronous step is running while
|
||||
step_async() is called again.
|
||||
"""
|
||||
def __init__(self):
|
||||
msg = 'already running an async step'
|
||||
Exception.__init__(self, msg)
|
||||
|
||||
class NotSteppingError(Exception):
|
||||
"""
|
||||
Raised when an asynchronous step is not running but
|
||||
step_wait() is called.
|
||||
"""
|
||||
def __init__(self):
|
||||
msg = 'not running an async step'
|
||||
Exception.__init__(self, msg)
|
||||
|
||||
class VecEnv(ABC):
|
||||
"""
|
||||
An abstract asynchronous, vectorized environment.
|
||||
"""
|
||||
def __init__(self, num_envs, observation_space, action_space):
|
||||
self.num_envs = num_envs
|
||||
self.observation_space = observation_space
|
||||
self.action_space = action_space
|
||||
|
||||
@abstractmethod
|
||||
where 'news' is a boolean vector indicating whether each element is new.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
def reset(self):
|
||||
"""
|
||||
Reset all the environments and return an array of
|
||||
observations, or a tuple of observation arrays.
|
||||
|
||||
If step_async is still doing work, that work will
|
||||
be cancelled and step_wait() should not be called
|
||||
until step_async() is invoked again.
|
||||
Reset all environments
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def step_async(self, actions):
|
||||
"""
|
||||
Tell all the environments to start taking a step
|
||||
with the given actions.
|
||||
Call step_wait() to get the results of the step.
|
||||
|
||||
You should not call this if a step_async run is
|
||||
already pending.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def step_wait(self):
|
||||
"""
|
||||
Wait for the step taken with step_async().
|
||||
|
||||
Returns (obs, rews, dones, infos):
|
||||
- obs: an array of observations, or a tuple of
|
||||
arrays of observations.
|
||||
- rews: an array of rewards
|
||||
- dones: an array of "episode done" booleans
|
||||
- infos: a sequence of info objects
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
raise NotImplementedError
|
||||
def close(self):
|
||||
"""
|
||||
Clean up the environments' resources.
|
||||
"""
|
||||
pass
|
||||
|
||||
def step(self, actions):
|
||||
self.step_async(actions)
|
||||
return self.step_wait()
|
||||
|
||||
def render(self):
|
||||
logger.warn('Render not defined for %s'%self)
|
||||
|
||||
class VecEnvWrapper(VecEnv):
|
||||
def __init__(self, venv, observation_space=None, action_space=None):
|
||||
self.venv = venv
|
||||
VecEnv.__init__(self,
|
||||
num_envs=venv.num_envs,
|
||||
observation_space=observation_space or venv.observation_space,
|
||||
action_space=action_space or venv.action_space)
|
||||
|
||||
def step_async(self, actions):
|
||||
self.venv.step_async(actions)
|
||||
|
||||
@abstractmethod
|
||||
def reset(self):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def step_wait(self):
|
||||
pass
|
||||
|
||||
def close(self):
|
||||
return self.venv.close()
|
||||
|
||||
def render(self):
|
||||
self.venv.render()
|
||||
|
||||
class CloudpickleWrapper(object):
|
||||
"""
|
||||
Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle)
|
||||
"""
|
||||
def __init__(self, x):
|
||||
self.x = x
|
||||
def __getstate__(self):
|
||||
import cloudpickle
|
||||
return cloudpickle.dumps(self.x)
|
||||
def __setstate__(self, ob):
|
||||
import pickle
|
||||
self.x = pickle.loads(ob)
|
||||
pass
|
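Why CloudpickleWrapper exists: multiprocessing serializes its arguments with the standard pickle module, which refuses closures and lambdas like the env_fns handed to SubprocVecEnv, while cloudpickle serializes them by value. A small sketch, assuming the cloudpickle package (imported lazily by the wrapper above) is installed:

import pickle
import cloudpickle

make_env = lambda: "pretend this builds a gym env"
try:
    pickle.dumps(make_env)                        # plain pickle refuses lambdas/closures
except (pickle.PicklingError, AttributeError, TypeError):
    pass
payload = cloudpickle.dumps(make_env)             # cloudpickle handles them
assert pickle.loads(payload)() == "pretend this builds a gym env"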
@@ -1,51 +0,0 @@
|
||||
import numpy as np
|
||||
import gym
|
||||
from . import VecEnv
|
||||
|
||||
class DummyVecEnv(VecEnv):
|
||||
def __init__(self, env_fns):
|
||||
self.envs = [fn() for fn in env_fns]
|
||||
env = self.envs[0]
|
||||
VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space)
|
||||
|
||||
obs_spaces = self.observation_space.spaces if isinstance(self.observation_space, gym.spaces.Tuple) else (self.observation_space,)
|
||||
self.buf_obs = [np.zeros((self.num_envs,) + tuple(s.shape), s.dtype) for s in obs_spaces]
|
||||
self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool)
|
||||
self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32)
|
||||
self.buf_infos = [{} for _ in range(self.num_envs)]
|
||||
self.actions = None
|
||||
|
||||
def step_async(self, actions):
|
||||
self.actions = actions
|
||||
|
||||
def step_wait(self):
|
||||
for i in range(self.num_envs):
|
||||
obs_tuple, self.buf_rews[i], self.buf_dones[i], self.buf_infos[i] = self.envs[i].step(self.actions[i])
|
||||
if self.buf_dones[i]:
|
||||
obs_tuple = self.envs[i].reset()
|
||||
if isinstance(obs_tuple, (tuple, list)):
|
||||
for t,x in enumerate(obs_tuple):
|
||||
self.buf_obs[t][i] = x
|
||||
else:
|
||||
self.buf_obs[0][i] = obs_tuple
|
||||
return (self._obs_from_buf(), np.copy(self.buf_rews), np.copy(self.buf_dones),
|
||||
self.buf_infos.copy())
|
||||
|
||||
def reset(self):
|
||||
for i in range(self.num_envs):
|
||||
obs_tuple = self.envs[i].reset()
|
||||
if isinstance(obs_tuple, (tuple, list)):
|
||||
for t,x in enumerate(obs_tuple):
|
||||
self.buf_obs[t][i] = x
|
||||
else:
|
||||
self.buf_obs[0][i] = obs_tuple
|
||||
return self._obs_from_buf()
|
||||
|
||||
def close(self):
|
||||
return
|
||||
|
||||
def _obs_from_buf(self):
|
||||
if len(self.buf_obs) == 1:
|
||||
return np.copy(self.buf_obs[0])
|
||||
else:
|
||||
return tuple(np.copy(x) for x in self.buf_obs)
|
@@ -1,6 +1,6 @@
|
||||
import numpy as np
|
||||
from multiprocessing import Process, Pipe
|
||||
from baselines.common.vec_env import VecEnv, CloudpickleWrapper
|
||||
from baselines.common.vec_env import VecEnv
|
||||
|
||||
|
||||
def worker(remote, parent_remote, env_fn_wrapper):
|
||||
@@ -23,17 +23,30 @@ def worker(remote, parent_remote, env_fn_wrapper):
|
||||
remote.close()
|
||||
break
|
||||
elif cmd == 'get_spaces':
|
||||
remote.send((env.observation_space, env.action_space))
|
||||
remote.send((env.action_space, env.observation_space))
|
||||
else:
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class CloudpickleWrapper(object):
|
||||
"""
|
||||
Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle)
|
||||
"""
|
||||
def __init__(self, x):
|
||||
self.x = x
|
||||
def __getstate__(self):
|
||||
import cloudpickle
|
||||
return cloudpickle.dumps(self.x)
|
||||
def __setstate__(self, ob):
|
||||
import pickle
|
||||
self.x = pickle.loads(ob)
|
||||
|
||||
|
||||
class SubprocVecEnv(VecEnv):
|
||||
def __init__(self, env_fns, spaces=None):
|
||||
def __init__(self, env_fns):
|
||||
"""
|
||||
env_fns: list of callables that create the gym environments to run in subprocesses
|
||||
"""
|
||||
self.waiting = False
|
||||
self.closed = False
|
||||
nenvs = len(env_fns)
|
||||
self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)])
|
||||
@@ -46,17 +59,13 @@ class SubprocVecEnv(VecEnv):
|
||||
remote.close()
|
||||
|
||||
self.remotes[0].send(('get_spaces', None))
|
||||
observation_space, action_space = self.remotes[0].recv()
|
||||
VecEnv.__init__(self, len(env_fns), observation_space, action_space)
|
||||
self.action_space, self.observation_space = self.remotes[0].recv()
|
||||
|
||||
def step_async(self, actions):
|
||||
|
||||
def step(self, actions):
|
||||
for remote, action in zip(self.remotes, actions):
|
||||
remote.send(('step', action))
|
||||
self.waiting = True
|
||||
|
||||
def step_wait(self):
|
||||
results = [remote.recv() for remote in self.remotes]
|
||||
self.waiting = False
|
||||
obs, rews, dones, infos = zip(*results)
|
||||
return np.stack(obs), np.stack(rews), np.stack(dones), infos
|
||||
|
||||
@@ -73,11 +82,13 @@ class SubprocVecEnv(VecEnv):
|
||||
def close(self):
|
||||
if self.closed:
|
||||
return
|
||||
if self.waiting:
|
||||
for remote in self.remotes:
|
||||
remote.recv()
|
||||
|
||||
for remote in self.remotes:
|
||||
remote.send(('close', None))
|
||||
for p in self.ps:
|
||||
p.join()
|
||||
self.closed = True
|
||||
|
||||
@property
|
||||
def num_envs(self):
|
||||
return len(self.remotes)
|
||||
|
@@ -1,38 +0,0 @@
|
||||
from baselines.common.vec_env import VecEnvWrapper
|
||||
import numpy as np
|
||||
from gym import spaces
|
||||
|
||||
class VecFrameStack(VecEnvWrapper):
|
||||
"""
|
||||
Frame-stacking wrapper for vectorized environments
|
||||
"""
|
||||
def __init__(self, venv, nstack):
|
||||
self.venv = venv
|
||||
self.nstack = nstack
|
||||
wos = venv.observation_space # wrapped ob space
|
||||
low = np.repeat(wos.low, self.nstack, axis=-1)
|
||||
high = np.repeat(wos.high, self.nstack, axis=-1)
|
||||
self.stackedobs = np.zeros((venv.num_envs,)+low.shape, low.dtype)
|
||||
observation_space = spaces.Box(low=low, high=high, dtype=venv.observation_space.dtype)
|
||||
VecEnvWrapper.__init__(self, venv, observation_space=observation_space)
|
||||
|
||||
def step_wait(self):
|
||||
obs, rews, news, infos = self.venv.step_wait()
|
||||
self.stackedobs = np.roll(self.stackedobs, shift=-1, axis=-1)
|
||||
for (i, new) in enumerate(news):
|
||||
if new:
|
||||
self.stackedobs[i] = 0
|
||||
self.stackedobs[..., -obs.shape[-1]:] = obs
|
||||
return self.stackedobs, rews, news, infos
|
||||
|
||||
def reset(self):
|
||||
"""
|
||||
Reset all environments
|
||||
"""
|
||||
obs = self.venv.reset()
|
||||
self.stackedobs[...] = 0
|
||||
self.stackedobs[..., -obs.shape[-1]:] = obs
|
||||
return self.stackedobs
|
||||
|
||||
def close(self):
|
||||
self.venv.close()
|
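A numpy-only sketch of the frame-stacking bookkeeping above: with nstack=4, single-channel (84, 84, 1) observations become (84, 84, 4) observations holding the four most recent frames.

import numpy as np

nenvs, nstack = 2, 4
stackedobs = np.zeros((nenvs, 84, 84, 1 * nstack), dtype=np.float32)
for t in range(5):
    obs = np.full((nenvs, 84, 84, 1), float(t), dtype=np.float32)  # fake frame at step t
    stackedobs = np.roll(stackedobs, shift=-1, axis=-1)            # drop the oldest frame
    stackedobs[..., -obs.shape[-1]:] = obs                         # append the newest one
assert np.allclose(stackedobs[0, 0, 0], [1.0, 2.0, 3.0, 4.0])      # last four frames kept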
@@ -1,47 +0,0 @@
|
||||
from baselines.common.vec_env import VecEnvWrapper
|
||||
from baselines.common.running_mean_std import RunningMeanStd
|
||||
import numpy as np
|
||||
|
||||
class VecNormalize(VecEnvWrapper):
|
||||
"""
|
||||
Vectorized environment wrapper that normalizes observations and rewards using running statistics
|
||||
"""
|
||||
def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
|
||||
VecEnvWrapper.__init__(self, venv)
|
||||
self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
|
||||
self.ret_rms = RunningMeanStd(shape=()) if ret else None
|
||||
self.clipob = clipob
|
||||
self.cliprew = cliprew
|
||||
self.ret = np.zeros(self.num_envs)
|
||||
self.gamma = gamma
|
||||
self.epsilon = epsilon
|
||||
|
||||
def step_wait(self):
|
||||
"""
|
||||
Apply sequence of actions to sequence of environments
|
||||
actions -> (observations, rewards, news)
|
||||
|
||||
where 'news' is a boolean vector indicating whether each element is new.
|
||||
"""
|
||||
obs, rews, news, infos = self.venv.step_wait()
|
||||
self.ret = self.ret * self.gamma + rews
|
||||
obs = self._obfilt(obs)
|
||||
if self.ret_rms:
|
||||
self.ret_rms.update(self.ret)
|
||||
rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew)
|
||||
return obs, rews, news, infos
|
||||
|
||||
def _obfilt(self, obs):
|
||||
if self.ob_rms:
|
||||
self.ob_rms.update(obs)
|
||||
obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob)
|
||||
return obs
|
||||
else:
|
||||
return obs
|
||||
|
||||
def reset(self):
|
||||
"""
|
||||
Reset all environments
|
||||
"""
|
||||
obs = self.venv.reset()
|
||||
return self._obfilt(obs)
|
@@ -9,7 +9,8 @@ from baselines import logger
|
||||
from baselines.common.mpi_adam import MpiAdam
|
||||
import baselines.common.tf_util as U
|
||||
from baselines.common.mpi_running_mean_std import RunningMeanStd
|
||||
from mpi4py import MPI
|
||||
from baselines.ddpg.util import reduce_std, mpi_mean
|
||||
|
||||
|
||||
def normalize(x, stats):
|
||||
if stats is None:
|
||||
@@ -22,13 +23,6 @@ def denormalize(x, stats):
|
||||
return x
|
||||
return x * stats.std + stats.mean
|
||||
|
||||
def reduce_std(x, axis=None, keepdims=False):
|
||||
return tf.sqrt(reduce_var(x, axis=axis, keepdims=keepdims))
|
||||
|
||||
def reduce_var(x, axis=None, keepdims=False):
|
||||
m = tf.reduce_mean(x, axis=axis, keep_dims=True)
|
||||
devs_squared = tf.square(x - m)
|
||||
return tf.reduce_mean(devs_squared, axis=axis, keep_dims=keepdims)
|
||||
|
||||
def get_target_updates(vars, target_vars, tau):
|
||||
logger.info('setting up target updates ...')
|
||||
@@ -204,7 +198,7 @@ class DDPG(object):
|
||||
new_std = self.ret_rms.std
|
||||
self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
|
||||
new_mean = self.ret_rms.mean
|
||||
|
||||
|
||||
self.renormalize_Q_outputs_op = []
|
||||
for vs in [self.critic.output_vars, self.target_critic.output_vars]:
|
||||
assert len(vs) == 2
|
||||
@@ -219,15 +213,15 @@ class DDPG(object):
|
||||
def setup_stats(self):
|
||||
ops = []
|
||||
names = []
|
||||
|
||||
|
||||
if self.normalize_returns:
|
||||
ops += [self.ret_rms.mean, self.ret_rms.std]
|
||||
names += ['ret_rms_mean', 'ret_rms_std']
|
||||
|
||||
|
||||
if self.normalize_observations:
|
||||
ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)]
|
||||
names += ['obs_rms_mean', 'obs_rms_std']
|
||||
|
||||
|
||||
ops += [tf.reduce_mean(self.critic_tf)]
|
||||
names += ['reference_Q_mean']
|
||||
ops += [reduce_std(self.critic_tf)]
|
||||
@@ -237,7 +231,7 @@ class DDPG(object):
|
||||
names += ['reference_actor_Q_mean']
|
||||
ops += [reduce_std(self.critic_with_actor_tf)]
|
||||
names += ['reference_actor_Q_std']
|
||||
|
||||
|
||||
ops += [tf.reduce_mean(self.actor_tf)]
|
||||
names += ['reference_action_mean']
|
||||
ops += [reduce_std(self.actor_tf)]
|
||||
@@ -353,7 +347,7 @@ class DDPG(object):
|
||||
def adapt_param_noise(self):
|
||||
if self.param_noise is None:
|
||||
return 0.
|
||||
|
||||
|
||||
# Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
|
||||
batch = self.memory.sample(batch_size=self.batch_size)
|
||||
self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={
|
||||
@@ -364,7 +358,7 @@ class DDPG(object):
|
||||
self.param_noise_stddev: self.param_noise.current_stddev,
|
||||
})
|
||||
|
||||
mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
|
||||
mean_distance = mpi_mean(distance)
|
||||
self.param_noise.adapt(mean_distance)
|
||||
return mean_distance
|
||||
|
||||
|
@@ -25,6 +25,7 @@ def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
|
||||
# Create envs.
|
||||
env = gym.make(env_id)
|
||||
env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
|
||||
gym.logger.setLevel(logging.WARN)
|
||||
|
||||
if evaluation and rank==0:
|
||||
eval_env = gym.make(env_id)
|
||||
|
@@ -4,6 +4,7 @@ from collections import deque
|
||||
import pickle
|
||||
|
||||
from baselines.ddpg.ddpg import DDPG
|
||||
from baselines.ddpg.util import mpi_mean, mpi_std, mpi_max, mpi_sum
|
||||
import baselines.common.tf_util as U
|
||||
|
||||
from baselines import logger
|
||||
@@ -34,7 +35,7 @@ def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, pa
|
||||
saver = tf.train.Saver()
|
||||
else:
|
||||
saver = None
|
||||
|
||||
|
||||
step = 0
|
||||
episode = 0
|
||||
eval_episode_rewards_history = deque(maxlen=100)
|
||||
@@ -109,7 +110,7 @@ def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, pa
|
||||
epoch_adaptive_distances = []
|
||||
for t_train in range(nb_train_steps):
|
||||
# Adapt param noise, if necessary.
|
||||
if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
|
||||
if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
|
||||
distance = agent.adapt_param_noise()
|
||||
epoch_adaptive_distances.append(distance)
|
||||
|
||||
@@ -137,46 +138,42 @@ def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, pa
|
||||
eval_episode_rewards_history.append(eval_episode_reward)
|
||||
eval_episode_reward = 0.
|
||||
|
||||
mpi_size = MPI.COMM_WORLD.Get_size()
|
||||
# Log stats.
|
||||
# XXX shouldn't call np.mean on variable length lists
|
||||
epoch_train_duration = time.time() - epoch_start_time
|
||||
duration = time.time() - start_time
|
||||
stats = agent.get_stats()
|
||||
combined_stats = stats.copy()
|
||||
combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
|
||||
combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
|
||||
combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
|
||||
combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
|
||||
combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
|
||||
combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
|
||||
combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
|
||||
combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
|
||||
combined_stats['total/duration'] = duration
|
||||
combined_stats['total/steps_per_second'] = float(t) / float(duration)
|
||||
combined_stats['total/episodes'] = episodes
|
||||
combined_stats['rollout/episodes'] = epoch_episodes
|
||||
combined_stats['rollout/actions_std'] = np.std(epoch_actions)
|
||||
combined_stats = {}
|
||||
for key in sorted(stats.keys()):
|
||||
combined_stats[key] = mpi_mean(stats[key])
|
||||
|
||||
# Rollout statistics.
|
||||
combined_stats['rollout/return'] = mpi_mean(epoch_episode_rewards)
|
||||
combined_stats['rollout/return_history'] = mpi_mean(np.mean(episode_rewards_history))
|
||||
combined_stats['rollout/episode_steps'] = mpi_mean(epoch_episode_steps)
|
||||
combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes)
|
||||
combined_stats['rollout/actions_mean'] = mpi_mean(epoch_actions)
|
||||
combined_stats['rollout/actions_std'] = mpi_std(epoch_actions)
|
||||
combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs)
|
||||
|
||||
# Train statistics.
|
||||
combined_stats['train/loss_actor'] = mpi_mean(epoch_actor_losses)
|
||||
combined_stats['train/loss_critic'] = mpi_mean(epoch_critic_losses)
|
||||
combined_stats['train/param_noise_distance'] = mpi_mean(epoch_adaptive_distances)
|
||||
|
||||
# Evaluation statistics.
|
||||
if eval_env is not None:
|
||||
combined_stats['eval/return'] = eval_episode_rewards
|
||||
combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
|
||||
combined_stats['eval/Q'] = eval_qs
|
||||
combined_stats['eval/episodes'] = len(eval_episode_rewards)
|
||||
def as_scalar(x):
|
||||
if isinstance(x, np.ndarray):
|
||||
assert x.size == 1
|
||||
return x[0]
|
||||
elif np.isscalar(x):
|
||||
return x
|
||||
else:
|
||||
raise ValueError('expected scalar, got %s'%x)
|
||||
combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()]))
|
||||
combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)}
|
||||
combined_stats['eval/return'] = mpi_mean(eval_episode_rewards)
|
||||
combined_stats['eval/return_history'] = mpi_mean(np.mean(eval_episode_rewards_history))
|
||||
combined_stats['eval/Q'] = mpi_mean(eval_qs)
|
||||
combined_stats['eval/episodes'] = mpi_mean(len(eval_episode_rewards))
|
||||
|
||||
# Total statistics.
|
||||
combined_stats['total/duration'] = mpi_mean(duration)
|
||||
combined_stats['total/steps_per_second'] = mpi_mean(float(t) / float(duration))
|
||||
combined_stats['total/episodes'] = mpi_mean(episodes)
|
||||
combined_stats['total/epochs'] = epoch + 1
|
||||
combined_stats['total/steps'] = t
|
||||
|
||||
|
||||
for key in sorted(combined_stats.keys()):
|
||||
logger.record_tabular(key, combined_stats[key])
|
||||
logger.dump_tabular()
|
||||
@@ -189,3 +186,4 @@ def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, pa
|
||||
if eval_env and hasattr(eval_env, 'get_state'):
|
||||
with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
|
||||
pickle.dump(eval_env.get_state(), f)
|
||||
|
||||
|
44
baselines/ddpg/util.py
Normal file
@@ -0,0 +1,44 @@
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from mpi4py import MPI
|
||||
from baselines.common.mpi_moments import mpi_moments
|
||||
|
||||
|
||||
def reduce_var(x, axis=None, keepdims=False):
|
||||
m = tf.reduce_mean(x, axis=axis, keep_dims=True)
|
||||
devs_squared = tf.square(x - m)
|
||||
return tf.reduce_mean(devs_squared, axis=axis, keep_dims=keepdims)
|
||||
|
||||
|
||||
def reduce_std(x, axis=None, keepdims=False):
|
||||
return tf.sqrt(reduce_var(x, axis=axis, keepdims=keepdims))
|
||||
|
||||
|
||||
def mpi_mean(value):
|
||||
if value == []:
|
||||
value = [0.]
|
||||
if not isinstance(value, list):
|
||||
value = [value]
|
||||
return mpi_moments(np.array(value))[0][0]
|
||||
|
||||
|
||||
def mpi_std(value):
|
||||
if value == []:
|
||||
value = [0.]
|
||||
if not isinstance(value, list):
|
||||
value = [value]
|
||||
return mpi_moments(np.array(value))[1][0]
|
||||
|
||||
|
||||
def mpi_max(value):
|
||||
global_max = np.zeros(1, dtype='float64')
|
||||
local_max = np.max(value).astype('float64')
|
||||
MPI.COMM_WORLD.Reduce(local_max, global_max, op=MPI.MAX)
|
||||
return global_max[0]
|
||||
|
||||
|
||||
def mpi_sum(value):
|
||||
global_sum = np.zeros(1, dtype='float64')
|
||||
local_sum = np.sum(np.array(value)).astype('float64')
|
||||
MPI.COMM_WORLD.Reduce(local_sum, global_sum, op=MPI.SUM)
|
||||
return global_sum[0]
|
@@ -97,37 +97,6 @@ import tensorflow as tf
|
||||
import baselines.common.tf_util as U
|
||||
|
||||
|
||||
def scope_vars(scope, trainable_only=False):
|
||||
"""
|
||||
Get variables inside a scope
|
||||
The scope can be specified as a string
|
||||
Parameters
|
||||
----------
|
||||
scope: str or VariableScope
|
||||
scope in which the variables reside.
|
||||
trainable_only: bool
|
||||
whether or not to return only the variables that were marked as trainable.
|
||||
Returns
|
||||
-------
|
||||
vars: [tf.Variable]
|
||||
list of variables in `scope`.
|
||||
"""
|
||||
return tf.get_collection(
|
||||
tf.GraphKeys.TRAINABLE_VARIABLES if trainable_only else tf.GraphKeys.GLOBAL_VARIABLES,
|
||||
scope=scope if isinstance(scope, str) else scope.name
|
||||
)
|
||||
|
||||
|
||||
def scope_name():
|
||||
"""Returns the name of current scope as a string, e.g. deepq/q_func"""
|
||||
return tf.get_variable_scope().name
|
||||
|
||||
|
||||
def absolute_scope_name(relative_scope_name):
|
||||
"""Appends parent scope name to `relative_scope_name`"""
|
||||
return scope_name() + "/" + relative_scope_name
|
||||
|
||||
|
||||
def default_param_noise_filter(var):
|
||||
if var not in tf.trainable_variables():
|
||||
# We never perturb non-trainable vars.
|
||||
@@ -174,7 +143,7 @@ def build_act(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None):
|
||||
` See the top of the file for details.
|
||||
"""
|
||||
with tf.variable_scope(scope, reuse=reuse):
|
||||
observations_ph = make_obs_ph("observation")
|
||||
observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
|
||||
stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
|
||||
update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")
|
||||
|
||||
@@ -190,12 +159,10 @@ def build_act(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None):
|
||||
|
||||
output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions)
|
||||
update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))
|
||||
_act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph],
|
||||
act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph],
|
||||
outputs=output_actions,
|
||||
givens={update_eps_ph: -1.0, stochastic_ph: True},
|
||||
updates=[update_eps_expr])
|
||||
def act(ob, stochastic=True, update_eps=-1):
|
||||
return _act(ob, stochastic, update_eps)
|
||||
return act
|
||||
|
||||
|
||||
@@ -236,7 +203,7 @@ def build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope="deepq",
|
||||
param_noise_filter_func = default_param_noise_filter
|
||||
|
||||
with tf.variable_scope(scope, reuse=reuse):
|
||||
observations_ph = make_obs_ph("observation")
|
||||
observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
|
||||
stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
|
||||
update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")
|
||||
update_param_noise_threshold_ph = tf.placeholder(tf.float32, (), name="update_param_noise_threshold")
|
||||
@@ -256,8 +223,8 @@ def build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope="deepq",
|
||||
# https://stackoverflow.com/questions/37063952/confused-by-the-behavior-of-tf-cond for
|
||||
# a more detailed discussion.
|
||||
def perturb_vars(original_scope, perturbed_scope):
|
||||
all_vars = scope_vars(absolute_scope_name(original_scope))
|
||||
all_perturbed_vars = scope_vars(absolute_scope_name(perturbed_scope))
|
||||
all_vars = U.scope_vars(U.absolute_scope_name("q_func"))
|
||||
all_perturbed_vars = U.scope_vars(U.absolute_scope_name("perturbed_q_func"))
|
||||
assert len(all_vars) == len(all_perturbed_vars)
|
||||
perturb_ops = []
|
||||
for var, perturbed_var in zip(all_vars, all_perturbed_vars):
|
||||
@@ -305,12 +272,10 @@ def build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope="deepq",
|
||||
tf.cond(update_param_noise_scale_ph, lambda: update_scale(), lambda: tf.Variable(0., trainable=False)),
|
||||
update_param_noise_threshold_expr,
|
||||
]
|
||||
_act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph, reset_ph, update_param_noise_threshold_ph, update_param_noise_scale_ph],
|
||||
act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph, reset_ph, update_param_noise_threshold_ph, update_param_noise_scale_ph],
|
||||
outputs=output_actions,
|
||||
givens={update_eps_ph: -1.0, stochastic_ph: True, reset_ph: False, update_param_noise_threshold_ph: False, update_param_noise_scale_ph: False},
|
||||
updates=updates)
|
||||
def act(ob, reset, update_param_noise_threshold, update_param_noise_scale, stochastic=True, update_eps=-1):
|
||||
return _act(ob, stochastic, update_eps, reset, update_param_noise_threshold, update_param_noise_scale)
|
||||
return act
|
||||
|
||||
|
||||
@@ -377,20 +342,20 @@ def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=
|
||||
|
||||
with tf.variable_scope(scope, reuse=reuse):
|
||||
# set up placeholders
|
||||
obs_t_input = make_obs_ph("obs_t")
|
||||
obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
|
||||
act_t_ph = tf.placeholder(tf.int32, [None], name="action")
|
||||
rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
|
||||
obs_tp1_input = make_obs_ph("obs_tp1")
|
||||
obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
|
||||
done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
|
||||
importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")
|
||||
|
||||
# q network evaluation
|
||||
q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act
|
||||
q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func")
|
||||
q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))
|
||||
|
||||
# target q network evaluation
|
||||
q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
|
||||
target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func")
|
||||
target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))
|
||||
|
||||
# q scores for actions which we know were selected in the given state.
|
||||
q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)
|
||||
@@ -398,7 +363,7 @@ def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=
|
||||
# compute estimate of best possible value starting from state at t + 1
|
||||
if double_q:
|
||||
q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True)
|
||||
q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
|
||||
q_tp1_best_using_online_net = tf.arg_max(q_tp1_using_online_net, 1)
|
||||
q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
|
||||
else:
|
||||
q_tp1_best = tf.reduce_max(q_tp1, 1)
|
||||
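The branch above is the double Q-learning target: the online network picks the next action and the target network evaluates it, instead of taking the target network's own maximum. A numpy sketch of the difference, with made-up Q-values:

import numpy as np

q_tp1_online = np.array([[1.0, 3.0], [2.0, 0.5]])      # online net, batch of 2 states
q_tp1_target = np.array([[0.9, 2.5], [0.7, 1.8]])      # target net, same states
best_online = q_tp1_online.argmax(axis=1)               # actions chosen by the online net -> [1, 0]
double_q = q_tp1_target[np.arange(2), best_online]      # double DQN estimate -> [2.5, 0.7]
plain_max = q_tp1_target.max(axis=1)                    # vanilla DQN estimate -> [2.5, 1.8]
assert not np.allclose(double_q, plain_max)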
@@ -414,11 +379,10 @@ def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=
|
||||
|
||||
# compute optimization op (potentially with gradient clipping)
|
||||
if grad_norm_clipping is not None:
|
||||
gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars)
|
||||
for i, (grad, var) in enumerate(gradients):
|
||||
if grad is not None:
|
||||
gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)
|
||||
optimize_expr = optimizer.apply_gradients(gradients)
|
||||
optimize_expr = U.minimize_and_clip(optimizer,
|
||||
weighted_error,
|
||||
var_list=q_func_vars,
|
||||
clip_val=grad_norm_clipping)
|
||||
else:
|
||||
optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)
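As a reading aid for the hunk above: with `double_q` enabled, the bootstrap action is selected by the online network and evaluated by the target network, otherwise the target network's maximum is used directly. Writing $Q_{\theta}$ for `q_func` and $Q_{\theta^-}$ for `target_q_func`:

```latex
a^{*} = \arg\max_{a} Q_{\theta}(s_{t+1}, a), \qquad
\hat{q}_{t+1} =
\begin{cases}
Q_{\theta^{-}}(s_{t+1}, a^{*}) & \text{double Q} \\
\max_{a} Q_{\theta^{-}}(s_{t+1}, a) & \text{otherwise}
\end{cases}
```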
|
||||
|
||||
|
51
baselines/deepq/experiments/atari/download_model.py
Normal file
@@ -0,0 +1,51 @@
|
||||
import argparse
|
||||
import progressbar
|
||||
|
||||
from baselines.common.azure_utils import Container
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser("Download a pretrained model from Azure.")
|
||||
# Environment
|
||||
parser.add_argument("--model-dir", type=str, default=None,
|
||||
help="save model in this directory this directory. ")
|
||||
parser.add_argument("--account-name", type=str, default="openaisciszymon",
|
||||
help="account name for Azure Blob Storage")
|
||||
parser.add_argument("--account-key", type=str, default=None,
|
||||
help="account key for Azure Blob Storage")
|
||||
parser.add_argument("--container", type=str, default="dqn-blogpost",
|
||||
help="container name and blob name separated by colon serparated by colon")
|
||||
parser.add_argument("--blob", type=str, default=None, help="blob with the model")
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
c = Container(account_name=args.account_name,
|
||||
account_key=args.account_key,
|
||||
container_name=args.container)
|
||||
|
||||
if args.blob is None:
|
||||
print("Listing available models:")
|
||||
print()
|
||||
for blob in sorted(c.list(prefix="model-")):
|
||||
print(blob)
|
||||
else:
|
||||
print("Downloading {} to {}...".format(args.blob, args.model_dir))
|
||||
bar = None
|
||||
|
||||
def callback(current, total):
|
||||
nonlocal bar
|
||||
if bar is None:
|
||||
bar = progressbar.ProgressBar(max_value=total)
|
||||
bar.update(current)
|
||||
|
||||
assert c.exists(args.blob), "model {} does not exist".format(args.blob)
|
||||
|
||||
assert args.model_dir is not None
|
||||
|
||||
c.get(args.model_dir, args.blob, callback=callback)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
70
baselines/deepq/experiments/atari/enjoy.py
Normal file
@@ -0,0 +1,70 @@
|
||||
import argparse
|
||||
import gym
|
||||
import os
|
||||
import numpy as np
|
||||
|
||||
from gym.monitoring import VideoRecorder
|
||||
|
||||
import baselines.common.tf_util as U
|
||||
|
||||
from baselines import deepq
|
||||
from baselines.common.misc_util import (
|
||||
boolean_flag,
|
||||
)
|
||||
from baselines import bench
|
||||
from baselines.common.atari_wrappers_deprecated import wrap_dqn
|
||||
from baselines.deepq.experiments.atari.model import model, dueling_model
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser("Run an already learned DQN model.")
|
||||
# Environment
|
||||
parser.add_argument("--env", type=str, required=True, help="name of the game")
|
||||
parser.add_argument("--model-dir", type=str, default=None, help="load model from this directory. ")
|
||||
parser.add_argument("--video", type=str, default=None, help="Path to mp4 file where the video of first episode will be recorded.")
|
||||
boolean_flag(parser, "stochastic", default=True, help="whether or not to use stochastic actions according to models eps value")
|
||||
boolean_flag(parser, "dueling", default=False, help="whether or not to use dueling model")
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def make_env(game_name):
|
||||
env = gym.make(game_name + "NoFrameskip-v4")
|
||||
env = bench.Monitor(env, None)
|
||||
env = wrap_dqn(env)
|
||||
return env
|
||||
|
||||
|
||||
def play(env, act, stochastic, video_path):
|
||||
num_episodes = 0
|
||||
video_recorder = None
|
||||
video_recorder = VideoRecorder(
|
||||
env, video_path, enabled=video_path is not None)
|
||||
obs = env.reset()
|
||||
while True:
|
||||
env.unwrapped.render()
|
||||
video_recorder.capture_frame()
|
||||
action = act(np.array(obs)[None], stochastic=stochastic)[0]
|
||||
obs, rew, done, info = env.step(action)
|
||||
if done:
|
||||
obs = env.reset()
|
||||
if len(info["rewards"]) > num_episodes:
|
||||
if len(info["rewards"]) == 1 and video_recorder.enabled:
|
||||
# save video of first episode
|
||||
print("Saved video.")
|
||||
video_recorder.close()
|
||||
video_recorder.enabled = False
|
||||
print(info["rewards"][-1])
|
||||
num_episodes = len(info["rewards"])
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
with U.make_session(4) as sess:
|
||||
args = parse_args()
|
||||
env = make_env(args.env)
|
||||
act = deepq.build_act(
|
||||
make_obs_ph=lambda name: U.Uint8Input(env.observation_space.shape, name=name),
|
||||
q_func=dueling_model if args.dueling else model,
|
||||
num_actions=env.action_space.n)
|
||||
U.load_state(os.path.join(args.model_dir, "saved"))
|
||||
play(env, act, args.stochastic, args.video)
|
60
baselines/deepq/experiments/atari/model.py
Normal file
@@ -0,0 +1,60 @@
|
||||
import tensorflow as tf
|
||||
import tensorflow.contrib.layers as layers
|
||||
|
||||
|
||||
def layer_norm_fn(x, relu=True):
|
||||
x = layers.layer_norm(x, scale=True, center=True)
|
||||
if relu:
|
||||
x = tf.nn.relu(x)
|
||||
return x
|
||||
|
||||
|
||||
def model(img_in, num_actions, scope, reuse=False, layer_norm=False):
|
||||
"""As described in https://storage.googleapis.com/deepmind-data/assets/papers/DeepMindNature14236Paper.pdf"""
|
||||
with tf.variable_scope(scope, reuse=reuse):
|
||||
out = img_in
|
||||
with tf.variable_scope("convnet"):
|
||||
# original architecture
|
||||
out = layers.convolution2d(out, num_outputs=32, kernel_size=8, stride=4, activation_fn=tf.nn.relu)
|
||||
out = layers.convolution2d(out, num_outputs=64, kernel_size=4, stride=2, activation_fn=tf.nn.relu)
|
||||
out = layers.convolution2d(out, num_outputs=64, kernel_size=3, stride=1, activation_fn=tf.nn.relu)
|
||||
conv_out = layers.flatten(out)
|
||||
|
||||
with tf.variable_scope("action_value"):
|
||||
value_out = layers.fully_connected(conv_out, num_outputs=512, activation_fn=None)
|
||||
if layer_norm:
|
||||
value_out = layer_norm_fn(value_out, relu=True)
|
||||
else:
|
||||
value_out = tf.nn.relu(value_out)
|
||||
value_out = layers.fully_connected(value_out, num_outputs=num_actions, activation_fn=None)
|
||||
return value_out
|
||||
|
||||
|
||||
def dueling_model(img_in, num_actions, scope, reuse=False, layer_norm=False):
|
||||
"""As described in https://arxiv.org/abs/1511.06581"""
|
||||
with tf.variable_scope(scope, reuse=reuse):
|
||||
out = img_in
|
||||
with tf.variable_scope("convnet"):
|
||||
# original architecture
|
||||
out = layers.convolution2d(out, num_outputs=32, kernel_size=8, stride=4, activation_fn=tf.nn.relu)
|
||||
out = layers.convolution2d(out, num_outputs=64, kernel_size=4, stride=2, activation_fn=tf.nn.relu)
|
||||
out = layers.convolution2d(out, num_outputs=64, kernel_size=3, stride=1, activation_fn=tf.nn.relu)
|
||||
conv_out = layers.flatten(out)
|
||||
|
||||
with tf.variable_scope("state_value"):
|
||||
state_hidden = layers.fully_connected(conv_out, num_outputs=512, activation_fn=None)
|
||||
if layer_norm:
|
||||
state_hidden = layer_norm_fn(state_hidden, relu=True)
|
||||
else:
|
||||
state_hidden = tf.nn.relu(state_hidden)
|
||||
state_score = layers.fully_connected(state_hidden, num_outputs=1, activation_fn=None)
|
||||
with tf.variable_scope("action_value"):
|
||||
actions_hidden = layers.fully_connected(conv_out, num_outputs=512, activation_fn=None)
|
||||
if layer_norm:
|
||||
actions_hidden = layer_norm_fn(actions_hidden, relu=True)
|
||||
else:
|
||||
actions_hidden = tf.nn.relu(actions_hidden)
|
||||
action_scores = layers.fully_connected(actions_hidden, num_outputs=num_actions, activation_fn=None)
|
||||
action_scores_mean = tf.reduce_mean(action_scores, 1)
|
||||
action_scores = action_scores - tf.expand_dims(action_scores_mean, 1)
|
||||
return state_score + action_scores
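The mean-subtracted combination at the end of `dueling_model` above is the standard aggregation from the dueling-architecture paper (https://arxiv.org/abs/1511.06581); written out, with $V$ = `state_score` and $A$ = `action_scores` before centering:

```latex
Q(s, a) = V(s) + \Big( A(s, a) - \frac{1}{|\mathcal{A}|} \sum_{a'} A(s, a') \Big)
```

Subtracting the mean advantage keeps the value/advantage split identifiable without changing which action is greedy.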
|
273
baselines/deepq/experiments/atari/train.py
Normal file
@@ -0,0 +1,273 @@
|
||||
import argparse
|
||||
import gym
|
||||
import numpy as np
|
||||
import os
|
||||
import tensorflow as tf
|
||||
import tempfile
|
||||
import time
|
||||
import json
|
||||
|
||||
import baselines.common.tf_util as U
|
||||
|
||||
from baselines import logger
|
||||
from baselines import deepq
|
||||
from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer
|
||||
from baselines.common.misc_util import (
|
||||
boolean_flag,
|
||||
pickle_load,
|
||||
pretty_eta,
|
||||
relatively_safe_pickle_dump,
|
||||
set_global_seeds,
|
||||
RunningAvg,
|
||||
)
|
||||
from baselines.common.schedules import LinearSchedule, PiecewiseSchedule
|
||||
from baselines import bench
|
||||
from baselines.common.atari_wrappers_deprecated import wrap_dqn
|
||||
from baselines.common.azure_utils import Container
|
||||
from .model import model, dueling_model
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser("DQN experiments for Atari games")
|
||||
# Environment
|
||||
parser.add_argument("--env", type=str, default="Pong", help="name of the game")
|
||||
parser.add_argument("--seed", type=int, default=42, help="which seed to use")
|
||||
# Core DQN parameters
|
||||
parser.add_argument("--replay-buffer-size", type=int, default=int(1e6), help="replay buffer size")
|
||||
parser.add_argument("--lr", type=float, default=1e-4, help="learning rate for Adam optimizer")
|
||||
parser.add_argument("--num-steps", type=int, default=int(2e8), help="total number of steps to run the environment for")
|
||||
parser.add_argument("--batch-size", type=int, default=32, help="number of transitions to optimize at the same time")
|
||||
parser.add_argument("--learning-freq", type=int, default=4, help="number of iterations between every optimization step")
|
||||
parser.add_argument("--target-update-freq", type=int, default=40000, help="number of iterations between every target network update")
|
||||
parser.add_argument("--param-noise-update-freq", type=int, default=50, help="number of iterations between every re-scaling of the parameter noise")
|
||||
parser.add_argument("--param-noise-reset-freq", type=int, default=10000, help="maximum number of steps to take per episode before re-perturbing the exploration policy")
|
||||
# Bells and whistles
|
||||
boolean_flag(parser, "double-q", default=True, help="whether or not to use double q learning")
|
||||
boolean_flag(parser, "dueling", default=False, help="whether or not to use dueling model")
|
||||
boolean_flag(parser, "prioritized", default=False, help="whether or not to use prioritized replay buffer")
|
||||
boolean_flag(parser, "param-noise", default=False, help="whether or not to use parameter space noise for exploration")
|
||||
boolean_flag(parser, "layer-norm", default=False, help="whether or not to use layer norm (should be True if param_noise is used)")
|
||||
boolean_flag(parser, "gym-monitor", default=False, help="whether or not to use a OpenAI Gym monitor (results in slower training due to video recording)")
|
||||
parser.add_argument("--prioritized-alpha", type=float, default=0.6, help="alpha parameter for prioritized replay buffer")
|
||||
parser.add_argument("--prioritized-beta0", type=float, default=0.4, help="initial value of beta parameters for prioritized replay")
|
||||
parser.add_argument("--prioritized-eps", type=float, default=1e-6, help="eps parameter for prioritized replay buffer")
|
||||
# Checkpointing
|
||||
parser.add_argument("--save-dir", type=str, default=None, help="directory in which training state and model should be saved.")
|
||||
parser.add_argument("--save-azure-container", type=str, default=None,
|
||||
help="It present data will saved/loaded from Azure. Should be in format ACCOUNT_NAME:ACCOUNT_KEY:CONTAINER")
|
||||
parser.add_argument("--save-freq", type=int, default=1e6, help="save model once every time this many iterations are completed")
|
||||
boolean_flag(parser, "load-on-start", default=True, help="if true and model was previously saved then training will be resumed")
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def make_env(game_name):
|
||||
env = gym.make(game_name + "NoFrameskip-v4")
|
||||
monitored_env = bench.Monitor(env, logger.get_dir()) # puts rewards and number of steps in info, before environment is wrapped
|
||||
env = wrap_dqn(monitored_env)  # applies a bunch of modifications to simplify the observation space (downsample, make b/w)
|
||||
return env, monitored_env
|
||||
|
||||
|
||||
def maybe_save_model(savedir, container, state):
|
||||
"""This function checkpoints the model and state of the training algorithm."""
|
||||
if savedir is None:
|
||||
return
|
||||
start_time = time.time()
|
||||
model_dir = "model-{}".format(state["num_iters"])
|
||||
U.save_state(os.path.join(savedir, model_dir, "saved"))
|
||||
if container is not None:
|
||||
container.put(os.path.join(savedir, model_dir), model_dir)
|
||||
relatively_safe_pickle_dump(state, os.path.join(savedir, 'training_state.pkl.zip'), compression=True)
|
||||
if container is not None:
|
||||
container.put(os.path.join(savedir, 'training_state.pkl.zip'), 'training_state.pkl.zip')
|
||||
relatively_safe_pickle_dump(state["monitor_state"], os.path.join(savedir, 'monitor_state.pkl'))
|
||||
if container is not None:
|
||||
container.put(os.path.join(savedir, 'monitor_state.pkl'), 'monitor_state.pkl')
|
||||
logger.log("Saved model in {} seconds\n".format(time.time() - start_time))
|
||||
|
||||
|
||||
def maybe_load_model(savedir, container):
|
||||
"""Load model if present at the specified path."""
|
||||
if savedir is None:
|
||||
return
|
||||
|
||||
state_path = os.path.join(os.path.join(savedir, 'training_state.pkl.zip'))
|
||||
if container is not None:
|
||||
logger.log("Attempting to download model from Azure")
|
||||
found_model = container.get(savedir, 'training_state.pkl.zip')
|
||||
else:
|
||||
found_model = os.path.exists(state_path)
|
||||
if found_model:
|
||||
state = pickle_load(state_path, compression=True)
|
||||
model_dir = "model-{}".format(state["num_iters"])
|
||||
if container is not None:
|
||||
container.get(savedir, model_dir)
|
||||
U.load_state(os.path.join(savedir, model_dir, "saved"))
|
||||
logger.log("Loaded models checkpoint at {} iterations".format(state["num_iters"]))
|
||||
return state
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = parse_args()
|
||||
|
||||
# Parse savedir and azure container.
|
||||
savedir = args.save_dir
|
||||
if savedir is None:
|
||||
savedir = os.getenv('OPENAI_LOGDIR', None)
|
||||
if args.save_azure_container is not None:
|
||||
account_name, account_key, container_name = args.save_azure_container.split(":")
|
||||
container = Container(account_name=account_name,
|
||||
account_key=account_key,
|
||||
container_name=container_name,
|
||||
maybe_create=True)
|
||||
if savedir is None:
|
||||
# Careful! This will not get cleaned up. Docker spoils the developers.
|
||||
savedir = tempfile.TemporaryDirectory().name
|
||||
else:
|
||||
container = None
|
||||
# Create and seed the env.
|
||||
env, monitored_env = make_env(args.env)
|
||||
if args.seed > 0:
|
||||
set_global_seeds(args.seed)
|
||||
env.unwrapped.seed(args.seed)
|
||||
|
||||
if args.gym_monitor and savedir:
|
||||
env = gym.wrappers.Monitor(env, os.path.join(savedir, 'gym_monitor'), force=True)
|
||||
|
||||
if savedir:
|
||||
with open(os.path.join(savedir, 'args.json'), 'w') as f:
|
||||
json.dump(vars(args), f)
|
||||
|
||||
with U.make_session(4) as sess:
|
||||
# Create training graph and replay buffer
|
||||
def model_wrapper(img_in, num_actions, scope, **kwargs):
|
||||
actual_model = dueling_model if args.dueling else model
|
||||
return actual_model(img_in, num_actions, scope, layer_norm=args.layer_norm, **kwargs)
|
||||
act, train, update_target, debug = deepq.build_train(
|
||||
make_obs_ph=lambda name: U.Uint8Input(env.observation_space.shape, name=name),
|
||||
q_func=model_wrapper,
|
||||
num_actions=env.action_space.n,
|
||||
optimizer=tf.train.AdamOptimizer(learning_rate=args.lr, epsilon=1e-4),
|
||||
gamma=0.99,
|
||||
grad_norm_clipping=10,
|
||||
double_q=args.double_q,
|
||||
param_noise=args.param_noise
|
||||
)
|
||||
|
||||
approximate_num_iters = args.num_steps / 4
|
||||
exploration = PiecewiseSchedule([
|
||||
(0, 1.0),
|
||||
(approximate_num_iters / 50, 0.1),
|
||||
(approximate_num_iters / 5, 0.01)
|
||||
], outside_value=0.01)
|
||||
|
||||
if args.prioritized:
|
||||
replay_buffer = PrioritizedReplayBuffer(args.replay_buffer_size, args.prioritized_alpha)
|
||||
beta_schedule = LinearSchedule(approximate_num_iters, initial_p=args.prioritized_beta0, final_p=1.0)
|
||||
else:
|
||||
replay_buffer = ReplayBuffer(args.replay_buffer_size)
|
||||
|
||||
U.initialize()
|
||||
update_target()
|
||||
num_iters = 0
|
||||
|
||||
# Load the model
|
||||
state = maybe_load_model(savedir, container)
|
||||
if state is not None:
|
||||
num_iters, replay_buffer = state["num_iters"], state["replay_buffer"],
|
||||
monitored_env.set_state(state["monitor_state"])
|
||||
|
||||
start_time, start_steps = None, None
|
||||
steps_per_iter = RunningAvg(0.999)
|
||||
iteration_time_est = RunningAvg(0.999)
|
||||
obs = env.reset()
|
||||
num_iters_since_reset = 0
|
||||
reset = True
|
||||
|
||||
# Main training loop
|
||||
while True:
|
||||
num_iters += 1
|
||||
num_iters_since_reset += 1
|
||||
|
||||
# Take action and store transition in the replay buffer.
|
||||
kwargs = {}
|
||||
if not args.param_noise:
|
||||
update_eps = exploration.value(num_iters)
|
||||
update_param_noise_threshold = 0.
|
||||
else:
|
||||
if args.param_noise_reset_freq > 0 and num_iters_since_reset > args.param_noise_reset_freq:
|
||||
# Reset param noise policy since we have exceeded the maximum number of steps without a reset.
|
||||
reset = True
|
||||
|
||||
update_eps = 0.01 # ensures that we cannot get stuck completely
|
||||
# Compute the threshold such that the KL divergence between perturbed and non-perturbed
|
||||
# policy is comparable to eps-greedy exploration with eps = exploration.value(t).
|
||||
# See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
|
||||
# for detailed explanation.
|
||||
update_param_noise_threshold = -np.log(1. - exploration.value(num_iters) + exploration.value(num_iters) / float(env.action_space.n))
|
||||
kwargs['reset'] = reset
|
||||
kwargs['update_param_noise_threshold'] = update_param_noise_threshold
|
||||
kwargs['update_param_noise_scale'] = (num_iters % args.param_noise_update_freq == 0)
|
||||
|
||||
action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
|
||||
reset = False
|
||||
new_obs, rew, done, info = env.step(action)
|
||||
replay_buffer.add(obs, action, rew, new_obs, float(done))
|
||||
obs = new_obs
|
||||
if done:
|
||||
num_iters_since_reset = 0
|
||||
obs = env.reset()
|
||||
reset = True
|
||||
|
||||
if (num_iters > max(5 * args.batch_size, args.replay_buffer_size // 20) and
|
||||
num_iters % args.learning_freq == 0):
|
||||
# Sample a bunch of transitions from replay buffer
|
||||
if args.prioritized:
|
||||
experience = replay_buffer.sample(args.batch_size, beta=beta_schedule.value(num_iters))
|
||||
(obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
|
||||
else:
|
||||
obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(args.batch_size)
|
||||
weights = np.ones_like(rewards)
|
||||
# Minimize the error in Bellman's equation and compute TD-error
|
||||
td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
|
||||
# Update the priorities in the replay buffer
|
||||
if args.prioritized:
|
||||
new_priorities = np.abs(td_errors) + args.prioritized_eps
|
||||
replay_buffer.update_priorities(batch_idxes, new_priorities)
|
||||
# Update target network.
|
||||
if num_iters % args.target_update_freq == 0:
|
||||
update_target()
|
||||
|
||||
if start_time is not None:
|
||||
steps_per_iter.update(info['steps'] - start_steps)
|
||||
iteration_time_est.update(time.time() - start_time)
|
||||
start_time, start_steps = time.time(), info["steps"]
|
||||
|
||||
# Save the model and training state.
|
||||
if num_iters > 0 and (num_iters % args.save_freq == 0 or info["steps"] > args.num_steps):
|
||||
maybe_save_model(savedir, container, {
|
||||
'replay_buffer': replay_buffer,
|
||||
'num_iters': num_iters,
|
||||
'monitor_state': monitored_env.get_state(),
|
||||
})
|
||||
|
||||
if info["steps"] > args.num_steps:
|
||||
break
|
||||
|
||||
if done:
|
||||
steps_left = args.num_steps - info["steps"]
|
||||
completion = np.round(info["steps"] / args.num_steps, 1)
|
||||
|
||||
logger.record_tabular("% completion", completion)
|
||||
logger.record_tabular("steps", info["steps"])
|
||||
logger.record_tabular("iters", num_iters)
|
||||
logger.record_tabular("episodes", len(info["rewards"]))
|
||||
logger.record_tabular("reward (100 epi mean)", np.mean(info["rewards"][-100:]))
|
||||
logger.record_tabular("exploration", exploration.value(num_iters))
|
||||
if args.prioritized:
|
||||
logger.record_tabular("max priority", replay_buffer._max_priority)
|
||||
fps_estimate = (float(steps_per_iter) / (float(iteration_time_est) + 1e-6)
|
||||
if steps_per_iter._value is not None else "calculating...")
|
||||
logger.dump_tabular()
|
||||
logger.log()
|
||||
logger.log("ETA: " + pretty_eta(int(steps_left / fps_estimate)))
|
||||
logger.log()
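For reference, the parameter-noise threshold computed in the training loop above (see the comment pointing to Appendix C.1 of Plappert et al., 2017) is, writing $\varepsilon$ for `exploration.value(num_iters)` and $|\mathcal{A}|$ for `env.action_space.n`:

```latex
\delta = -\log\!\Big(1 - \varepsilon + \frac{\varepsilon}{|\mathcal{A}|}\Big)
```

The noise scale is then grown or shrunk (every `--param-noise-update-freq` iterations) until the perturbed policy differs from the unperturbed one by roughly as much as $\varepsilon$-greedy exploration would.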
|
81
baselines/deepq/experiments/atari/wang2015_eval.py
Normal file
@@ -0,0 +1,81 @@
|
||||
import argparse
|
||||
import gym
|
||||
import numpy as np
|
||||
import os
|
||||
|
||||
import baselines.common.tf_util as U
|
||||
|
||||
from baselines import deepq, bench
|
||||
from baselines.common.misc_util import get_wrapper_by_name, boolean_flag, set_global_seeds
|
||||
from baselines.common.atari_wrappers_deprecated import wrap_dqn
|
||||
from baselines.deepq.experiments.atari.model import model, dueling_model
|
||||
|
||||
|
||||
def make_env(game_name):
|
||||
env = gym.make(game_name + "NoFrameskip-v4")
|
||||
env_monitored = bench.Monitor(env, None)
|
||||
env = wrap_dqn(env_monitored)
|
||||
return env_monitored, env
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser("Evaluate an already learned DQN model.")
|
||||
# Environment
|
||||
parser.add_argument("--env", type=str, required=True, help="name of the game")
|
||||
parser.add_argument("--model-dir", type=str, default=None, help="load model from this directory. ")
|
||||
boolean_flag(parser, "stochastic", default=True, help="whether or not to use stochastic actions according to models eps value")
|
||||
boolean_flag(parser, "dueling", default=False, help="whether or not to use dueling model")
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def wang2015_eval(game_name, act, stochastic):
|
||||
print("==================== wang2015 evaluation ====================")
|
||||
episode_rewards = []
|
||||
|
||||
for num_noops in range(1, 31):
|
||||
env_monitored, eval_env = make_env(game_name)
|
||||
eval_env.unwrapped.seed(1)
|
||||
|
||||
get_wrapper_by_name(eval_env, "NoopResetEnv").override_num_noops = num_noops
|
||||
|
||||
eval_episode_steps = 0
|
||||
done = True
|
||||
while True:
|
||||
if done:
|
||||
obs = eval_env.reset()
|
||||
eval_episode_steps += 1
|
||||
action = act(np.array(obs)[None], stochastic=stochastic)[0]
|
||||
|
||||
obs, _reward, done, info = eval_env.step(action)
|
||||
if done:
|
||||
obs = eval_env.reset()
|
||||
if len(info["rewards"]) > 0:
|
||||
episode_rewards.append(info["rewards"][0])
|
||||
break
|
||||
if info["steps"] > 108000: # 5 minutes of gameplay
|
||||
episode_rewards.append(sum(env_monitored.rewards))
|
||||
break
|
||||
print("Num steps in episode {} was {} yielding {} reward".format(
|
||||
num_noops, eval_episode_steps, episode_rewards[-1]), flush=True)
|
||||
print("Evaluation results: " + str(np.mean(episode_rewards)))
|
||||
print("=============================================================")
|
||||
return np.mean(episode_rewards)
|
||||
|
||||
|
||||
def main():
|
||||
set_global_seeds(1)
|
||||
args = parse_args()
|
||||
with U.make_session(4): # noqa
|
||||
_, env = make_env(args.env)
|
||||
act = deepq.build_act(
|
||||
make_obs_ph=lambda name: U.Uint8Input(env.observation_space.shape, name=name),
|
||||
q_func=dueling_model if args.dueling else model,
|
||||
num_actions=env.action_space.n)
|
||||
|
||||
U.load_state(os.path.join(args.model_dir, "saved"))
|
||||
wang2015_eval(args.env, act, stochastic=args.stochastic)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
@@ -9,7 +9,6 @@ import baselines.common.tf_util as U
|
||||
from baselines import logger
|
||||
from baselines import deepq
|
||||
from baselines.deepq.replay_buffer import ReplayBuffer
|
||||
from baselines.deepq.utils import BatchInput
|
||||
from baselines.common.schedules import LinearSchedule
|
||||
|
||||
|
||||
@@ -28,7 +27,7 @@ if __name__ == '__main__':
|
||||
env = gym.make("CartPole-v0")
|
||||
# Create all the functions necessary to train the model
|
||||
act, train, update_target, debug = deepq.build_train(
|
||||
make_obs_ph=lambda name: BatchInput(env.observation_space.shape, name=name),
|
||||
make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape, name=name),
|
||||
q_func=model,
|
||||
num_actions=env.action_space.n,
|
||||
optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
|
||||
|
@@ -1,3 +1,5 @@
|
||||
import gym
|
||||
|
||||
from baselines import deepq
|
||||
from baselines.common import set_global_seeds
|
||||
from baselines import bench
|
||||
|
@@ -3,7 +3,7 @@ import gym
|
||||
from baselines import deepq
|
||||
|
||||
|
||||
def callback(lcl, _glb):
|
||||
def callback(lcl, glb):
|
||||
# stop training if reward exceeds 199
|
||||
is_solved = lcl['t'] > 100 and sum(lcl['episode_rewards'][-101:-1]) / 100 >= 199
|
||||
return is_solved
|
||||
|
@@ -6,7 +6,7 @@ from baselines.common.segment_tree import SumSegmentTree, MinSegmentTree
|
||||
|
||||
class ReplayBuffer(object):
|
||||
def __init__(self, size):
|
||||
"""Create Replay buffer.
|
||||
"""Create Prioritized Replay buffer.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
@@ -12,7 +12,6 @@ from baselines import logger
|
||||
from baselines.common.schedules import LinearSchedule
|
||||
from baselines import deepq
|
||||
from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer
|
||||
from baselines.deepq.utils import BatchInput, load_state, save_state
|
||||
|
||||
|
||||
class ActWrapper(object):
|
||||
@@ -33,7 +32,7 @@ class ActWrapper(object):
|
||||
f.write(model_data)
|
||||
|
||||
zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td)
|
||||
load_state(os.path.join(td, "model"))
|
||||
U.load_state(os.path.join(td, "model"))
|
||||
|
||||
return ActWrapper(act, act_params)
|
||||
|
||||
@@ -46,7 +45,7 @@ class ActWrapper(object):
|
||||
path = os.path.join(logger.get_dir(), "model.pkl")
|
||||
|
||||
with tempfile.TemporaryDirectory() as td:
|
||||
save_state(os.path.join(td, "model"))
|
||||
U.save_state(os.path.join(td, "model"))
|
||||
arc_name = os.path.join(td, "packed.zip")
|
||||
with zipfile.ZipFile(arc_name, 'w') as zipf:
|
||||
for root, dirs, files in os.walk(td):
|
||||
@@ -172,7 +171,7 @@ def learn(env,
|
||||
# by cloudpickle when serializing make_obs_ph
|
||||
observation_space_shape = env.observation_space.shape
|
||||
def make_obs_ph(name):
|
||||
return BatchInput(observation_space_shape, name=name)
|
||||
return U.BatchInput(observation_space_shape, name=name)
|
||||
|
||||
act, train, update_target, debug = deepq.build_train(
|
||||
make_obs_ph=make_obs_ph,
|
||||
@@ -239,7 +238,11 @@ def learn(env,
|
||||
kwargs['update_param_noise_threshold'] = update_param_noise_threshold
|
||||
kwargs['update_param_noise_scale'] = True
|
||||
action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
|
||||
env_action = action
|
||||
if isinstance(env.action_space, gym.spaces.MultiBinary):
|
||||
env_action = np.zeros(env.action_space.n)
|
||||
env_action[action] = 1
|
||||
else:
|
||||
env_action = action
|
||||
reset = False
|
||||
new_obs, rew, done, _ = env.step(env_action)
|
||||
# Store transition in the replay buffer.
|
||||
@@ -284,12 +287,12 @@ def learn(env,
|
||||
if print_freq is not None:
|
||||
logger.log("Saving model due to mean reward increase: {} -> {}".format(
|
||||
saved_mean_reward, mean_100ep_reward))
|
||||
save_state(model_file)
|
||||
U.save_state(model_file)
|
||||
model_saved = True
|
||||
saved_mean_reward = mean_100ep_reward
|
||||
if model_saved:
|
||||
if print_freq is not None:
|
||||
logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
|
||||
load_state(model_file)
|
||||
U.load_state(model_file)
|
||||
|
||||
return act
|
||||
|
@@ -1,88 +0,0 @@
|
||||
import os
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
# ================================================================
|
||||
# Saving variables
|
||||
# ================================================================
|
||||
|
||||
def load_state(fname):
|
||||
saver = tf.train.Saver()
|
||||
saver.restore(tf.get_default_session(), fname)
|
||||
|
||||
def save_state(fname):
|
||||
os.makedirs(os.path.dirname(fname), exist_ok=True)
|
||||
saver = tf.train.Saver()
|
||||
saver.save(tf.get_default_session(), fname)
|
||||
|
||||
# ================================================================
|
||||
# Placeholders
|
||||
# ================================================================
|
||||
|
||||
class TfInput(object):
|
||||
def __init__(self, name="(unnamed)"):
|
||||
"""Generalized Tensorflow placeholder. The main differences are:
|
||||
- possibly uses multiple placeholders internally and returns multiple values
|
||||
- can apply light postprocessing to the value fed to the placeholder.
|
||||
"""
|
||||
self.name = name
|
||||
|
||||
def get(self):
|
||||
"""Return the tf variable(s) representing the possibly postprocessed value
|
||||
of placeholder(s).
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def make_feed_dict(data):
|
||||
"""Given data input it to the placeholder(s)."""
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
class PlaceholderTfInput(TfInput):
|
||||
def __init__(self, placeholder):
|
||||
"""Wrapper for regular tensorflow placeholder."""
|
||||
super().__init__(placeholder.name)
|
||||
self._placeholder = placeholder
|
||||
|
||||
def get(self):
|
||||
return self._placeholder
|
||||
|
||||
def make_feed_dict(self, data):
|
||||
return {self._placeholder: data}
|
||||
|
||||
class BatchInput(PlaceholderTfInput):
|
||||
def __init__(self, shape, dtype=tf.float32, name=None):
|
||||
"""Creates a placeholder for a batch of tensors of a given shape and dtype
|
||||
|
||||
Parameters
|
||||
----------
|
||||
shape: [int]
|
||||
shape of a single element of the batch
|
||||
dtype: tf.dtype
|
||||
number representation used for tensor contents
|
||||
name: str
|
||||
name of the underlying placeholder
|
||||
"""
|
||||
super().__init__(tf.placeholder(dtype, [None] + list(shape), name=name))
|
||||
|
||||
class Uint8Input(PlaceholderTfInput):
|
||||
def __init__(self, shape, name=None):
|
||||
"""Takes input in uint8 format which is cast to float32 and divided by 255
|
||||
before passing it to the model.
|
||||
|
||||
On GPU this ensures lower data transfer times.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
shape: [int]
|
||||
shape of the tensor.
|
||||
name: str
|
||||
name of the underlying placeholder
|
||||
"""
|
||||
|
||||
super().__init__(tf.placeholder(tf.uint8, [None] + list(shape), name=name))
|
||||
self._shape = shape
|
||||
self._output = tf.cast(super().get(), tf.float32) / 255.0
|
||||
|
||||
def get(self):
|
||||
return self._output
|
@@ -1,52 +0,0 @@
|
||||
# Generative Adversarial Imitation Learning (GAIL)

- Original paper: https://arxiv.org/abs/1606.03476

For MuJoCo benchmarking results, see [here](result/gail-result.md).

## If you want to train an imitation learning agent

### Step 1: Download expert data

Download the expert data into `./data`: [download link](https://drive.google.com/drive/folders/1h3H4AY_ZBx08hz-Ct0Nxxus-V1melu1U?usp=sharing)

### Step 2: Run GAIL

Run with a single thread:

```bash
python -m baselines.gail.run_mujoco
```

Run with multiple threads:

```bash
mpirun -np 16 python -m baselines.gail.run_mujoco
```

See help (`-h`) for more options.

#### In case you want to run Behavior Cloning (BC)

```bash
python -m baselines.gail.behavior_clone
```

See help (`-h`) for more options.

## Contributing

Bug reports and pull requests are welcome on GitHub at https://github.com/openai/baselines/pulls.

## Maintainers

- Yuan-Hong Liao, andrewliao11_at_gmail_dot_com
- Ryan Julian, ryanjulian_at_gmail_dot_com

## Others

Thanks to the following open source projects:

- @openai/imitation
- @carpedm20/deep-rl-tensorflow
@@ -1,87 +0,0 @@
|
||||
'''
|
||||
Reference: https://github.com/openai/imitation
|
||||
I follow the architecture from the official repository
|
||||
'''
|
||||
import tensorflow as tf
|
||||
import numpy as np
|
||||
|
||||
from baselines.common.mpi_running_mean_std import RunningMeanStd
|
||||
from baselines.common import tf_util as U
|
||||
|
||||
def logsigmoid(a):
|
||||
'''Equivalent to tf.log(tf.sigmoid(a))'''
|
||||
return -tf.nn.softplus(-a)
|
||||
|
||||
""" Reference: https://github.com/openai/imitation/blob/99fbccf3e060b6e6c739bdf209758620fcdefd3c/policyopt/thutil.py#L48-L51"""
|
||||
def logit_bernoulli_entropy(logits):
|
||||
ent = (1.-tf.nn.sigmoid(logits))*logits - logsigmoid(logits)
|
||||
return ent
|
||||
|
||||
class TransitionClassifier(object):
|
||||
def __init__(self, env, hidden_size, entcoeff=0.001, lr_rate=1e-3, scope="adversary"):
|
||||
self.scope = scope
|
||||
self.observation_shape = env.observation_space.shape
|
||||
self.actions_shape = env.action_space.shape
|
||||
self.input_shape = tuple([o+a for o, a in zip(self.observation_shape, self.actions_shape)])
|
||||
self.num_actions = env.action_space.shape[0]
|
||||
self.hidden_size = hidden_size
|
||||
self.build_ph()
|
||||
# Build graph
|
||||
generator_logits = self.build_graph(self.generator_obs_ph, self.generator_acs_ph, reuse=False)
|
||||
expert_logits = self.build_graph(self.expert_obs_ph, self.expert_acs_ph, reuse=True)
|
||||
# Build accuracy
|
||||
generator_acc = tf.reduce_mean(tf.to_float(tf.nn.sigmoid(generator_logits) < 0.5))
|
||||
expert_acc = tf.reduce_mean(tf.to_float(tf.nn.sigmoid(expert_logits) > 0.5))
|
||||
# Build regression loss
|
||||
# let x = logits, z = targets.
|
||||
# z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
|
||||
generator_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=generator_logits, labels=tf.zeros_like(generator_logits))
|
||||
generator_loss = tf.reduce_mean(generator_loss)
|
||||
expert_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=expert_logits, labels=tf.ones_like(expert_logits))
|
||||
expert_loss = tf.reduce_mean(expert_loss)
|
||||
# Build entropy loss
|
||||
logits = tf.concat([generator_logits, expert_logits], 0)
|
||||
entropy = tf.reduce_mean(logit_bernoulli_entropy(logits))
|
||||
entropy_loss = -entcoeff*entropy
|
||||
# Loss + Accuracy terms
|
||||
self.losses = [generator_loss, expert_loss, entropy, entropy_loss, generator_acc, expert_acc]
|
||||
self.loss_name = ["generator_loss", "expert_loss", "entropy", "entropy_loss", "generator_acc", "expert_acc"]
|
||||
self.total_loss = generator_loss + expert_loss + entropy_loss
|
||||
# Build Reward for policy
|
||||
self.reward_op = -tf.log(1-tf.nn.sigmoid(generator_logits)+1e-8)
|
||||
var_list = self.get_trainable_variables()
|
||||
self.lossandgrad = U.function([self.generator_obs_ph, self.generator_acs_ph, self.expert_obs_ph, self.expert_acs_ph],
|
||||
self.losses + [U.flatgrad(self.total_loss, var_list)])
|
||||
|
||||
def build_ph(self):
|
||||
self.generator_obs_ph = tf.placeholder(tf.float32, (None, ) + self.observation_shape, name="observations_ph")
|
||||
self.generator_acs_ph = tf.placeholder(tf.float32, (None, ) + self.actions_shape, name="actions_ph")
|
||||
self.expert_obs_ph = tf.placeholder(tf.float32, (None, ) + self.observation_shape, name="expert_observations_ph")
|
||||
self.expert_acs_ph = tf.placeholder(tf.float32, (None, ) + self.actions_shape, name="expert_actions_ph")
|
||||
|
||||
def build_graph(self, obs_ph, acs_ph, reuse=False):
|
||||
with tf.variable_scope(self.scope):
|
||||
if reuse:
|
||||
tf.get_variable_scope().reuse_variables()
|
||||
|
||||
with tf.variable_scope("obfilter"):
|
||||
self.obs_rms = RunningMeanStd(shape=self.observation_shape)
|
||||
obs = (obs_ph - self.obs_rms.mean) / self.obs_rms.std
|
||||
_input = tf.concat([obs, acs_ph], axis=1) # concatenate the two input -> form a transition
|
||||
p_h1 = tf.contrib.layers.fully_connected(_input, self.hidden_size, activation_fn=tf.nn.tanh)
|
||||
p_h2 = tf.contrib.layers.fully_connected(p_h1, self.hidden_size, activation_fn=tf.nn.tanh)
|
||||
logits = tf.contrib.layers.fully_connected(p_h2, 1, activation_fn=tf.identity)
|
||||
return logits
|
||||
|
||||
def get_trainable_variables(self):
|
||||
return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
|
||||
|
||||
def get_reward(self, obs, acs):
|
||||
sess = tf.get_default_session()
|
||||
if len(obs.shape) == 1:
|
||||
obs = np.expand_dims(obs, 0)
|
||||
if len(acs.shape) == 1:
|
||||
acs = np.expand_dims(acs, 0)
|
||||
feed_dict = {self.generator_obs_ph: obs, self.generator_acs_ph: acs}
|
||||
reward = sess.run(self.reward_op, feed_dict)
|
||||
return reward
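A quick sanity check of `logit_bernoulli_entropy` above: with $p = \sigma(x)$ and $\log\sigma(x) = -\mathrm{softplus}(-x)$, the Bernoulli entropy simplifies to exactly the expression used in the code:

```latex
H(\sigma(x)) = -\sigma(x)\log\sigma(x) - (1-\sigma(x))\log(1-\sigma(x))
             = (1 - \sigma(x))\,x - \log\sigma(x)
```

which matches `ent = (1. - tf.nn.sigmoid(logits)) * logits - logsigmoid(logits)`. The policy reward in `reward_op`, $-\log(1 - D(s,a) + 10^{-8})$, grows as the discriminator assigns the transition a higher probability of being expert.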
|
@@ -1,124 +0,0 @@
|
||||
'''
|
||||
The code is used to train BC imitator, or pretrained GAIL imitator
|
||||
'''
|
||||
|
||||
import argparse
|
||||
import tempfile
|
||||
import os.path as osp
|
||||
import gym
|
||||
import logging
|
||||
from tqdm import tqdm
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from baselines.gail import mlp_policy
|
||||
from baselines import bench
|
||||
from baselines import logger
|
||||
from baselines.common import set_global_seeds, tf_util as U
|
||||
from baselines.common.misc_util import boolean_flag
|
||||
from baselines.common.mpi_adam import MpiAdam
|
||||
from baselines.gail.run_mujoco import runner
|
||||
from baselines.gail.dataset.mujoco_dset import Mujoco_Dset
|
||||
|
||||
|
||||
def argsparser():
|
||||
parser = argparse.ArgumentParser("Tensorflow Implementation of Behavior Cloning")
|
||||
parser.add_argument('--env_id', help='environment ID', default='Hopper-v1')
|
||||
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
|
||||
parser.add_argument('--expert_path', type=str, default='data/deterministic.trpo.Hopper.0.00.npz')
|
||||
parser.add_argument('--checkpoint_dir', help='the directory to save model', default='checkpoint')
|
||||
parser.add_argument('--log_dir', help='the directory to save log file', default='log')
|
||||
# Mujoco Dataset Configuration
|
||||
parser.add_argument('--traj_limitation', type=int, default=-1)
|
||||
# Network Configuration (Using MLP Policy)
|
||||
parser.add_argument('--policy_hidden_size', type=int, default=100)
|
||||
# for evaluation
|
||||
boolean_flag(parser, 'stochastic_policy', default=False, help='use stochastic/deterministic policy to evaluate')
|
||||
boolean_flag(parser, 'save_sample', default=False, help='save the trajectories or not')
|
||||
parser.add_argument('--BC_max_iter', help='Max iteration for training BC', type=int, default=1e5)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def learn(env, policy_func, dataset, optim_batch_size=128, max_iters=1e4,
|
||||
adam_epsilon=1e-5, optim_stepsize=3e-4,
|
||||
ckpt_dir=None, log_dir=None, task_name=None,
|
||||
verbose=False):
|
||||
|
||||
val_per_iter = int(max_iters/10)
|
||||
ob_space = env.observation_space
|
||||
ac_space = env.action_space
|
||||
pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy
|
||||
# placeholder
|
||||
ob = U.get_placeholder_cached(name="ob")
|
||||
ac = pi.pdtype.sample_placeholder([None])
|
||||
stochastic = U.get_placeholder_cached(name="stochastic")
|
||||
loss = tf.reduce_mean(tf.square(ac-pi.ac))
|
||||
var_list = pi.get_trainable_variables()
|
||||
adam = MpiAdam(var_list, epsilon=adam_epsilon)
|
||||
lossandgrad = U.function([ob, ac, stochastic], [loss]+[U.flatgrad(loss, var_list)])
|
||||
|
||||
U.initialize()
|
||||
adam.sync()
|
||||
logger.log("Pretraining with Behavior Cloning...")
|
||||
for iter_so_far in tqdm(range(int(max_iters))):
|
||||
ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size, 'train')
|
||||
train_loss, g = lossandgrad(ob_expert, ac_expert, True)
|
||||
adam.update(g, optim_stepsize)
|
||||
if verbose and iter_so_far % val_per_iter == 0:
|
||||
ob_expert, ac_expert = dataset.get_next_batch(-1, 'val')
|
||||
val_loss, _ = lossandgrad(ob_expert, ac_expert, True)
|
||||
logger.log("Training loss: {}, Validation loss: {}".format(train_loss, val_loss))
|
||||
|
||||
if ckpt_dir is None:
|
||||
savedir_fname = tempfile.TemporaryDirectory().name
|
||||
else:
|
||||
savedir_fname = osp.join(ckpt_dir, task_name)
|
||||
U.save_state(savedir_fname, var_list=pi.get_variables())
|
||||
return savedir_fname
|
||||
|
||||
|
||||
def get_task_name(args):
|
||||
task_name = 'BC'
|
||||
task_name += '.{}'.format(args.env_id.split("-")[0])
|
||||
task_name += '.traj_limitation_{}'.format(args.traj_limitation)
|
||||
task_name += ".seed_{}".format(args.seed)
|
||||
return task_name
|
||||
|
||||
|
||||
def main(args):
|
||||
U.make_session(num_cpu=1).__enter__()
|
||||
set_global_seeds(args.seed)
|
||||
env = gym.make(args.env_id)
|
||||
|
||||
def policy_fn(name, ob_space, ac_space, reuse=False):
|
||||
return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
|
||||
reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2)
|
||||
env = bench.Monitor(env, logger.get_dir() and
|
||||
osp.join(logger.get_dir(), "monitor.json"))
|
||||
env.seed(args.seed)
|
||||
gym.logger.setLevel(logging.WARN)
|
||||
task_name = get_task_name(args)
|
||||
args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
|
||||
args.log_dir = osp.join(args.log_dir, task_name)
|
||||
dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)
|
||||
savedir_fname = learn(env,
|
||||
policy_fn,
|
||||
dataset,
|
||||
max_iters=args.BC_max_iter,
|
||||
ckpt_dir=args.checkpoint_dir,
|
||||
log_dir=args.log_dir,
|
||||
task_name=task_name,
|
||||
verbose=True)
|
||||
avg_len, avg_ret = runner(env,
|
||||
policy_fn,
|
||||
savedir_fname,
|
||||
timesteps_per_batch=1024,
|
||||
number_trajs=10,
|
||||
stochastic_policy=args.stochastic_policy,
|
||||
save=args.save_sample,
|
||||
reuse=True)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = argsparser()
|
||||
main(args)
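The pretraining objective in `learn` above is plain behavior cloning: a least-squares regression of the policy's action onto the expert action, matching `loss = tf.reduce_mean(tf.square(ac - pi.ac))` up to a constant factor. In the obvious notation:

```latex
\mathcal{L}_{\mathrm{BC}}(\theta) = \mathbb{E}_{(s,\,a^{E}) \sim \mathcal{D}_{\mathrm{expert}}}\big[\, \lVert \pi_\theta(s) - a^{E} \rVert_2^2 \,\big]
```

It is minimized with `MpiAdam` over `pi.get_trainable_variables()`; the validation split is only used for logging every `val_per_iter` iterations.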
|
@@ -1,116 +0,0 @@
|
||||
'''
|
||||
Data structure of the input .npz:
|
||||
the data is saved in python dictionary format with keys: 'acs', 'ep_rets', 'rews', 'obs'
|
||||
the value of each key is a list storing the expert trajectories sequentially
|
||||
a transition can be: (data['obs'][t], data['acs'][t], data['obs'][t+1]) and get reward data['rews'][t]
|
||||
'''
|
||||
|
||||
from baselines import logger
|
||||
import numpy as np
|
||||
|
||||
|
||||
class Dset(object):
|
||||
def __init__(self, inputs, labels, randomize):
|
||||
self.inputs = inputs
|
||||
self.labels = labels
|
||||
assert len(self.inputs) == len(self.labels)
|
||||
self.randomize = randomize
|
||||
self.num_pairs = len(inputs)
|
||||
self.init_pointer()
|
||||
|
||||
def init_pointer(self):
|
||||
self.pointer = 0
|
||||
if self.randomize:
|
||||
idx = np.arange(self.num_pairs)
|
||||
np.random.shuffle(idx)
|
||||
self.inputs = self.inputs[idx, :]
|
||||
self.labels = self.labels[idx, :]
|
||||
|
||||
def get_next_batch(self, batch_size):
|
||||
# if batch_size is negative -> return all
|
||||
if batch_size < 0:
|
||||
return self.inputs, self.labels
|
||||
if self.pointer + batch_size >= self.num_pairs:
|
||||
self.init_pointer()
|
||||
end = self.pointer + batch_size
|
||||
inputs = self.inputs[self.pointer:end, :]
|
||||
labels = self.labels[self.pointer:end, :]
|
||||
self.pointer = end
|
||||
return inputs, labels
|
||||
|
||||
|
||||
class Mujoco_Dset(object):
|
||||
def __init__(self, expert_path, train_fraction=0.7, traj_limitation=-1, randomize=True):
|
||||
traj_data = np.load(expert_path)
|
||||
if traj_limitation < 0:
|
||||
traj_limitation = len(traj_data['obs'])
|
||||
obs = traj_data['obs'][:traj_limitation]
|
||||
acs = traj_data['acs'][:traj_limitation]
|
||||
|
||||
def flatten(x):
|
||||
# x.shape = (E,), or (E, L, D)
|
||||
_, size = x[0].shape
|
||||
episode_length = [len(i) for i in x]
|
||||
y = np.zeros((sum(episode_length), size))
|
||||
start_idx = 0
|
||||
for l, x_i in zip(episode_length, x):
|
||||
y[start_idx:(start_idx+l)] = x_i
|
||||
start_idx += l
|
||||
return y
|
||||
self.obs = np.array(flatten(obs))
|
||||
self.acs = np.array(flatten(acs))
|
||||
self.rets = traj_data['ep_rets'][:traj_limitation]
|
||||
self.avg_ret = sum(self.rets)/len(self.rets)
|
||||
self.std_ret = np.std(np.array(self.rets))
|
||||
if len(self.acs) > 2:
|
||||
self.acs = np.squeeze(self.acs)
|
||||
assert len(self.obs) == len(self.acs)
|
||||
self.num_traj = min(traj_limitation, len(traj_data['obs']))
|
||||
self.num_transition = len(self.obs)
|
||||
self.randomize = randomize
|
||||
self.dset = Dset(self.obs, self.acs, self.randomize)
|
||||
# for behavior cloning
|
||||
self.train_set = Dset(self.obs[:int(self.num_transition*train_fraction), :],
|
||||
self.acs[:int(self.num_transition*train_fraction), :],
|
||||
self.randomize)
|
||||
self.val_set = Dset(self.obs[int(self.num_transition*train_fraction):, :],
|
||||
self.acs[int(self.num_transition*train_fraction):, :],
|
||||
self.randomize)
|
||||
self.log_info()
|
||||
|
||||
def log_info(self):
|
||||
logger.log("Total trajectorues: %d" % self.num_traj)
|
||||
logger.log("Total transitions: %d" % self.num_transition)
|
||||
logger.log("Average returns: %f" % self.avg_ret)
|
||||
logger.log("Std for returns: %f" % self.std_ret)
|
||||
|
||||
def get_next_batch(self, batch_size, split=None):
|
||||
if split is None:
|
||||
return self.dset.get_next_batch(batch_size)
|
||||
elif split == 'train':
|
||||
return self.train_set.get_next_batch(batch_size)
|
||||
elif split == 'val':
|
||||
return self.val_set.get_next_batch(batch_size)
|
||||
else:
|
||||
raise NotImplementedError
|
||||
|
||||
def plot(self):
|
||||
import matplotlib.pyplot as plt
|
||||
plt.hist(self.rets)
|
||||
plt.savefig("histogram_rets.png")
|
||||
plt.close()
|
||||
|
||||
|
||||
def test(expert_path, traj_limitation, plot):
|
||||
dset = Mujoco_Dset(expert_path, traj_limitation=traj_limitation)
|
||||
if plot:
|
||||
dset.plot()
|
||||
|
||||
if __name__ == '__main__':
|
||||
import argparse
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--expert_path", type=str, default="../data/deterministic.trpo.Hopper.0.00.npz")
|
||||
parser.add_argument("--traj_limitation", type=int, default=None)
|
||||
parser.add_argument("--plot", type=bool, default=False)
|
||||
args = parser.parse_args()
|
||||
test(args.expert_path, args.traj_limitation, args.plot)
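A minimal sketch of inspecting an expert file with the layout described in the `mujoco_dset` module docstring above. The path is a hypothetical example (the default `--expert_path` from `behavior_clone.py`); the key names come from the docstring.

```python
import numpy as np

# Hypothetical example path -- matches the default --expert_path in behavior_clone.py.
traj_data = np.load("data/deterministic.trpo.Hopper.0.00.npz")
# Note: if the episodes were stored as variable-length object arrays,
# newer NumPy versions may require np.load(..., allow_pickle=True).

print(traj_data.files)               # expected keys: 'obs', 'acs', 'rews', 'ep_rets'
obs, acs = traj_data["obs"], traj_data["acs"]
print(len(obs))                      # number of stored episodes
print(obs[0].shape, acs[0].shape)    # (episode_length, obs_dim), (episode_length, act_dim)
print(traj_data["ep_rets"][:5])      # per-episode returns (used for the upper bound in gail-eval.py)
```

`Mujoco_Dset.flatten` then concatenates the per-episode arrays into single `(num_transitions, dim)` matrices before building the train/validation `Dset`s.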
|
@@ -1,147 +0,0 @@
|
||||
'''
|
||||
This code is used to evaluate the imitators trained with different numbers of trajectories
|
||||
and plot the results in the same figure for easy comparison.
|
||||
'''
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import glob
|
||||
import gym
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
|
||||
from baselines.gail import run_mujoco
|
||||
from baselines.gail import mlp_policy
|
||||
from baselines.common import set_global_seeds, tf_util as U
|
||||
from baselines.common.misc_util import boolean_flag
|
||||
from baselines.gail.dataset.mujoco_dset import Mujoco_Dset
|
||||
|
||||
|
||||
plt.style.use('ggplot')
|
||||
CONFIG = {
|
||||
'traj_limitation': [1, 5, 10, 50],
|
||||
}
|
||||
|
||||
|
||||
def load_dataset(expert_path):
|
||||
dataset = Mujoco_Dset(expert_path=expert_path)
|
||||
return dataset
|
||||
|
||||
|
||||
def argsparser():
|
||||
parser = argparse.ArgumentParser('Do evaluation')
|
||||
parser.add_argument('--seed', type=int, default=0)
|
||||
parser.add_argument('--policy_hidden_size', type=int, default=100)
|
||||
parser.add_argument('--env', type=str, choices=['Hopper', 'Walker2d', 'HalfCheetah',
|
||||
'Humanoid', 'HumanoidStandup'])
|
||||
boolean_flag(parser, 'stochastic_policy', default=False, help='use stochastic/deterministic policy to evaluate')
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def evaluate_env(env_name, seed, policy_hidden_size, stochastic, reuse, prefix):
|
||||
|
||||
def get_checkpoint_dir(checkpoint_list, limit, prefix):
|
||||
for checkpoint in checkpoint_list:
|
||||
if ('limitation_'+str(limit) in checkpoint) and (prefix in checkpoint):
|
||||
return checkpoint
|
||||
return None
|
||||
|
||||
def policy_fn(name, ob_space, ac_space, reuse=False):
|
||||
return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
|
||||
reuse=reuse, hid_size=policy_hidden_size, num_hid_layers=2)
|
||||
|
||||
data_path = os.path.join('data', 'deterministic.trpo.' + env_name + '.0.00.npz')
|
||||
dataset = load_dataset(data_path)
|
||||
checkpoint_list = glob.glob(os.path.join('checkpoint', '*' + env_name + ".*"))
|
||||
log = {
|
||||
'traj_limitation': [],
|
||||
'upper_bound': [],
|
||||
'avg_ret': [],
|
||||
'avg_len': [],
|
||||
'normalized_ret': []
|
||||
}
|
||||
for i, limit in enumerate(CONFIG['traj_limitation']):
|
||||
# Do one evaluation
|
||||
upper_bound = sum(dataset.rets[:limit])/limit
|
||||
checkpoint_dir = get_checkpoint_dir(checkpoint_list, limit, prefix=prefix)
|
||||
checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)
|
||||
env = gym.make(env_name + '-v1')
|
||||
env.seed(seed)
|
||||
print('Trajectory limitation: {}, Load checkpoint: {}, '.format(limit, checkpoint_path))
|
||||
avg_len, avg_ret = run_mujoco.runner(env,
|
||||
policy_fn,
|
||||
checkpoint_path,
|
||||
timesteps_per_batch=1024,
|
||||
number_trajs=10,
|
||||
stochastic_policy=stochastic,
|
||||
reuse=((i != 0) or reuse))
|
||||
normalized_ret = avg_ret/upper_bound
|
||||
print('Upper bound: {}, evaluation returns: {}, normalized scores: {}'.format(
|
||||
upper_bound, avg_ret, normalized_ret))
|
||||
log['traj_limitation'].append(limit)
|
||||
log['upper_bound'].append(upper_bound)
|
||||
log['avg_ret'].append(avg_ret)
|
||||
log['avg_len'].append(avg_len)
|
||||
log['normalized_ret'].append(normalized_ret)
|
||||
env.close()
|
||||
return log
|
||||
|
||||
|
||||
def plot(env_name, bc_log, gail_log, stochastic):
|
||||
upper_bound = bc_log['upper_bound']
|
||||
bc_avg_ret = bc_log['avg_ret']
|
||||
gail_avg_ret = gail_log['avg_ret']
|
||||
plt.plot(CONFIG['traj_limitation'], upper_bound)
|
||||
plt.plot(CONFIG['traj_limitation'], bc_avg_ret)
|
||||
plt.plot(CONFIG['traj_limitation'], gail_avg_ret)
|
||||
plt.xlabel('Number of expert trajectories')
|
||||
plt.ylabel('Accumulated reward')
|
||||
plt.title('{} unnormalized scores'.format(env_name))
|
||||
plt.legend(['expert', 'bc-imitator', 'gail-imitator'], loc='lower right')
|
||||
plt.grid(b=True, which='major', color='gray', linestyle='--')
|
||||
if stochastic:
|
||||
title_name = 'result/{}-unnormalized-stochastic-scores.png'.format(env_name)
|
||||
else:
|
||||
title_name = 'result/{}-unnormalized-deterministic-scores.png'.format(env_name)
|
||||
plt.savefig(title_name)
|
||||
plt.close()
|
||||
|
||||
bc_normalized_ret = bc_log['normalized_ret']
|
||||
gail_normalized_ret = gail_log['normalized_ret']
|
||||
plt.plot(CONFIG['traj_limitation'], np.ones(len(CONFIG['traj_limitation'])))
|
||||
plt.plot(CONFIG['traj_limitation'], bc_normalized_ret)
|
||||
plt.plot(CONFIG['traj_limitation'], gail_normalized_ret)
|
||||
plt.xlabel('Number of expert trajectories')
|
||||
plt.ylabel('Normalized performance')
|
||||
plt.title('{} normalized scores'.format(env_name))
|
||||
plt.legend(['expert', 'bc-imitator', 'gail-imitator'], loc='lower right')
|
||||
plt.grid(b=True, which='major', color='gray', linestyle='--')
|
||||
if stochastic:
|
||||
title_name = 'result/{}-normalized-stochastic-scores.png'.format(env_name)
|
||||
else:
|
||||
title_name = 'result/{}-normalized-deterministic-scores.png'.format(env_name)
|
||||
plt.ylim(0, 1.6)
|
||||
plt.savefig(title_name)
|
||||
plt.close()
|
||||
|
||||
|
||||
def main(args):
|
||||
U.make_session(num_cpu=1).__enter__()
|
||||
set_global_seeds(args.seed)
|
||||
print('Evaluating {}'.format(args.env))
|
||||
bc_log = evaluate_env(args.env, args.seed, args.policy_hidden_size,
|
||||
args.stochastic_policy, False, 'BC')
|
||||
print('Evaluation for {}'.format(args.env))
|
||||
print(bc_log)
|
||||
gail_log = evaluate_env(args.env, args.seed, args.policy_hidden_size,
|
||||
args.stochastic_policy, True, 'gail')
|
||||
print('Evaluation for {}'.format(args.env))
|
||||
print(gail_log)
|
||||
plot(args.env, bc_log, gail_log, args.stochastic_policy)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = argsparser()
|
||||
main(args)
|
@@ -1,75 +0,0 @@
|
||||
'''
|
||||
Adapted from baselines/ppo1/mlp_policy.py with two simple modifications:
|
||||
(1) add reuse argument
|
||||
(2) cache the `stochastic` placeholder
|
||||
'''
|
||||
import tensorflow as tf
|
||||
import gym
|
||||
|
||||
import baselines.common.tf_util as U
|
||||
from baselines.common.mpi_running_mean_std import RunningMeanStd
|
||||
from baselines.common.distributions import make_pdtype
|
||||
from baselines.acktr.utils import dense
|
||||
|
||||
|
||||
class MlpPolicy(object):
|
||||
recurrent = False
|
||||
|
||||
def __init__(self, name, reuse=False, *args, **kwargs):
|
||||
with tf.variable_scope(name):
|
||||
if reuse:
|
||||
tf.get_variable_scope().reuse_variables()
|
||||
self._init(*args, **kwargs)
|
||||
self.scope = tf.get_variable_scope().name
|
||||
|
||||
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
|
||||
assert isinstance(ob_space, gym.spaces.Box)
|
||||
|
||||
self.pdtype = pdtype = make_pdtype(ac_space)
|
||||
sequence_length = None
|
||||
|
||||
ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
|
||||
|
||||
with tf.variable_scope("obfilter"):
|
||||
self.ob_rms = RunningMeanStd(shape=ob_space.shape)
|
||||
|
||||
obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
|
||||
last_out = obz
|
||||
for i in range(num_hid_layers):
|
||||
last_out = tf.nn.tanh(dense(last_out, hid_size, "vffc%i" % (i+1), weight_init=U.normc_initializer(1.0)))
|
||||
self.vpred = dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]
|
||||
|
||||
last_out = obz
|
||||
for i in range(num_hid_layers):
|
||||
last_out = tf.nn.tanh(dense(last_out, hid_size, "polfc%i" % (i+1), weight_init=U.normc_initializer(1.0)))
|
||||
|
||||
if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
|
||||
mean = dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01))
|
||||
logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
|
||||
pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
|
||||
else:
|
||||
pdparam = dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))
|
||||
|
||||
self.pd = pdtype.pdfromflat(pdparam)
|
||||
|
||||
self.state_in = []
|
||||
self.state_out = []
|
||||
|
||||
# change for BC
|
||||
stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
|
||||
ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
|
||||
self.ac = ac
|
||||
self._act = U.function([stochastic, ob], [ac, self.vpred])
|
||||
|
||||
def act(self, stochastic, ob):
|
||||
ac1, vpred1 = self._act(stochastic, ob[None])
|
||||
return ac1[0], vpred1[0]
|
||||
|
||||
def get_variables(self):
|
||||
return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)
|
||||
|
||||
def get_trainable_variables(self):
|
||||
return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
|
||||
|
||||
def get_initial_state(self):
|
||||
return []
|
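For context, a minimal usage sketch of this policy class, following the same pattern as `policy_fn` and `runner` in the GAIL training script below. The environment id and hidden sizes are illustrative, not prescribed by this diff:

```python
# Illustrative only: build the MLP policy and query one action, mirroring how
# policy_fn/runner in run_mujoco.py use it (env id and sizes are examples).
import gym
import baselines.common.tf_util as U
from baselines.gail import mlp_policy

env = gym.make('Hopper-v2')
with U.make_session(num_cpu=1):
    pi = mlp_policy.MlpPolicy(name='pi',
                              ob_space=env.observation_space,
                              ac_space=env.action_space,
                              hid_size=100, num_hid_layers=2)
    U.initialize()
    ob = env.reset()
    ac, vpred = pi.act(stochastic=True, ob=ob)  # sampled action and V(s)
```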
(20 deleted image files: the per-environment normalized/un-normalized deterministic and stochastic score plots referenced in the results page below.)
@@ -1,53 +0,0 @@
# Results of GAIL/BC on Mujoco

Here are extensive experimental results of applying GAIL/BC to Mujoco environments, including
Hopper-v1, Walker2d-v1, HalfCheetah-v1, Humanoid-v1, and HumanoidStandup-v1. Every imitator is evaluated with seed 0.

## Results

### Training through iterations

- Hopper-v1
<img src='hopper-training.png'>

- HalfCheetah-v1
<img src='halfcheetah-training.png'>

- Walker2d-v1
<img src='walker2d-training.png'>

- Humanoid-v1
<img src='humanoid-training.png'>

- HumanoidStandup-v1
<img src='humanoidstandup-training.png'>

For details about GAIL training (e.g., adversarial loss, discriminator accuracy, etc.), please see [here](https://drive.google.com/drive/folders/1nnU8dqAV9i37-_5_vWIspyFUJFQLCsDD?usp=sharing)

### Deterministic Policy (set std=0)
| | Un-normalized | Normalized |
|---|---|---|
| Hopper-v1 | <img src='Hopper-unnormalized-deterministic-scores.png'> | <img src='Hopper-normalized-deterministic-scores.png'> |
| HalfCheetah-v1 | <img src='HalfCheetah-unnormalized-deterministic-scores.png'> | <img src='HalfCheetah-normalized-deterministic-scores.png'> |
| Walker2d-v1 | <img src='Walker2d-unnormalized-deterministic-scores.png'> | <img src='Walker2d-normalized-deterministic-scores.png'> |
| Humanoid-v1 | <img src='Humanoid-unnormalized-deterministic-scores.png'> | <img src='Humanoid-normalized-deterministic-scores.png'> |
| HumanoidStandup-v1 | <img src='HumanoidStandup-unnormalized-deterministic-scores.png'> | <img src='HumanoidStandup-normalized-deterministic-scores.png'> |

### Stochastic Policy
| | Un-normalized | Normalized |
|---|---|---|
| Hopper-v1 | <img src='Hopper-unnormalized-stochastic-scores.png'> | <img src='Hopper-normalized-stochastic-scores.png'> |
| HalfCheetah-v1 | <img src='HalfCheetah-unnormalized-stochastic-scores.png'> | <img src='HalfCheetah-normalized-stochastic-scores.png'> |
| Walker2d-v1 | <img src='Walker2d-unnormalized-stochastic-scores.png'> | <img src='Walker2d-normalized-stochastic-scores.png'> |
| Humanoid-v1 | <img src='Humanoid-unnormalized-stochastic-scores.png'> | <img src='Humanoid-normalized-stochastic-scores.png'> |
| HumanoidStandup-v1 | <img src='HumanoidStandup-unnormalized-stochastic-scores.png'> | <img src='HumanoidStandup-normalized-stochastic-scores.png'> |

### Details about the GAIL imitators

For every environment, imitators are trained with 1, 5, 10, and 50 trajectories, where each trajectory contains at most
1024 transitions, using seeds 0, 1, 2, and 3, respectively.

### Details about the BC imitators

All BC imitators are trained with seed 0.
(5 deleted image files: the per-environment training-curve plots referenced above.)
@@ -1,239 +0,0 @@
'''
Disclaimer: this code is based heavily on trpo_mpi at @openai/baselines and @openai/imitation
'''

import argparse
import os.path as osp
import logging
from mpi4py import MPI
from tqdm import tqdm

import numpy as np
import gym

from baselines.gail import mlp_policy
from baselines.common import set_global_seeds, tf_util as U
from baselines.common.misc_util import boolean_flag
from baselines import bench
from baselines import logger
from baselines.gail.dataset.mujoco_dset import Mujoco_Dset
from baselines.gail.adversary import TransitionClassifier


def argsparser():
    parser = argparse.ArgumentParser("Tensorflow Implementation of GAIL")
    parser.add_argument('--env_id', help='environment ID', default='Hopper-v2')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--expert_path', type=str, default='data/deterministic.trpo.Hopper.0.00.npz')
    parser.add_argument('--checkpoint_dir', help='the directory to save model', default='checkpoint')
    parser.add_argument('--log_dir', help='the directory to save log file', default='log')
    parser.add_argument('--load_model_path', help='if provided, load the model', type=str, default=None)
    # Task
    parser.add_argument('--task', type=str, choices=['train', 'evaluate', 'sample'], default='train')
    # for evaluation
    boolean_flag(parser, 'stochastic_policy', default=False, help='use stochastic/deterministic policy to evaluate')
    boolean_flag(parser, 'save_sample', default=False, help='save the trajectories or not')
    # Mujoco Dataset Configuration
    parser.add_argument('--traj_limitation', type=int, default=-1)
    # Optimization Configuration
    parser.add_argument('--g_step', help='number of steps to train policy in each epoch', type=int, default=3)
    parser.add_argument('--d_step', help='number of steps to train discriminator in each epoch', type=int, default=1)
    # Network Configuration (Using MLP Policy)
    parser.add_argument('--policy_hidden_size', type=int, default=100)
    parser.add_argument('--adversary_hidden_size', type=int, default=100)
    # Algorithms Configuration
    parser.add_argument('--algo', type=str, choices=['trpo', 'ppo'], default='trpo')
    parser.add_argument('--max_kl', type=float, default=0.01)
    parser.add_argument('--policy_entcoeff', help='entropy coefficient of policy', type=float, default=0)
    parser.add_argument('--adversary_entcoeff', help='entropy coefficient of discriminator', type=float, default=1e-3)
    # Training Configuration
    parser.add_argument('--save_per_iter', help='save model every xx iterations', type=int, default=100)
    parser.add_argument('--num_timesteps', help='total number of training timesteps', type=int, default=5e6)
    # Behavior Cloning
    boolean_flag(parser, 'pretrained', default=False, help='Use BC to pretrain')
    parser.add_argument('--BC_max_iter', help='Max iteration for training BC', type=int, default=1e4)
    return parser.parse_args()


def get_task_name(args):
    task_name = args.algo + "_gail."
    if args.pretrained:
        task_name += "with_pretrained."
    if args.traj_limitation != np.inf:
        task_name += "transition_limitation_%d." % args.traj_limitation
    task_name += args.env_id.split("-")[0]
    task_name = task_name + ".g_step_" + str(args.g_step) + ".d_step_" + str(args.d_step) + \
        ".policy_entcoeff_" + str(args.policy_entcoeff) + ".adversary_entcoeff_" + str(args.adversary_entcoeff)
    task_name += ".seed_" + str(args.seed)
    return task_name


def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2)
    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    if args.task == 'train':
        dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)
        reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff)
        train(env,
              args.seed,
              policy_fn,
              reward_giver,
              dataset,
              args.algo,
              args.g_step,
              args.d_step,
              args.policy_entcoeff,
              args.num_timesteps,
              args.save_per_iter,
              args.checkpoint_dir,
              args.log_dir,
              args.pretrained,
              args.BC_max_iter,
              task_name
              )
    elif args.task == 'evaluate':
        runner(env,
               policy_fn,
               args.load_model_path,
               timesteps_per_batch=1024,
               number_trajs=10,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample
               )
    else:
        raise NotImplementedError
    env.close()


def train(env, seed, policy_fn, reward_giver, dataset, algo,
          g_step, d_step, policy_entcoeff, num_timesteps, save_per_iter,
          checkpoint_dir, log_dir, pretrained, BC_max_iter, task_name=None):

    pretrained_weight = None
    if pretrained and (BC_max_iter > 0):
        # Pretrain with behavior cloning
        from baselines.gail import behavior_clone
        pretrained_weight = behavior_clone.learn(env, policy_fn, dataset,
                                                 max_iters=BC_max_iter)

    if algo == 'trpo':
        from baselines.gail import trpo_mpi
        # Set up for MPI seed
        rank = MPI.COMM_WORLD.Get_rank()
        if rank != 0:
            logger.set_level(logger.DISABLED)
        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
        set_global_seeds(workerseed)
        env.seed(workerseed)
        trpo_mpi.learn(env, policy_fn, reward_giver, dataset, rank,
                       pretrained=pretrained, pretrained_weight=pretrained_weight,
                       g_step=g_step, d_step=d_step,
                       entcoeff=policy_entcoeff,
                       max_timesteps=num_timesteps,
                       ckpt_dir=checkpoint_dir, log_dir=log_dir,
                       save_per_iter=save_per_iter,
                       timesteps_per_batch=1024,
                       max_kl=0.01, cg_iters=10, cg_damping=0.1,
                       gamma=0.995, lam=0.97,
                       vf_iters=5, vf_stepsize=1e-3,
                       task_name=task_name)
    else:
        raise NotImplementedError


def runner(env, policy_func, load_model_path, timesteps_per_batch, number_trajs,
           stochastic_policy, save=False, reuse=False):

    # Setup network
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space, reuse=reuse)
    U.initialize()
    # Prepare for rollouts
    # ----------------------------------------
    U.load_state(load_model_path)

    obs_list = []
    acs_list = []
    len_list = []
    ret_list = []
    for _ in tqdm(range(number_trajs)):
        traj = traj_1_generator(pi, env, timesteps_per_batch, stochastic=stochastic_policy)
        obs, acs, ep_len, ep_ret = traj['ob'], traj['ac'], traj['ep_len'], traj['ep_ret']
        obs_list.append(obs)
        acs_list.append(acs)
        len_list.append(ep_len)
        ret_list.append(ep_ret)
    if stochastic_policy:
        print('stochastic policy:')
    else:
        print('deterministic policy:')
    if save:
        filename = load_model_path.split('/')[-1] + '.' + env.spec.id
        np.savez(filename, obs=np.array(obs_list), acs=np.array(acs_list),
                 lens=np.array(len_list), rets=np.array(ret_list))
    avg_len = sum(len_list)/len(len_list)
    avg_ret = sum(ret_list)/len(ret_list)
    print("Average length:", avg_len)
    print("Average return:", avg_ret)
    return avg_len, avg_ret


# Sample one trajectory (until trajectory end)
def traj_1_generator(pi, env, horizon, stochastic):

    t = 0
    ac = env.action_space.sample()  # not used, just so we have the datatype
    new = True  # marks if we're on first timestep of an episode

    ob = env.reset()
    cur_ep_ret = 0  # return in current episode
    cur_ep_len = 0  # len of current episode

    # Initialize history arrays
    obs = []
    rews = []
    news = []
    acs = []

    while True:
        ac, vpred = pi.act(stochastic, ob)
        obs.append(ob)
        news.append(new)
        acs.append(ac)

        ob, rew, new, _ = env.step(ac)
        rews.append(rew)

        cur_ep_ret += rew
        cur_ep_len += 1
        if new or t >= horizon:
            break
        t += 1

    obs = np.array(obs)
    rews = np.array(rews)
    news = np.array(news)
    acs = np.array(acs)
    traj = {"ob": obs, "rew": rews, "new": news, "ac": acs,
            "ep_ret": cur_ep_ret, "ep_len": cur_ep_len}
    return traj


if __name__ == '__main__':
    args = argsparser()
    main(args)
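For reference, a minimal sketch of the checkpoint/log naming convention that `get_task_name` above produces for the parser defaults. The snippet mirrors the string operations in the function rather than importing it, and the values are simply the defaults shown above:

```python
# Illustration only: reproduces get_task_name()'s output for the default args.
algo, env_id, seed = 'trpo', 'Hopper-v2', 0
g_step, d_step = 3, 1
policy_entcoeff, adversary_entcoeff = 0, 1e-3
traj_limitation = -1  # not np.inf, so the limitation suffix is included

task_name = algo + "_gail."
task_name += "transition_limitation_%d." % traj_limitation
task_name += env_id.split("-")[0]
task_name += ".g_step_" + str(g_step) + ".d_step_" + str(d_step)
task_name += ".policy_entcoeff_" + str(policy_entcoeff)
task_name += ".adversary_entcoeff_" + str(adversary_entcoeff)
task_name += ".seed_" + str(seed)
print(task_name)
# trpo_gail.transition_limitation_-1.Hopper.g_step_3.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0
```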
@@ -1,45 +0,0 @@
'''
This code is based heavily on https://github.com/carpedm20/deep-rl-tensorflow/blob/master/agents/statistic.py
'''

import tensorflow as tf
import numpy as np

import baselines.common.tf_util as U


class stats():

    def __init__(self, scalar_keys=[], histogram_keys=[]):
        self.scalar_keys = scalar_keys
        self.histogram_keys = histogram_keys
        self.scalar_summaries = []
        self.scalar_summaries_ph = []
        self.histogram_summaries_ph = []
        self.histogram_summaries = []
        with tf.variable_scope('summary'):
            for k in scalar_keys:
                ph = tf.placeholder('float32', None, name=k+'.scalar.summary')
                sm = tf.summary.scalar(k+'.scalar.summary', ph)
                self.scalar_summaries_ph.append(ph)
                self.scalar_summaries.append(sm)
            for k in histogram_keys:
                ph = tf.placeholder('float32', None, name=k+'.histogram.summary')
                sm = tf.summary.scalar(k+'.histogram.summary', ph)
                self.histogram_summaries_ph.append(ph)
                self.histogram_summaries.append(sm)

        self.summaries = tf.summary.merge(self.scalar_summaries+self.histogram_summaries)

    def add_all_summary(self, writer, values, iter):
        # Note that the order of the incoming ```values``` should be the same as that of the
        # ```scalar_keys``` given in ```__init__```
        if np.sum(np.isnan(values)+0) != 0:
            return
        sess = U.get_session()
        keys = self.scalar_summaries_ph + self.histogram_summaries_ph
        feed_dict = {}
        for k, v in zip(keys, values):
            feed_dict.update({k: v})
        summaries_str = sess.run(self.summaries, feed_dict)
        writer.add_summary(summaries_str, iter)
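A minimal usage sketch of this helper, assuming a TensorBoard `FileWriter` (the log directory name and the logged values are illustrative; the key list matches the one used by the TRPO/GAIL training loop below):

```python
# Illustration only: log a few scalar values to TensorBoard via `stats`.
import tensorflow as tf
import baselines.common.tf_util as U
from baselines.gail.statistics import stats

with U.make_session(num_cpu=1):
    ep_stats = stats(["True_rewards", "Rewards", "Episode_length"])
    writer = tf.summary.FileWriter("/tmp/gail_summaries",
                                   U.get_session().graph)
    # The order of `values` must match the keys passed to the constructor.
    ep_stats.add_all_summary(writer, values=[123.4, 56.7, 1000], iter=0)
```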
@@ -1,354 +0,0 @@
|
||||
'''
|
||||
Disclaimer: The TRPO part relies heavily on trpo_mpi at @openai/baselines
|
||||
'''
|
||||
|
||||
import time
|
||||
import os
|
||||
from contextlib import contextmanager
|
||||
from mpi4py import MPI
|
||||
from collections import deque
|
||||
|
||||
import tensorflow as tf
|
||||
import numpy as np
|
||||
|
||||
import baselines.common.tf_util as U
|
||||
from baselines.common import explained_variance, zipsame, dataset, fmt_row
|
||||
from baselines import logger
|
||||
from baselines.common import colorize
|
||||
from baselines.common.mpi_adam import MpiAdam
|
||||
from baselines.common.cg import cg
|
||||
from baselines.gail.statistics import stats
|
||||
|
||||
|
||||
def traj_segment_generator(pi, env, reward_giver, horizon, stochastic):
|
||||
|
||||
# Initialize state variables
|
||||
t = 0
|
||||
ac = env.action_space.sample()
|
||||
new = True
|
||||
rew = 0.0
|
||||
true_rew = 0.0
|
||||
ob = env.reset()
|
||||
|
||||
cur_ep_ret = 0
|
||||
cur_ep_len = 0
|
||||
cur_ep_true_ret = 0
|
||||
ep_true_rets = []
|
||||
ep_rets = []
|
||||
ep_lens = []
|
||||
|
||||
# Initialize history arrays
|
||||
obs = np.array([ob for _ in range(horizon)])
|
||||
true_rews = np.zeros(horizon, 'float32')
|
||||
rews = np.zeros(horizon, 'float32')
|
||||
vpreds = np.zeros(horizon, 'float32')
|
||||
news = np.zeros(horizon, 'int32')
|
||||
acs = np.array([ac for _ in range(horizon)])
|
||||
prevacs = acs.copy()
|
||||
|
||||
while True:
|
||||
prevac = ac
|
||||
ac, vpred = pi.act(stochastic, ob)
|
||||
# Slight weirdness here because we need value function at time T
|
||||
# before returning segment [0, T-1] so we get the correct
|
||||
# terminal value
|
||||
if t > 0 and t % horizon == 0:
|
||||
yield {"ob": obs, "rew": rews, "vpred": vpreds, "new": news,
|
||||
"ac": acs, "prevac": prevacs, "nextvpred": vpred * (1 - new),
|
||||
"ep_rets": ep_rets, "ep_lens": ep_lens, "ep_true_rets": ep_true_rets}
|
||||
_, vpred = pi.act(stochastic, ob)
|
||||
# Be careful!!! if you change the downstream algorithm to aggregate
|
||||
# several of these batches, then be sure to do a deepcopy
|
||||
ep_rets = []
|
||||
ep_true_rets = []
|
||||
ep_lens = []
|
||||
i = t % horizon
|
||||
obs[i] = ob
|
||||
vpreds[i] = vpred
|
||||
news[i] = new
|
||||
acs[i] = ac
|
||||
prevacs[i] = prevac
|
||||
|
||||
rew = reward_giver.get_reward(ob, ac)
|
||||
ob, true_rew, new, _ = env.step(ac)
|
||||
rews[i] = rew
|
||||
true_rews[i] = true_rew
|
||||
|
||||
cur_ep_ret += rew
|
||||
cur_ep_true_ret += true_rew
|
||||
cur_ep_len += 1
|
||||
if new:
|
||||
ep_rets.append(cur_ep_ret)
|
||||
ep_true_rets.append(cur_ep_true_ret)
|
||||
ep_lens.append(cur_ep_len)
|
||||
cur_ep_ret = 0
|
||||
cur_ep_true_ret = 0
|
||||
cur_ep_len = 0
|
||||
ob = env.reset()
|
||||
t += 1
|
||||
|
||||
|
||||
def add_vtarg_and_adv(seg, gamma, lam):
    new = np.append(seg["new"], 0)  # last element is only used for last vtarg, but we already zeroed it if last new = 1
    vpred = np.append(seg["vpred"], seg["nextvpred"])
    T = len(seg["rew"])
    seg["adv"] = gaelam = np.empty(T, 'float32')
    rew = seg["rew"]
    lastgaelam = 0
    for t in reversed(range(T)):
        nonterminal = 1-new[t+1]
        delta = rew[t] + gamma * vpred[t+1] * nonterminal - vpred[t]
        gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["tdlamret"] = seg["adv"] + seg["vpred"]
|
||||
|
||||
|
||||
def learn(env, policy_func, reward_giver, expert_dataset, rank,
|
||||
pretrained, pretrained_weight, *,
|
||||
g_step, d_step, entcoeff, save_per_iter,
|
||||
ckpt_dir, log_dir, timesteps_per_batch, task_name,
|
||||
gamma, lam,
|
||||
max_kl, cg_iters, cg_damping=1e-2,
|
||||
vf_stepsize=3e-4, d_stepsize=3e-4, vf_iters=3,
|
||||
max_timesteps=0, max_episodes=0, max_iters=0,
|
||||
callback=None
|
||||
):
|
||||
|
||||
nworkers = MPI.COMM_WORLD.Get_size()
|
||||
rank = MPI.COMM_WORLD.Get_rank()
|
||||
np.set_printoptions(precision=3)
|
||||
# Setup losses and stuff
|
||||
# ----------------------------------------
|
||||
ob_space = env.observation_space
|
||||
ac_space = env.action_space
|
||||
pi = policy_func("pi", ob_space, ac_space, reuse=(pretrained_weight is not None))
|
||||
oldpi = policy_func("oldpi", ob_space, ac_space)
|
||||
atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable)
|
||||
ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return
|
||||
|
||||
ob = U.get_placeholder_cached(name="ob")
|
||||
ac = pi.pdtype.sample_placeholder([None])
|
||||
|
||||
kloldnew = oldpi.pd.kl(pi.pd)
|
||||
ent = pi.pd.entropy()
|
||||
meankl = tf.reduce_mean(kloldnew)
|
||||
meanent = tf.reduce_mean(ent)
|
||||
entbonus = entcoeff * meanent
|
||||
|
||||
vferr = tf.reduce_mean(tf.square(pi.vpred - ret))
|
||||
|
||||
ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold
|
||||
surrgain = tf.reduce_mean(ratio * atarg)
|
||||
|
||||
optimgain = surrgain + entbonus
|
||||
losses = [optimgain, meankl, entbonus, surrgain, meanent]
|
||||
loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]
|
||||
|
||||
dist = meankl
|
||||
|
||||
all_var_list = pi.get_trainable_variables()
|
||||
var_list = [v for v in all_var_list if v.name.startswith("pi/pol") or v.name.startswith("pi/logstd")]
|
||||
vf_var_list = [v for v in all_var_list if v.name.startswith("pi/vff")]
|
||||
assert len(var_list) == len(vf_var_list) + 1
|
||||
d_adam = MpiAdam(reward_giver.get_trainable_variables())
|
||||
vfadam = MpiAdam(vf_var_list)
|
||||
|
||||
get_flat = U.GetFlat(var_list)
|
||||
set_from_flat = U.SetFromFlat(var_list)
|
||||
klgrads = tf.gradients(dist, var_list)
|
||||
flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
|
||||
shapes = [var.get_shape().as_list() for var in var_list]
|
||||
start = 0
|
||||
tangents = []
|
||||
for shape in shapes:
|
||||
sz = U.intprod(shape)
|
||||
tangents.append(tf.reshape(flat_tangent[start:start+sz], shape))
|
||||
start += sz
|
||||
gvp = tf.add_n([tf.reduce_sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) # pylint: disable=E1111
|
||||
fvp = U.flatgrad(gvp, var_list)
|
||||
|
||||
assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv)
|
||||
for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
|
||||
compute_losses = U.function([ob, ac, atarg], losses)
|
||||
compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)])
|
||||
compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
|
||||
compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list))
|
||||
|
||||
@contextmanager
|
||||
def timed(msg):
|
||||
if rank == 0:
|
||||
print(colorize(msg, color='magenta'))
|
||||
tstart = time.time()
|
||||
yield
|
||||
print(colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta'))
|
||||
else:
|
||||
yield
|
||||
|
||||
def allmean(x):
|
||||
assert isinstance(x, np.ndarray)
|
||||
out = np.empty_like(x)
|
||||
MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
|
||||
out /= nworkers
|
||||
return out
|
||||
|
||||
U.initialize()
|
||||
th_init = get_flat()
|
||||
MPI.COMM_WORLD.Bcast(th_init, root=0)
|
||||
set_from_flat(th_init)
|
||||
d_adam.sync()
|
||||
vfadam.sync()
|
||||
if rank == 0:
|
||||
print("Init param sum", th_init.sum(), flush=True)
|
||||
|
||||
# Prepare for rollouts
|
||||
# ----------------------------------------
|
||||
seg_gen = traj_segment_generator(pi, env, reward_giver, timesteps_per_batch, stochastic=True)
|
||||
|
||||
episodes_so_far = 0
|
||||
timesteps_so_far = 0
|
||||
iters_so_far = 0
|
||||
tstart = time.time()
|
||||
lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths
|
||||
rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards
|
||||
true_rewbuffer = deque(maxlen=40)
|
||||
|
||||
assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1
|
||||
|
||||
g_loss_stats = stats(loss_names)
|
||||
d_loss_stats = stats(reward_giver.loss_name)
|
||||
ep_stats = stats(["True_rewards", "Rewards", "Episode_length"])
|
||||
# if provide pretrained weight
|
||||
if pretrained_weight is not None:
|
||||
U.load_state(pretrained_weight, var_list=pi.get_variables())
|
||||
|
||||
while True:
|
||||
if callback: callback(locals(), globals())
|
||||
if max_timesteps and timesteps_so_far >= max_timesteps:
|
||||
break
|
||||
elif max_episodes and episodes_so_far >= max_episodes:
|
||||
break
|
||||
elif max_iters and iters_so_far >= max_iters:
|
||||
break
|
||||
|
||||
# Save model
|
||||
if rank == 0 and iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
|
||||
fname = os.path.join(ckpt_dir, task_name)
|
||||
os.makedirs(os.path.dirname(fname), exist_ok=True)
|
||||
saver = tf.train.Saver()
|
||||
saver.save(tf.get_default_session(), fname)
|
||||
|
||||
logger.log("********** Iteration %i ************" % iters_so_far)
|
||||
|
||||
def fisher_vector_product(p):
|
||||
return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p
|
||||
# ------------------ Update G ------------------
|
||||
logger.log("Optimizing Policy...")
|
||||
for _ in range(g_step):
|
||||
with timed("sampling"):
|
||||
seg = seg_gen.__next__()
|
||||
add_vtarg_and_adv(seg, gamma, lam)
|
||||
# ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
|
||||
ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
|
||||
vpredbefore = seg["vpred"]  # predicted value function before update
|
||||
atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate
|
||||
|
||||
if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy
|
||||
|
||||
args = seg["ob"], seg["ac"], atarg
|
||||
fvpargs = [arr[::5] for arr in args]
|
||||
|
||||
assign_old_eq_new() # set old parameter values to new parameter values
|
||||
with timed("computegrad"):
|
||||
*lossbefore, g = compute_lossandgrad(*args)
|
||||
lossbefore = allmean(np.array(lossbefore))
|
||||
g = allmean(g)
|
||||
if np.allclose(g, 0):
|
||||
logger.log("Got zero gradient. not updating")
|
||||
else:
|
||||
with timed("cg"):
|
||||
stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0)
|
||||
assert np.isfinite(stepdir).all()
|
||||
shs = .5*stepdir.dot(fisher_vector_product(stepdir))
|
||||
lm = np.sqrt(shs / max_kl)
|
||||
# logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
|
||||
fullstep = stepdir / lm
|
||||
expectedimprove = g.dot(fullstep)
|
||||
surrbefore = lossbefore[0]
|
||||
stepsize = 1.0
|
||||
thbefore = get_flat()
|
||||
for _ in range(10):
|
||||
thnew = thbefore + fullstep * stepsize
|
||||
set_from_flat(thnew)
|
||||
meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args)))
|
||||
improve = surr - surrbefore
|
||||
logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve))
|
||||
if not np.isfinite(meanlosses).all():
|
||||
logger.log("Got non-finite value of losses -- bad!")
|
||||
elif kl > max_kl * 1.5:
|
||||
logger.log("violated KL constraint. shrinking step.")
|
||||
elif improve < 0:
|
||||
logger.log("surrogate didn't improve. shrinking step.")
|
||||
else:
|
||||
logger.log("Stepsize OK!")
|
||||
break
|
||||
stepsize *= .5
|
||||
else:
|
||||
logger.log("couldn't compute a good step")
|
||||
set_from_flat(thbefore)
|
||||
if nworkers > 1 and iters_so_far % 20 == 0:
|
||||
paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum())) # list of tuples
|
||||
assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:])
|
||||
with timed("vf"):
|
||||
for _ in range(vf_iters):
|
||||
for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]),
|
||||
include_final_partial_batch=False, batch_size=128):
|
||||
if hasattr(pi, "ob_rms"):
|
||||
pi.ob_rms.update(mbob) # update running mean/std for policy
|
||||
g = allmean(compute_vflossandgrad(mbob, mbret))
|
||||
vfadam.update(g, vf_stepsize)
|
||||
|
||||
g_losses = meanlosses
|
||||
for (lossname, lossval) in zip(loss_names, meanlosses):
|
||||
logger.record_tabular(lossname, lossval)
|
||||
logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))
|
||||
# ------------------ Update D ------------------
|
||||
logger.log("Optimizing Discriminator...")
|
||||
logger.log(fmt_row(13, reward_giver.loss_name))
|
||||
ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob))
|
||||
batch_size = len(ob) // d_step
|
||||
d_losses = [] # list of tuples, each of which gives the loss for a minibatch
|
||||
for ob_batch, ac_batch in dataset.iterbatches((ob, ac),
|
||||
include_final_partial_batch=False,
|
||||
batch_size=batch_size):
|
||||
ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch))
|
||||
# update running mean/std for reward_giver
|
||||
if hasattr(reward_giver, "obs_rms"): reward_giver.obs_rms.update(np.concatenate((ob_batch, ob_expert), 0))
|
||||
*newlosses, g = reward_giver.lossandgrad(ob_batch, ac_batch, ob_expert, ac_expert)
|
||||
d_adam.update(allmean(g), d_stepsize)
|
||||
d_losses.append(newlosses)
|
||||
logger.log(fmt_row(13, np.mean(d_losses, axis=0)))
|
||||
|
||||
lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"]) # local values
|
||||
listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples
|
||||
lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs))
|
||||
true_rewbuffer.extend(true_rets)
|
||||
lenbuffer.extend(lens)
|
||||
rewbuffer.extend(rews)
|
||||
|
||||
logger.record_tabular("EpLenMean", np.mean(lenbuffer))
|
||||
logger.record_tabular("EpRewMean", np.mean(rewbuffer))
|
||||
logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer))
|
||||
logger.record_tabular("EpThisIter", len(lens))
|
||||
episodes_so_far += len(lens)
|
||||
timesteps_so_far += sum(lens)
|
||||
iters_so_far += 1
|
||||
|
||||
logger.record_tabular("EpisodesSoFar", episodes_so_far)
|
||||
logger.record_tabular("TimestepsSoFar", timesteps_so_far)
|
||||
logger.record_tabular("TimeElapsed", time.time() - tstart)
|
||||
|
||||
if rank == 0:
|
||||
logger.dump_tabular()
|
||||
|
||||
|
||||
def flatten_lists(listoflists):
|
||||
return [el for list_ in listoflists for el in list_]
|
@@ -1,35 +0,0 @@
# Hindsight Experience Replay
For details on Hindsight Experience Replay (HER), please read the [paper](https://arxiv.org/pdf/1707.01495.pdf).

## How to use Hindsight Experience Replay

### Getting started
Training an agent is very simple:
```bash
python -m baselines.her.experiment.train
```
This will train a DDPG+HER agent on the `FetchReach` environment.
You should see the success rate go up quickly to `1.0`, which means that the agent achieves the
desired goal in 100% of the cases.
The training script logs other diagnostics as well and pickles the best policy so far (w.r.t. its test success rate),
the latest policy, and, if enabled, a history of policies every K epochs.

To inspect what the agent has learned, use the play script:
```bash
python -m baselines.her.experiment.play /path/to/an/experiment/policy_best.pkl
```
You can try it right now with the results of the training step (the script prints out the path for you).
This should visualize the current policy for 10 episodes and will also print statistics.


### Advanced usage
The train script comes with advanced features like MPI support, which allows training to scale across all cores of a single machine.
To see all available options, simply run this command:
```bash
python -m baselines.her.experiment.train --help
```
To run on, say, 20 CPU cores, you can use the following command:
```bash
python -m baselines.her.experiment.train --num_cpu 20
```
That's it: you are now running rollouts using 20 MPI workers and averaging gradients for network updates across all 20 cores.
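The pickled policies produced by the training script above can also be loaded directly in Python. A minimal sketch, assuming a goal-based Fetch environment and the `DDPG.get_actions` interface shown further below; the environment id is illustrative and the pickle path is the placeholder used above:

```python
# Illustration only: load a pickled DDPG+HER policy and roll it out once.
import pickle
import gym

with open('/path/to/an/experiment/policy_best.pkl', 'rb') as f:
    policy = pickle.load(f)

env = gym.make('FetchReach-v1')  # example environment id
obs = env.reset()
for _ in range(50):
    # Deterministic action from the trained actor (see DDPG.get_actions below).
    action = policy.get_actions(obs['observation'], obs['achieved_goal'],
                                obs['desired_goal'])
    obs, reward, done, info = env.step(action)
    env.render()
```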
@@ -1,44 +0,0 @@
import tensorflow as tf
from baselines.her.util import store_args, nn


class ActorCritic:
    @store_args
    def __init__(self, inputs_tf, dimo, dimg, dimu, max_u, o_stats, g_stats, hidden, layers,
                 **kwargs):
        """The actor-critic network and related training code.

        Args:
            inputs_tf (dict of tensors): all necessary inputs for the network: the
                observation (o), the goal (g), and the action (u)
            dimo (int): the dimension of the observations
            dimg (int): the dimension of the goals
            dimu (int): the dimension of the actions
            max_u (float): the maximum magnitude of actions; action outputs will be scaled
                accordingly
            o_stats (baselines.her.Normalizer): normalizer for observations
            g_stats (baselines.her.Normalizer): normalizer for goals
            hidden (int): number of hidden units that should be used in hidden layers
            layers (int): number of hidden layers
        """
        self.o_tf = inputs_tf['o']
        self.g_tf = inputs_tf['g']
        self.u_tf = inputs_tf['u']

        # Prepare inputs for actor and critic.
        o = self.o_stats.normalize(self.o_tf)
        g = self.g_stats.normalize(self.g_tf)
        input_pi = tf.concat(axis=1, values=[o, g])  # for actor

        # Networks.
        with tf.variable_scope('pi'):
            self.pi_tf = self.max_u * tf.tanh(nn(
                input_pi, [self.hidden] * self.layers + [self.dimu]))
        with tf.variable_scope('Q'):
            # for policy training
            input_Q = tf.concat(axis=1, values=[o, g, self.pi_tf / self.max_u])
            self.Q_pi_tf = nn(input_Q, [self.hidden] * self.layers + [1])
            # for critic training
            input_Q = tf.concat(axis=1, values=[o, g, self.u_tf / self.max_u])
            self._input_Q = input_Q  # exposed for tests
            self.Q_tf = nn(input_Q, [self.hidden] * self.layers + [1], reuse=True)
@@ -1,340 +0,0 @@
|
||||
from collections import OrderedDict
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from tensorflow.contrib.staging import StagingArea
|
||||
|
||||
from baselines import logger
|
||||
from baselines.her.util import (
|
||||
import_function, store_args, flatten_grads, transitions_in_episode_batch)
|
||||
from baselines.her.normalizer import Normalizer
|
||||
from baselines.her.replay_buffer import ReplayBuffer
|
||||
from baselines.common.mpi_adam import MpiAdam
|
||||
|
||||
|
||||
def dims_to_shapes(input_dims):
|
||||
return {key: tuple([val]) if val > 0 else tuple() for key, val in input_dims.items()}
|
||||
|
||||
|
||||
class DDPG(object):
|
||||
@store_args
|
||||
def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polyak, batch_size,
|
||||
Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, T,
|
||||
rollout_batch_size, subtract_goals, relative_goals, clip_pos_returns, clip_return,
|
||||
sample_transitions, gamma, reuse=False, **kwargs):
|
||||
"""Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).
|
||||
|
||||
Args:
|
||||
input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the
|
||||
actions (u)
|
||||
buffer_size (int): number of transitions that are stored in the replay buffer
|
||||
hidden (int): number of units in the hidden layers
|
||||
layers (int): number of hidden layers
|
||||
network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic')
|
||||
polyak (float): coefficient for Polyak-averaging of the target network
|
||||
batch_size (int): batch size for training
|
||||
Q_lr (float): learning rate for the Q (critic) network
|
||||
pi_lr (float): learning rate for the pi (actor) network
|
||||
norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
|
||||
norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
|
||||
max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
|
||||
action_l2 (float): coefficient for L2 penalty on the actions
|
||||
clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
|
||||
scope (str): the scope used for the TensorFlow graph
|
||||
T (int): the time horizon for rollouts
|
||||
rollout_batch_size (int): number of parallel rollouts per DDPG agent
|
||||
subtract_goals (function): function that subtracts goals from each other
|
||||
relative_goals (boolean): whether or not relative goals should be fed into the network
|
||||
clip_pos_returns (boolean): whether or not positive returns should be clipped
|
||||
clip_return (float): clip returns to be in [-clip_return, clip_return]
|
||||
sample_transitions (function): function that samples from the replay buffer
|
||||
gamma (float): gamma used for Q learning updates
|
||||
reuse (boolean): whether or not the networks should be reused
|
||||
"""
|
||||
if self.clip_return is None:
|
||||
self.clip_return = np.inf
|
||||
|
||||
self.create_actor_critic = import_function(self.network_class)
|
||||
|
||||
input_shapes = dims_to_shapes(self.input_dims)
|
||||
self.dimo = self.input_dims['o']
|
||||
self.dimg = self.input_dims['g']
|
||||
self.dimu = self.input_dims['u']
|
||||
|
||||
# Prepare staging area for feeding data to the model.
|
||||
stage_shapes = OrderedDict()
|
||||
for key in sorted(self.input_dims.keys()):
|
||||
if key.startswith('info_'):
|
||||
continue
|
||||
stage_shapes[key] = (None, *input_shapes[key])
|
||||
for key in ['o', 'g']:
|
||||
stage_shapes[key + '_2'] = stage_shapes[key]
|
||||
stage_shapes['r'] = (None,)
|
||||
self.stage_shapes = stage_shapes
|
||||
|
||||
# Create network.
|
||||
with tf.variable_scope(self.scope):
|
||||
self.staging_tf = StagingArea(
|
||||
dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
|
||||
shapes=list(self.stage_shapes.values()))
|
||||
self.buffer_ph_tf = [
|
||||
tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values()]
|
||||
self.stage_op = self.staging_tf.put(self.buffer_ph_tf)
|
||||
|
||||
self._create_network(reuse=reuse)
|
||||
|
||||
# Configure the replay buffer.
|
||||
buffer_shapes = {key: (self.T if key != 'o' else self.T+1, *input_shapes[key])
|
||||
for key, val in input_shapes.items()}
|
||||
buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
|
||||
buffer_shapes['ag'] = (self.T+1, self.dimg)
|
||||
|
||||
buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size
|
||||
self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions)
|
||||
|
||||
def _random_action(self, n):
|
||||
return np.random.uniform(low=-self.max_u, high=self.max_u, size=(n, self.dimu))
|
||||
|
||||
def _preprocess_og(self, o, ag, g):
|
||||
if self.relative_goals:
|
||||
g_shape = g.shape
|
||||
g = g.reshape(-1, self.dimg)
|
||||
ag = ag.reshape(-1, self.dimg)
|
||||
g = self.subtract_goals(g, ag)
|
||||
g = g.reshape(*g_shape)
|
||||
o = np.clip(o, -self.clip_obs, self.clip_obs)
|
||||
g = np.clip(g, -self.clip_obs, self.clip_obs)
|
||||
return o, g
|
||||
|
||||
def get_actions(self, o, ag, g, noise_eps=0., random_eps=0., use_target_net=False,
|
||||
compute_Q=False):
|
||||
o, g = self._preprocess_og(o, ag, g)
|
||||
policy = self.target if use_target_net else self.main
|
||||
# values to compute
|
||||
vals = [policy.pi_tf]
|
||||
if compute_Q:
|
||||
vals += [policy.Q_pi_tf]
|
||||
# feed
|
||||
feed = {
|
||||
policy.o_tf: o.reshape(-1, self.dimo),
|
||||
policy.g_tf: g.reshape(-1, self.dimg),
|
||||
policy.u_tf: np.zeros((o.size // self.dimo, self.dimu), dtype=np.float32)
|
||||
}
|
||||
|
||||
ret = self.sess.run(vals, feed_dict=feed)
|
||||
# action postprocessing
|
||||
u = ret[0]
|
||||
noise = noise_eps * self.max_u * np.random.randn(*u.shape) # gaussian noise
|
||||
u += noise
|
||||
u = np.clip(u, -self.max_u, self.max_u)
|
||||
u += np.random.binomial(1, random_eps, u.shape[0]).reshape(-1, 1) * (self._random_action(u.shape[0]) - u) # eps-greedy
|
||||
if u.shape[0] == 1:
|
||||
u = u[0]
|
||||
u = u.copy()
|
||||
ret[0] = u
|
||||
|
||||
if len(ret) == 1:
|
||||
return ret[0]
|
||||
else:
|
||||
return ret
|
||||
|
||||
def store_episode(self, episode_batch, update_stats=True):
|
||||
"""
|
||||
episode_batch: array of batch_size x (T or T+1) x dim_key
|
||||
'o' is of size T+1, others are of size T
|
||||
"""
|
||||
|
||||
self.buffer.store_episode(episode_batch)
|
||||
|
||||
if update_stats:
|
||||
# add transitions to normalizer
|
||||
episode_batch['o_2'] = episode_batch['o'][:, 1:, :]
|
||||
episode_batch['ag_2'] = episode_batch['ag'][:, 1:, :]
|
||||
num_normalizing_transitions = transitions_in_episode_batch(episode_batch)
|
||||
transitions = self.sample_transitions(episode_batch, num_normalizing_transitions)
|
||||
|
||||
o, o_2, g, ag = transitions['o'], transitions['o_2'], transitions['g'], transitions['ag']
|
||||
transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g)
|
||||
# No need to preprocess the o_2 and g_2 since this is only used for stats
|
||||
|
||||
self.o_stats.update(transitions['o'])
|
||||
self.g_stats.update(transitions['g'])
|
||||
|
||||
self.o_stats.recompute_stats()
|
||||
self.g_stats.recompute_stats()
|
||||
|
||||
def get_current_buffer_size(self):
|
||||
return self.buffer.get_current_size()
|
||||
|
||||
def _sync_optimizers(self):
|
||||
self.Q_adam.sync()
|
||||
self.pi_adam.sync()
|
||||
|
||||
def _grads(self):
|
||||
# Avoid feed_dict here for performance!
|
||||
critic_loss, actor_loss, Q_grad, pi_grad = self.sess.run([
|
||||
self.Q_loss_tf,
|
||||
self.main.Q_pi_tf,
|
||||
self.Q_grad_tf,
|
||||
self.pi_grad_tf
|
||||
])
|
||||
return critic_loss, actor_loss, Q_grad, pi_grad
|
||||
|
||||
def _update(self, Q_grad, pi_grad):
|
||||
self.Q_adam.update(Q_grad, self.Q_lr)
|
||||
self.pi_adam.update(pi_grad, self.pi_lr)
|
||||
|
||||
def sample_batch(self):
|
||||
transitions = self.buffer.sample(self.batch_size)
|
||||
o, o_2, g = transitions['o'], transitions['o_2'], transitions['g']
|
||||
ag, ag_2 = transitions['ag'], transitions['ag_2']
|
||||
transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g)
|
||||
transitions['o_2'], transitions['g_2'] = self._preprocess_og(o_2, ag_2, g)
|
||||
|
||||
transitions_batch = [transitions[key] for key in self.stage_shapes.keys()]
|
||||
return transitions_batch
|
||||
|
||||
def stage_batch(self, batch=None):
|
||||
if batch is None:
|
||||
batch = self.sample_batch()
|
||||
assert len(self.buffer_ph_tf) == len(batch)
|
||||
self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch)))
|
||||
|
||||
def train(self, stage=True):
|
||||
if stage:
|
||||
self.stage_batch()
|
||||
critic_loss, actor_loss, Q_grad, pi_grad = self._grads()
|
||||
self._update(Q_grad, pi_grad)
|
||||
return critic_loss, actor_loss
|
||||
|
||||
def _init_target_net(self):
|
||||
self.sess.run(self.init_target_net_op)
|
||||
|
||||
def update_target_net(self):
|
||||
self.sess.run(self.update_target_net_op)
|
||||
|
||||
def clear_buffer(self):
|
||||
self.buffer.clear_buffer()
|
||||
|
||||
def _vars(self, scope):
|
||||
res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope + '/' + scope)
|
||||
assert len(res) > 0
|
||||
return res
|
||||
|
||||
def _global_vars(self, scope):
|
||||
res = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope + '/' + scope)
|
||||
return res
|
||||
|
||||
def _create_network(self, reuse=False):
|
||||
logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u))
|
||||
|
||||
self.sess = tf.get_default_session()
|
||||
if self.sess is None:
|
||||
self.sess = tf.InteractiveSession()
|
||||
|
||||
# running averages
|
||||
with tf.variable_scope('o_stats') as vs:
|
||||
if reuse:
|
||||
vs.reuse_variables()
|
||||
self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess)
|
||||
with tf.variable_scope('g_stats') as vs:
|
||||
if reuse:
|
||||
vs.reuse_variables()
|
||||
self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess)
|
||||
|
||||
# mini-batch sampling.
|
||||
batch = self.staging_tf.get()
|
||||
batch_tf = OrderedDict([(key, batch[i])
|
||||
for i, key in enumerate(self.stage_shapes.keys())])
|
||||
batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])
|
||||
|
||||
# networks
|
||||
with tf.variable_scope('main') as vs:
|
||||
if reuse:
|
||||
vs.reuse_variables()
|
||||
self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__)
|
||||
vs.reuse_variables()
|
||||
with tf.variable_scope('target') as vs:
|
||||
if reuse:
|
||||
vs.reuse_variables()
|
||||
target_batch_tf = batch_tf.copy()
|
||||
target_batch_tf['o'] = batch_tf['o_2']
|
||||
target_batch_tf['g'] = batch_tf['g_2']
|
||||
self.target = self.create_actor_critic(
|
||||
target_batch_tf, net_type='target', **self.__dict__)
|
||||
vs.reuse_variables()
|
||||
assert len(self._vars("main")) == len(self._vars("target"))
|
||||
|
||||
# loss functions
|
||||
target_Q_pi_tf = self.target.Q_pi_tf
|
||||
clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf)
|
||||
target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range)
|
||||
self.Q_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf))
|
||||
self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf)
|
||||
self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u))
|
||||
Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q'))
|
||||
pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi'))
|
||||
assert len(self._vars('main/Q')) == len(Q_grads_tf)
|
||||
assert len(self._vars('main/pi')) == len(pi_grads_tf)
|
||||
self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q'))
|
||||
self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi'))
|
||||
self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q'))
|
||||
self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi'))
|
||||
|
||||
# optimizers
|
||||
self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False)
|
||||
self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False)
|
||||
|
||||
# polyak averaging
|
||||
self.main_vars = self._vars('main/Q') + self._vars('main/pi')
|
||||
self.target_vars = self._vars('target/Q') + self._vars('target/pi')
|
||||
self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats')
|
||||
self.init_target_net_op = list(
|
||||
map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars)))
|
||||
self.update_target_net_op = list(
|
||||
map(lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars)))
|
||||
|
||||
# initialize all variables
|
||||
tf.variables_initializer(self._global_vars('')).run()
|
||||
self._sync_optimizers()
|
||||
self._init_target_net()
|
||||
|
||||
def logs(self, prefix=''):
|
||||
logs = []
|
||||
logs += [('stats_o/mean', np.mean(self.sess.run([self.o_stats.mean])))]
|
||||
logs += [('stats_o/std', np.mean(self.sess.run([self.o_stats.std])))]
|
||||
logs += [('stats_g/mean', np.mean(self.sess.run([self.g_stats.mean])))]
|
||||
logs += [('stats_g/std', np.mean(self.sess.run([self.g_stats.std])))]
|
||||
|
||||
if prefix != '' and not prefix.endswith('/'):
|
||||
return [(prefix + '/' + key, val) for key, val in logs]
|
||||
else:
|
||||
return logs
|
||||
|
||||
def __getstate__(self):
|
||||
"""Our policies can be loaded from pkl, but after unpickling you cannot continue training.
|
||||
"""
|
||||
excluded_subnames = ['_tf', '_op', '_vars', '_adam', 'buffer', 'sess', '_stats',
|
||||
'main', 'target', 'lock', 'env', 'sample_transitions',
|
||||
'stage_shapes', 'create_actor_critic']
|
||||
|
||||
state = {k: v for k, v in self.__dict__.items() if all(subname not in k for subname in excluded_subnames)}
|
||||
state['buffer_size'] = self.buffer_size
|
||||
state['tf'] = self.sess.run([x for x in self._global_vars('') if 'buffer' not in x.name])
|
||||
return state
|
||||
|
||||
def __setstate__(self, state):
|
||||
if 'sample_transitions' not in state:
|
||||
# We don't need this for playing the policy.
|
||||
state['sample_transitions'] = None
|
||||
|
||||
self.__init__(**state)
|
||||
# set up stats (they are overwritten in __init__)
|
||||
for k, v in state.items():
|
||||
if k[-6:] == '_stats':
|
||||
self.__dict__[k] = v
|
||||
# load TF variables
|
||||
vars = [x for x in self._global_vars('') if 'buffer' not in x.name]
|
||||
assert(len(vars) == len(state["tf"]))
|
||||
node = [tf.assign(var, val) for var, val in zip(vars, state["tf"])]
|
||||
self.sess.run(node)
|