Add ACER, PPO2, and results_plotter.py
@@ -15,16 +15,18 @@ pip install -e .
 ```
 
 - [A2C](baselines/a2c)
+- [ACER](baselines/acer)
 - [ACKTR](baselines/acktr)
 - [DDPG](baselines/ddpg)
 - [DQN](baselines/deepq)
-- [PPO](baselines/ppo1)
+- [PPO1](baselines/ppo1) (Multi-CPU using MPI)
+- [PPO2](baselines/ppo2) (Optimized for GPU)
 - [TRPO](baselines/trpo_mpi)
 
 To cite this repository in publications:
 
     @misc{baselines,
-      author = {Hesse, Christopher and Plappert, Matthias and Radford, Alec and Schulman, John and Sidor, Szymon and Wu, Yuhuai},
+      author = {Dhariwal, Prafulla and Hesse, Christopher and Plappert, Matthias and Radford, Alec and Schulman, John and Sidor, Szymon and Wu, Yuhuai},
       title = {OpenAI Baselines},
       year = {2017},
       publisher = {GitHub},
@@ -238,7 +238,7 @@ def check_shape(ts,shapes):
 def avg_norm(t):
     return tf.reduce_mean(tf.sqrt(tf.reduce_sum(tf.square(t), axis=-1)))
 
-def myadd(g1, g2, param):
+def gradient_add(g1, g2, param):
     print([g1, g2, param.name])
     assert (not (g1 is None and g2 is None)), param.name
     if g1 is None:
@@ -248,7 +248,7 @@ def myadd(g1, g2, param):
     else:
         return g1 + g2
 
-def my_explained_variance(qpred, q):
+def q_explained_variance(qpred, q):
     _, vary = tf.nn.moments(q, axes=[0, 1])
     _, varpred = tf.nn.moments(q - qpred, axes=[0, 1])
     check_shape([vary, varpred], [[]] * 2)
baselines/acer/README.md (new file, 4 lines)
@@ -0,0 +1,4 @@
# ACER

- Original paper: https://arxiv.org/abs/1611.01224
- `python -m baselines.acer.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options.
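A minimal sketch of driving the ACER learner directly from Python, mirroring what `baselines/acer/run_atari.py` (added in this commit) does; the env id, seed, and worker count below are illustrative, not required:

```python
from baselines.acer.acer_simple import learn
from baselines.acer.policies import AcerCnnPolicy
from baselines.common.atari_wrappers import make_atari, wrap_deepmind
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

def make_env(rank, env_id='BreakoutNoFrameskip-v4', seed=0):
    def _thunk():
        env = make_atari(env_id)        # NoFrameskip Atari env
        env.seed(seed + rank)
        return wrap_deepmind(env)       # standard DeepMind-style wrappers
    return _thunk

if __name__ == '__main__':
    env = SubprocVecEnv([make_env(i) for i in range(16)])  # 16 parallel workers, as in run_atari.py
    learn(AcerCnnPolicy, env, seed=0, total_timesteps=int(10e6))
    env.close()
```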
baselines/acer/__init__.py (new file, 0 lines)

baselines/acer/acer_simple.py (new file, 349 lines)
@@ -0,0 +1,349 @@
|
||||
import time
|
||||
import joblib
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from baselines import logger
|
||||
|
||||
from baselines.common import set_global_seeds
|
||||
|
||||
from baselines.a2c.utils import batch_to_seq, seq_to_batch
|
||||
from baselines.a2c.utils import Scheduler, make_path, find_trainable_variables
|
||||
from baselines.a2c.utils import cat_entropy_softmax
|
||||
from baselines.a2c.utils import EpisodeStats
|
||||
from baselines.a2c.utils import get_by_index, check_shape, avg_norm, gradient_add, q_explained_variance
|
||||
from baselines.acer.buffer import Buffer
|
||||
|
||||
# remove last step
|
||||
def strip(var, nenvs, nsteps, flat = False):
|
||||
vars = batch_to_seq(var, nenvs, nsteps + 1, flat)
|
||||
return seq_to_batch(vars[:-1], flat)
|
||||
|
||||
def q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma):
|
||||
"""
|
||||
Calculates q_retrace targets
|
||||
|
||||
:param R: Rewards
|
||||
:param D: Dones
|
||||
:param q_i: Q values for actions taken
|
||||
:param v: V values
|
||||
:param rho_i: Importance weight for each action
|
||||
:return: Q_retrace values
|
||||
"""
|
||||
rho_bar = batch_to_seq(tf.minimum(1.0, rho_i), nenvs, nsteps, True) # list of len steps, shape [nenvs]
|
||||
rs = batch_to_seq(R, nenvs, nsteps, True) # list of len steps, shape [nenvs]
|
||||
ds = batch_to_seq(D, nenvs, nsteps, True) # list of len steps, shape [nenvs]
|
||||
q_is = batch_to_seq(q_i, nenvs, nsteps, True)
|
||||
vs = batch_to_seq(v, nenvs, nsteps + 1, True)
|
||||
v_final = vs[-1]
|
||||
qret = v_final
|
||||
qrets = []
|
||||
for i in range(nsteps - 1, -1, -1):
|
||||
check_shape([qret, ds[i], rs[i], rho_bar[i], q_is[i], vs[i]], [[nenvs]] * 6)
|
||||
qret = rs[i] + gamma * qret * (1.0 - ds[i])
|
||||
qrets.append(qret)
|
||||
qret = (rho_bar[i] * (qret - q_is[i])) + vs[i]
|
||||
qrets = qrets[::-1]
|
||||
qret = seq_to_batch(qrets, flat=True)
|
||||
return qret
|
||||
|
||||
# For ACER with PPO clipping instead of trust region
|
||||
# def clip(ratio, eps_clip):
|
||||
# # assume 0 <= eps_clip <= 1
|
||||
# return tf.minimum(1 + eps_clip, tf.maximum(1 - eps_clip, ratio))
|
||||
|
||||
class Model(object):
|
||||
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs,
|
||||
ent_coef, q_coef, gamma, max_grad_norm, lr,
|
||||
rprop_alpha, rprop_epsilon, total_timesteps, lrschedule,
|
||||
c, trust_region, alpha, delta):
|
||||
config = tf.ConfigProto(allow_soft_placement=True,
|
||||
intra_op_parallelism_threads=num_procs,
|
||||
inter_op_parallelism_threads=num_procs)
|
||||
sess = tf.Session(config=config)
|
||||
nact = ac_space.n
|
||||
nbatch = nenvs * nsteps
|
||||
|
||||
A = tf.placeholder(tf.int32, [nbatch]) # actions
|
||||
D = tf.placeholder(tf.float32, [nbatch]) # dones
|
||||
R = tf.placeholder(tf.float32, [nbatch]) # rewards, not returns
|
||||
MU = tf.placeholder(tf.float32, [nbatch, nact]) # mu's
|
||||
LR = tf.placeholder(tf.float32, [])
|
||||
eps = 1e-6
|
||||
|
||||
step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
|
||||
train_model = policy(sess, ob_space, ac_space, nenvs, nsteps + 1, nstack, reuse=True)
|
||||
|
||||
params = find_trainable_variables("model")
|
||||
print("Params {}".format(len(params)))
|
||||
for var in params:
|
||||
print(var)
|
||||
|
||||
# create polyak averaged model
|
||||
ema = tf.train.ExponentialMovingAverage(alpha)
|
||||
ema_apply_op = ema.apply(params)
|
||||
|
||||
def custom_getter(getter, *args, **kwargs):
|
||||
v = ema.average(getter(*args, **kwargs))
|
||||
print(v.name)
|
||||
return v
|
||||
|
||||
with tf.variable_scope("", custom_getter=custom_getter, reuse=True):
|
||||
polyak_model = policy(sess, ob_space, ac_space, nenvs, nsteps + 1, nstack, reuse=True)
|
||||
|
||||
# Notation: (var) = batch variable, (var)s = sequence variable, (var)_i = variable indexed by the action taken at step i
|
||||
v = tf.reduce_sum(train_model.pi * train_model.q, axis = -1) # shape is [nenvs * (nsteps + 1)]
|
||||
|
||||
# strip off last step
|
||||
f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [train_model.pi, polyak_model.pi, train_model.q])
|
||||
# Get pi and q values for actions taken
|
||||
f_i = get_by_index(f, A)
|
||||
q_i = get_by_index(q, A)
|
||||
|
||||
# Compute ratios for importance truncation
|
||||
rho = f / (MU + eps)
|
||||
rho_i = get_by_index(rho, A)
|
||||
|
||||
# Calculate Q_retrace targets
|
||||
qret = q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma)
|
||||
|
||||
# Calculate losses
|
||||
# Entropy
|
||||
entropy = tf.reduce_mean(cat_entropy_softmax(f))
|
||||
|
||||
# Policy gradient loss, with truncated importance sampling & bias correction
|
||||
v = strip(v, nenvs, nsteps, True)
|
||||
check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4)
|
||||
check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 3)
|
||||
|
||||
# Truncated importance sampling
|
||||
adv = qret - v
|
||||
logf = tf.log(f_i + eps)
|
||||
gain_f = logf * tf.stop_gradient(adv * tf.minimum(c, rho_i)) # [nenvs * nsteps]
|
||||
loss_f = -tf.reduce_mean(gain_f)
|
||||
|
||||
# Bias correction for the truncation
|
||||
adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1])) # [nenvs * nsteps, nact]
|
||||
logf_bc = tf.log(f + eps) # / (f_old + eps)
|
||||
check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]]*2)
|
||||
gain_bc = tf.reduce_sum(logf_bc * tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f), axis = 1) #IMP: This is sum, as expectation wrt f
|
||||
loss_bc= -tf.reduce_mean(gain_bc)
|
||||
|
||||
loss_policy = loss_f + loss_bc
|
||||
|
||||
# Value/Q function loss, and explained variance
|
||||
check_shape([qret, q_i], [[nenvs * nsteps]]*2)
|
||||
ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]), tf.reshape(qret, [nenvs, nsteps]))
|
||||
loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i)*0.5)
|
||||
|
||||
# Net loss
|
||||
check_shape([loss_policy, loss_q, entropy], [[]] * 3)
|
||||
loss = loss_policy + q_coef * loss_q - ent_coef * entropy
|
||||
|
||||
if trust_region:
|
||||
g = tf.gradients(- (loss_policy - ent_coef * entropy) * nsteps * nenvs, f) #[nenvs * nsteps, nact]
|
||||
# k = tf.gradients(KL(f_pol || f), f)
|
||||
k = - f_pol / (f + eps) #[nenvs * nsteps, nact] # Directly computed gradient of KL divergence wrt f
|
||||
k_dot_g = tf.reduce_sum(k * g, axis=-1)
|
||||
adj = tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) / (tf.reduce_sum(tf.square(k), axis=-1) + eps)) #[nenvs * nsteps]
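# The adjustment above implements the closed-form trust-region step from the ACER paper:
#   g <- g - max(0, (k.g - delta) / ||k||^2) * k
# which caps the (linearised) KL divergence to the polyak-averaged policy at delta.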
|
||||
|
||||
# Calculate stats (before doing adjustment) for logging.
|
||||
avg_norm_k = avg_norm(k)
|
||||
avg_norm_g = avg_norm(g)
|
||||
avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g))
|
||||
avg_norm_adj = tf.reduce_mean(tf.abs(adj))
|
||||
|
||||
g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k
|
||||
grads_f = -g/(nenvs*nsteps) # These are the trust-region-adjusted gradients wrt f, i.e. the statistics of policy pi
|
||||
grads_policy = tf.gradients(f, params, grads_f)
|
||||
grads_q = tf.gradients(loss_q * q_coef, params)
|
||||
grads = [gradient_add(g1, g2, param) for (g1, g2, param) in zip(grads_policy, grads_q, params)]
|
||||
|
||||
avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs)
|
||||
norm_grads_q = tf.global_norm(grads_q)
|
||||
norm_grads_policy = tf.global_norm(grads_policy)
|
||||
else:
|
||||
grads = tf.gradients(loss, params)
|
||||
|
||||
if max_grad_norm is not None:
|
||||
grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm)
|
||||
grads = list(zip(grads, params))
|
||||
trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=rprop_alpha, epsilon=rprop_epsilon)
|
||||
_opt_op = trainer.apply_gradients(grads)
|
||||
|
||||
# so when you call _train, you first do the gradient step, then you apply ema
|
||||
with tf.control_dependencies([_opt_op]):
|
||||
_train = tf.group(ema_apply_op)
|
||||
|
||||
lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)
|
||||
|
||||
# Ops/Summaries to run, and their names for logging
|
||||
run_ops = [_train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev, norm_grads]
|
||||
names_ops = ['loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance',
|
||||
'norm_grads']
|
||||
if trust_region:
|
||||
run_ops = run_ops + [norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, avg_norm_k_dot_g,
|
||||
avg_norm_adj]
|
||||
names_ops = names_ops + ['norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g',
|
||||
'avg_norm_k_dot_g', 'avg_norm_adj']
|
||||
|
||||
def train(obs, actions, rewards, dones, mus, states, masks, steps):
|
||||
cur_lr = lr.value_steps(steps)
|
||||
td_map = {train_model.X: obs, polyak_model.X: obs, A: actions, R: rewards, D: dones, MU: mus, LR: cur_lr}
|
||||
if states != []:
|
||||
td_map[train_model.S] = states
|
||||
td_map[train_model.M] = masks
|
||||
td_map[polyak_model.S] = states
|
||||
td_map[polyak_model.M] = masks
|
||||
return names_ops, sess.run(run_ops, td_map)[1:] # strip off _train
|
||||
|
||||
def save(save_path):
|
||||
ps = sess.run(params)
|
||||
make_path(save_path)
|
||||
joblib.dump(ps, save_path)
|
||||
|
||||
self.train = train
|
||||
self.save = save
|
||||
self.train_model = train_model
|
||||
self.step_model = step_model
|
||||
self.step = step_model.step
|
||||
self.initial_state = step_model.initial_state
|
||||
tf.global_variables_initializer().run(session=sess)
|
||||
|
||||
class Runner(object):
|
||||
def __init__(self, env, model, nsteps, nstack):
|
||||
self.env = env
|
||||
self.nstack = nstack
|
||||
self.model = model
|
||||
nh, nw, nc = env.observation_space.shape
|
||||
self.nc = nc # nc = 1 for atari, but just in case
|
||||
self.nenv = nenv = env.num_envs
|
||||
self.nact = env.action_space.n
|
||||
self.nbatch = nenv * nsteps
|
||||
self.batch_ob_shape = (nenv*(nsteps+1), nh, nw, nc*nstack)
|
||||
self.obs = np.zeros((nenv, nh, nw, nc * nstack), dtype=np.uint8)
|
||||
obs = env.reset()
|
||||
self.update_obs(obs)
|
||||
self.nsteps = nsteps
|
||||
self.states = model.initial_state
|
||||
self.dones = [False for _ in range(nenv)]
|
||||
|
||||
def update_obs(self, obs, dones=None):
|
||||
if dones is not None:
|
||||
self.obs *= (1 - dones.astype(np.uint8))[:, None, None, None]
|
||||
self.obs = np.roll(self.obs, shift=-self.nc, axis=3)
|
||||
self.obs[:, :, :, -self.nc:] = obs[:, :, :, :]
|
||||
|
||||
def run(self):
|
||||
enc_obs = np.split(self.obs, self.nstack, axis=3) # so now list of obs steps
|
||||
mb_obs, mb_actions, mb_mus, mb_dones, mb_rewards = [], [], [], [], []
|
||||
for _ in range(self.nsteps):
|
||||
actions, mus, states = self.model.step(self.obs, state=self.states, mask=self.dones)
|
||||
mb_obs.append(np.copy(self.obs))
|
||||
mb_actions.append(actions)
|
||||
mb_mus.append(mus)
|
||||
mb_dones.append(self.dones)
|
||||
obs, rewards, dones, _ = self.env.step(actions)
|
||||
# state information for stateful models like LSTMs
|
||||
self.states = states
|
||||
self.dones = dones
|
||||
self.update_obs(obs, dones)
|
||||
mb_rewards.append(rewards)
|
||||
enc_obs.append(obs)
|
||||
mb_obs.append(np.copy(self.obs))
|
||||
mb_dones.append(self.dones)
|
||||
|
||||
enc_obs = np.asarray(enc_obs, dtype=np.uint8).swapaxes(1, 0)
|
||||
mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0)
|
||||
mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
|
||||
mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
|
||||
mb_mus = np.asarray(mb_mus, dtype=np.float32).swapaxes(1, 0)
|
||||
|
||||
mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
|
||||
|
||||
mb_masks = mb_dones # Used for stateful models like LSTMs to mask the state when done
|
||||
mb_dones = mb_dones[:, 1:] # Used for calculating returns. The dones array is now aligned with rewards
|
||||
|
||||
# shapes are now [nenv, nsteps, []]
|
||||
# When pulling from buffer, arrays will now be reshaped in place, preventing a deep copy.
|
||||
|
||||
return enc_obs, mb_obs, mb_actions, mb_rewards, mb_mus, mb_dones, mb_masks
|
||||
|
||||
class Acer():
|
||||
def __init__(self, runner, model, buffer, log_interval):
|
||||
self.runner = runner
|
||||
self.model = model
|
||||
self.buffer = buffer
|
||||
self.log_interval = log_interval
|
||||
self.tstart = None
|
||||
self.episode_stats = EpisodeStats(runner.nsteps, runner.nenv)
|
||||
self.steps = None
|
||||
|
||||
def call(self, on_policy):
|
||||
runner, model, buffer, steps = self.runner, self.model, self.buffer, self.steps
|
||||
if on_policy:
|
||||
enc_obs, obs, actions, rewards, mus, dones, masks = runner.run()
|
||||
self.episode_stats.feed(rewards, dones)
|
||||
if buffer is not None:
|
||||
buffer.put(enc_obs, actions, rewards, mus, dones, masks)
|
||||
else:
|
||||
# get obs, actions, rewards, mus, dones from buffer.
|
||||
obs, actions, rewards, mus, dones, masks = buffer.get()
|
||||
|
||||
# reshape stuff correctly
|
||||
obs = obs.reshape(runner.batch_ob_shape)
|
||||
actions = actions.reshape([runner.nbatch])
|
||||
rewards = rewards.reshape([runner.nbatch])
|
||||
mus = mus.reshape([runner.nbatch, runner.nact])
|
||||
dones = dones.reshape([runner.nbatch])
|
||||
masks = masks.reshape([runner.batch_ob_shape[0]])
|
||||
|
||||
names_ops, values_ops = model.train(obs, actions, rewards, dones, mus, model.initial_state, masks, steps)
|
||||
|
||||
if on_policy and (int(steps/runner.nbatch) % self.log_interval == 0):
|
||||
logger.record_tabular("total_timesteps", steps)
|
||||
logger.record_tabular("fps", int(steps/(time.time() - self.tstart)))
|
||||
# IMP: In EpisodicLife env, during training, we get done=True at each loss of life, not just at the terminal state.
|
||||
# Thus, this is mean until end of life, not end of episode.
|
||||
# For true episode rewards, see the monitor files in the log folder.
|
||||
logger.record_tabular("mean_episode_length", self.episode_stats.mean_length())
|
||||
logger.record_tabular("mean_episode_reward", self.episode_stats.mean_reward())
|
||||
for name, val in zip(names_ops, values_ops):
|
||||
logger.record_tabular(name, float(val))
|
||||
logger.dump_tabular()
|
||||
|
||||
|
||||
def learn(policy, env, seed, nsteps=20, nstack=4, total_timesteps=int(80e6), q_coef=0.5, ent_coef=0.01,
|
||||
max_grad_norm=10, lr=7e-4, lrschedule='linear', rprop_epsilon=1e-5, rprop_alpha=0.99, gamma=0.99,
|
||||
log_interval=100, buffer_size=50000, replay_ratio=4, replay_start=10000, c=10.0,
|
||||
trust_region=True, alpha=0.99, delta=1):
|
||||
print("Running Acer Simple")
|
||||
print(locals())
|
||||
tf.reset_default_graph()
|
||||
set_global_seeds(seed)
|
||||
|
||||
nenvs = env.num_envs
|
||||
ob_space = env.observation_space
|
||||
ac_space = env.action_space
|
||||
num_procs = len(env.remotes) # HACK
|
||||
model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, nstack=nstack,
|
||||
num_procs=num_procs, ent_coef=ent_coef, q_coef=q_coef, gamma=gamma,
|
||||
max_grad_norm=max_grad_norm, lr=lr, rprop_alpha=rprop_alpha, rprop_epsilon=rprop_epsilon,
|
||||
total_timesteps=total_timesteps, lrschedule=lrschedule, c=c,
|
||||
trust_region=trust_region, alpha=alpha, delta=delta)
|
||||
|
||||
runner = Runner(env=env, model=model, nsteps=nsteps, nstack=nstack)
|
||||
if replay_ratio > 0:
|
||||
buffer = Buffer(env=env, nsteps=nsteps, nstack=nstack, size=buffer_size)
|
||||
else:
|
||||
buffer = None
|
||||
nbatch = nenvs*nsteps
|
||||
acer = Acer(runner, model, buffer, log_interval)
|
||||
acer.tstart = time.time()
|
||||
for acer.steps in range(0, total_timesteps, nbatch): #nbatch samples, 1 on_policy call and multiple off-policy calls
|
||||
acer.call(on_policy=True)
|
||||
if replay_ratio > 0 and buffer.has_atleast(replay_start):
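# On average `replay_ratio` off-policy (replay) updates are performed per on-policy batch;
# drawing the count from a Poisson distribution follows the procedure in the ACER paper.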
|
||||
n = np.random.poisson(replay_ratio)
|
||||
for _ in range(n):
|
||||
acer.call(on_policy=False) # no simulation steps in this
|
||||
|
||||
env.close()
|
baselines/acer/buffer.py (new file, 103 lines)
@@ -0,0 +1,103 @@
|
||||
import numpy as np
|
||||
|
||||
class Buffer(object):
|
||||
# gets obs, actions, rewards, mu's, (states, masks), dones
|
||||
def __init__(self, env, nsteps, nstack, size=50000):
|
||||
self.nenv = env.num_envs
|
||||
self.nsteps = nsteps
|
||||
self.nh, self.nw, self.nc = env.observation_space.shape
|
||||
self.nstack = nstack
|
||||
self.nbatch = self.nenv * self.nsteps
|
||||
self.size = size // (self.nsteps) # Each loc contains nenv * nsteps frames, thus total buffer is nenv * size frames
|
||||
|
||||
# Memory
|
||||
self.enc_obs = None
|
||||
self.actions = None
|
||||
self.rewards = None
|
||||
self.mus = None
|
||||
self.dones = None
|
||||
self.masks = None
|
||||
|
||||
# Size indexes
|
||||
self.next_idx = 0
|
||||
self.num_in_buffer = 0
|
||||
|
||||
def has_atleast(self, frames):
|
||||
# Frames per env, so total (nenv * frames) Frames needed
|
||||
# Each buffer loc has nenv * nsteps frames
|
||||
return self.num_in_buffer >= (frames // self.nsteps)
|
||||
|
||||
def can_sample(self):
|
||||
return self.num_in_buffer > 0
|
||||
|
||||
# Generate stacked frames
|
||||
def decode(self, enc_obs, dones):
|
||||
# enc_obs has shape [nenvs, nsteps + nstack, nh, nw, nc]
|
||||
# dones has shape [nenvs, nsteps]
|
||||
# returns stacked obs of shape [nenv, (nsteps + 1), nh, nw, nstack*nc]
|
||||
nstack, nenv, nsteps, nh, nw, nc = self.nstack, self.nenv, self.nsteps, self.nh, self.nw, self.nc
|
||||
y = np.empty([nsteps + nstack - 1, nenv, 1, 1, 1], dtype=np.float32)
|
||||
obs = np.zeros([nstack, nsteps + nstack, nenv, nh, nw, nc], dtype=np.uint8)
|
||||
x = np.reshape(enc_obs, [nenv, nsteps + nstack, nh, nw, nc]).swapaxes(1,
|
||||
0) # [nsteps + nstack, nenv, nh, nw, nc]
|
||||
y[3:] = np.reshape(1.0 - dones, [nenv, nsteps, 1, 1, 1]).swapaxes(1, 0) # keep
|
||||
y[:3] = 1.0
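# NOTE: the hard-coded 3 here (and in obs[:, 3:] below) is nstack - 1 for the default
# nstack=4 Atari frame stack used in this commit.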
|
||||
# y = np.reshape(1 - dones, [nenvs, nsteps, 1, 1, 1])
|
||||
for i in range(nstack):
|
||||
obs[-(i + 1), i:] = x
|
||||
# obs[:,i:,:,:,-(i+1),:] = x
|
||||
x = x[:-1] * y
|
||||
y = y[1:]
|
||||
return np.reshape(obs[:, 3:].transpose((2, 1, 3, 4, 0, 5)), [nenv, (nsteps + 1), nh, nw, nstack * nc])
|
||||
|
||||
def put(self, enc_obs, actions, rewards, mus, dones, masks):
|
||||
# enc_obs [nenv, (nsteps + nstack), nh, nw, nc]
|
||||
# actions, rewards, dones [nenv, nsteps]
|
||||
# mus [nenv, nsteps, nact]
|
||||
|
||||
if self.enc_obs is None:
|
||||
self.enc_obs = np.empty([self.size] + list(enc_obs.shape), dtype=np.uint8)
|
||||
self.actions = np.empty([self.size] + list(actions.shape), dtype=np.int32)
|
||||
self.rewards = np.empty([self.size] + list(rewards.shape), dtype=np.float32)
|
||||
self.mus = np.empty([self.size] + list(mus.shape), dtype=np.float32)
|
||||
self.dones = np.empty([self.size] + list(dones.shape), dtype=np.bool)
|
||||
self.masks = np.empty([self.size] + list(masks.shape), dtype=np.bool)
|
||||
|
||||
self.enc_obs[self.next_idx] = enc_obs
|
||||
self.actions[self.next_idx] = actions
|
||||
self.rewards[self.next_idx] = rewards
|
||||
self.mus[self.next_idx] = mus
|
||||
self.dones[self.next_idx] = dones
|
||||
self.masks[self.next_idx] = masks
|
||||
|
||||
self.next_idx = (self.next_idx + 1) % self.size
|
||||
self.num_in_buffer = min(self.size, self.num_in_buffer + 1)
|
||||
|
||||
def take(self, x, idx, envx):
|
||||
nenv = self.nenv
|
||||
out = np.empty([nenv] + list(x.shape[2:]), dtype=x.dtype)
|
||||
for i in range(nenv):
|
||||
out[i] = x[idx[i], envx[i]]
|
||||
return out
|
||||
|
||||
def get(self):
|
||||
# returns
|
||||
# obs [nenv, (nsteps + 1), nh, nw, nstack*nc]
|
||||
# actions, rewards, dones [nenv, nsteps]
|
||||
# mus [nenv, nsteps, nact]
|
||||
nenv = self.nenv
|
||||
assert self.can_sample()
|
||||
|
||||
# Sample exactly one id per env. If you sample across envs, then higher correlation in samples from same env.
|
||||
idx = np.random.randint(0, self.num_in_buffer, nenv)
|
||||
envx = np.arange(nenv)
|
||||
|
||||
take = lambda x: self.take(x, idx, envx) # for i in range(nenv)], axis = 0)
|
||||
dones = take(self.dones)
|
||||
enc_obs = take(self.enc_obs)
|
||||
obs = self.decode(enc_obs, dones)
|
||||
actions = take(self.actions)
|
||||
rewards = take(self.rewards)
|
||||
mus = take(self.mus)
|
||||
masks = take(self.masks)
|
||||
return obs, actions, rewards, mus, dones, masks
|
baselines/acer/policies.py (new file, 86 lines)
@@ -0,0 +1,86 @@
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm, sample, check_shape
|
||||
|
||||
|
||||
class AcerCnnPolicy(object):
|
||||
|
||||
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
|
||||
nbatch = nenv * nsteps
|
||||
nh, nw, nc = ob_space.shape
|
||||
ob_shape = (nbatch, nh, nw, nc * nstack)
|
||||
nact = ac_space.n
|
||||
X = tf.placeholder(tf.uint8, ob_shape) # obs
|
||||
with tf.variable_scope("model", reuse=reuse):
|
||||
h = conv(tf.cast(X, tf.float32) / 255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
|
||||
h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
|
||||
h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
|
||||
h3 = conv_to_fc(h3)
|
||||
h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
|
||||
pi_logits = fc(h4, 'pi', nact, act=lambda x: x, init_scale=0.01)
|
||||
pi = tf.nn.softmax(pi_logits)
|
||||
q = fc(h4, 'q', nact, act=lambda x: x)
|
||||
|
||||
a = sample(pi_logits) # could change this to use self.pi instead
|
||||
self.initial_state = [] # not stateful
|
||||
self.X = X
|
||||
self.pi = pi # actual policy params now
|
||||
self.q = q
|
||||
|
||||
def step(ob, *args, **kwargs):
|
||||
# returns actions, mus, states
|
||||
a0, pi0 = sess.run([a, pi], {X: ob})
|
||||
return a0, pi0, [] # dummy state
|
||||
|
||||
def out(ob, *args, **kwargs):
|
||||
pi0, q0 = sess.run([pi, q], {X: ob})
|
||||
return pi0, q0
|
||||
|
||||
def act(ob, *args, **kwargs):
|
||||
return sess.run(a, {X: ob})
|
||||
|
||||
self.step = step
|
||||
self.out = out
|
||||
self.act = act
|
||||
|
||||
class AcerLstmPolicy(object):
|
||||
|
||||
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256):
|
||||
nbatch = nenv * nsteps
|
||||
nh, nw, nc = ob_space.shape
|
||||
ob_shape = (nbatch, nh, nw, nc * nstack)
|
||||
nact = ac_space.n
|
||||
X = tf.placeholder(tf.uint8, ob_shape) # obs
|
||||
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
|
||||
S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
|
||||
with tf.variable_scope("model", reuse=reuse):
|
||||
h = conv(tf.cast(X, tf.float32) / 255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
|
||||
h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
|
||||
h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
|
||||
h3 = conv_to_fc(h3)
|
||||
h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
|
||||
|
||||
# lstm
|
||||
xs = batch_to_seq(h4, nenv, nsteps)
|
||||
ms = batch_to_seq(M, nenv, nsteps)
|
||||
h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
|
||||
h5 = seq_to_batch(h5)
|
||||
|
||||
pi_logits = fc(h5, 'pi', nact, act=lambda x: x, init_scale=0.01)
|
||||
pi = tf.nn.softmax(pi_logits)
|
||||
q = fc(h5, 'q', nact, act=lambda x: x)
|
||||
|
||||
a = sample(pi_logits) # could change this to use self.pi instead
|
||||
self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
|
||||
self.X = X
|
||||
self.M = M
|
||||
self.S = S
|
||||
self.pi = pi # actual policy params now
|
||||
self.q = q
|
||||
|
||||
def step(ob, state, mask, *args, **kwargs):
|
||||
# returns actions, mus, states
|
||||
a0, pi0, s = sess.run([a, pi, snew], {X: ob, S: state, M: mask})
|
||||
return a0, pi0, s
|
||||
|
||||
self.step = step
|
baselines/acer/run_atari.py (new file, 47 lines)
@@ -0,0 +1,47 @@
|
||||
#!/usr/bin/env python
|
||||
import os, logging, gym
|
||||
from baselines import logger
|
||||
from baselines.common import set_global_seeds
|
||||
from baselines import bench
|
||||
from baselines.acer.acer_simple import learn
|
||||
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
|
||||
from baselines.common.atari_wrappers import make_atari, wrap_deepmind
|
||||
from baselines.acer.policies import AcerCnnPolicy, AcerLstmPolicy
|
||||
|
||||
def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu):
|
||||
def make_env(rank):
|
||||
def _thunk():
|
||||
env = make_atari(env_id)
|
||||
env.seed(seed + rank)
|
||||
env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
|
||||
gym.logger.setLevel(logging.WARN)
|
||||
return wrap_deepmind(env)
|
||||
return _thunk
|
||||
set_global_seeds(seed)
|
||||
env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
|
||||
if policy == 'cnn':
|
||||
policy_fn = AcerCnnPolicy
|
||||
elif policy == 'lstm':
|
||||
policy_fn = AcerLstmPolicy
|
||||
else:
|
||||
print("Policy {} not implemented".format(policy))
|
||||
return
|
||||
learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule)
|
||||
env.close()
|
||||
|
||||
def main():
|
||||
import argparse
|
||||
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
|
||||
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
|
||||
parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm'], default='cnn')
|
||||
parser.add_argument('--lrschedule', help='Learning rate schedule', choices=['constant', 'linear'], default='constant')
|
||||
parser.add_argument('--logdir', help ='Directory for logging', default='./log')
|
||||
parser.add_argument('--num-timesteps', type=int, default=int(10e6))
|
||||
args = parser.parse_args()
|
||||
logger.configure(os.path.abspath(args.logdir))
|
||||
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed,
|
||||
policy=args.policy, lrschedule=args.lrschedule, num_cpu=16)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
@@ -1,3 +1,2 @@
 from baselines.bench.benchmarks import *
 from baselines.bench.monitor import *
-from baselines.bench.simple_bench import simple_bench
@@ -128,5 +128,5 @@ _atari50 = [ # actually 47
 register_benchmark({
     'name': 'Atari50_10M',
     'description': '47 Atari games from Mnih et al. (2013), with pixel observations, 10M timesteps',
-    'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 3, 'num_timesteps': int(10e6)} for _game in _atari50]
+    'tasks': [{'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atari50]
 })
@@ -12,10 +12,6 @@ class NoopResetEnv(gym.Wrapper):
         gym.Wrapper.__init__(self, env)
         self.noop_max = noop_max
         self.override_num_noops = None
-        if isinstance(env.action_space, gym.spaces.MultiBinary):
-            self.noop_action = np.zeros(self.env.action_space.n, dtype=np.int64)
-        else:
-            # used for atari environments
         self.noop_action = 0
         assert env.unwrapped.get_action_meanings()[0] == 'NOOP'
@@ -175,7 +171,7 @@ class LazyFrames(object):
 
         This object should only be converted to numpy array before being passed to the model.
 
-        You'd not belive how complex the previous solution was."""
+        You'd not believe how complex the previous solution was."""
         self._frames = frames
 
     def __array__(self, dtype=None):
@@ -238,10 +238,6 @@ def learn(env,
                 kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                 kwargs['update_param_noise_scale'] = True
             action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
-            if isinstance(env.action_space, gym.spaces.MultiBinary):
-                env_action = np.zeros(env.action_space.n)
-                env_action[action] = 1
-            else:
             env_action = action
             reset = False
             new_obs, rew, done, _ = env.step(env_action)
baselines/ppo2/README.md (new file, 6 lines)
@@ -0,0 +1,6 @@
# PPO2

- Original paper: https://arxiv.org/abs/1707.06347
- Baselines blog post: https://blog.openai.com/openai-baselines-ppo/
- `python -m baselines.ppo2.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options.
- `python -m baselines.ppo2.run_mujoco` runs the algorithm for 1M frames on a Mujoco environment.
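A minimal sketch of calling `ppo2.learn` directly, following `run_mujoco.py` from this commit; the environment id and hyperparameters are the ones used there and are illustrative rather than required:

```python
import gym
import tensorflow as tf
from baselines import bench, logger
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env.vec_normalize import VecNormalize
from baselines.ppo2 import ppo2
from baselines.ppo2.policies import MlpPolicy

def make_env():
    env = gym.make('Hopper-v1')                 # any continuous-control env id
    return bench.Monitor(env, logger.get_dir()) # writes monitor.csv for results_plotter

logger.configure()                              # set up a log directory
tf.Session().__enter__()                        # ppo2.Model picks up the default session
env = VecNormalize(DummyVecEnv([make_env]))     # normalize observations and rewards
ppo2.learn(policy=MlpPolicy, env=env, nsteps=2048, nminibatches=32,
           lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
           ent_coef=0.0, lr=3e-4, cliprange=0.2, total_timesteps=int(1e6))
```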
baselines/ppo2/__init__.py (new file, 0 lines)

baselines/ppo2/policies.py (new file, 167 lines)
@@ -0,0 +1,167 @@
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm
|
||||
from baselines.common.distributions import make_pdtype
|
||||
|
||||
class LnLstmPolicy(object):
|
||||
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
|
||||
nenv = nbatch // nsteps
|
||||
nh, nw, nc = ob_space.shape
|
||||
ob_shape = (nbatch, nh, nw, nc)
|
||||
nact = ac_space.n
|
||||
X = tf.placeholder(tf.uint8, ob_shape) #obs
|
||||
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
|
||||
S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
|
||||
with tf.variable_scope("model", reuse=reuse):
|
||||
h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
|
||||
h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
|
||||
h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
|
||||
h3 = conv_to_fc(h3)
|
||||
h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
|
||||
xs = batch_to_seq(h4, nenv, nsteps)
|
||||
ms = batch_to_seq(M, nenv, nsteps)
|
||||
h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
|
||||
h5 = seq_to_batch(h5)
|
||||
pi = fc(h5, 'pi', nact, act=lambda x:x)
|
||||
vf = fc(h5, 'v', 1, act=lambda x:x)
|
||||
|
||||
self.pdtype = make_pdtype(ac_space)
|
||||
self.pd = self.pdtype.pdfromflat(pi)
|
||||
|
||||
v0 = vf[:, 0]
|
||||
a0 = self.pd.sample()
|
||||
neglogp0 = self.pd.neglogp(a0)
|
||||
self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
|
||||
|
||||
def step(ob, state, mask):
|
||||
return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})
|
||||
|
||||
def value(ob, state, mask):
|
||||
return sess.run(v0, {X:ob, S:state, M:mask})
|
||||
|
||||
self.X = X
|
||||
self.M = M
|
||||
self.S = S
|
||||
self.pi = pi
|
||||
self.vf = vf
|
||||
self.step = step
|
||||
self.value = value
|
||||
|
||||
class LstmPolicy(object):
|
||||
|
||||
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
|
||||
nenv = nbatch // nsteps
|
||||
|
||||
nh, nw, nc = ob_space.shape
|
||||
ob_shape = (nbatch, nh, nw, nc)
|
||||
nact = ac_space.n
|
||||
X = tf.placeholder(tf.uint8, ob_shape) #obs
|
||||
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
|
||||
S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
|
||||
with tf.variable_scope("model", reuse=reuse):
|
||||
h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
|
||||
h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
|
||||
h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
|
||||
h3 = conv_to_fc(h3)
|
||||
h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
|
||||
xs = batch_to_seq(h4, nenv, nsteps)
|
||||
ms = batch_to_seq(M, nenv, nsteps)
|
||||
h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
|
||||
h5 = seq_to_batch(h5)
|
||||
pi = fc(h5, 'pi', nact, act=lambda x:x)
|
||||
vf = fc(h5, 'v', 1, act=lambda x:x)
|
||||
|
||||
self.pdtype = make_pdtype(ac_space)
|
||||
self.pd = self.pdtype.pdfromflat(pi)
|
||||
|
||||
v0 = vf[:, 0]
|
||||
a0 = self.pd.sample()
|
||||
neglogp0 = self.pd.neglogp(a0)
|
||||
self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
|
||||
|
||||
def step(ob, state, mask):
|
||||
return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})
|
||||
|
||||
def value(ob, state, mask):
|
||||
return sess.run(v0, {X:ob, S:state, M:mask})
|
||||
|
||||
self.X = X
|
||||
self.M = M
|
||||
self.S = S
|
||||
self.pi = pi
|
||||
self.vf = vf
|
||||
self.step = step
|
||||
self.value = value
|
||||
|
||||
class CnnPolicy(object):
|
||||
|
||||
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
|
||||
nh, nw, nc = ob_space.shape
|
||||
ob_shape = (nbatch, nh, nw, nc)
|
||||
nact = ac_space.n
|
||||
X = tf.placeholder(tf.uint8, ob_shape) #obs
|
||||
with tf.variable_scope("model", reuse=reuse):
|
||||
h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
|
||||
h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
|
||||
h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
|
||||
h3 = conv_to_fc(h3)
|
||||
h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
|
||||
pi = fc(h4, 'pi', nact, act=lambda x:x, init_scale=0.01)
|
||||
vf = fc(h4, 'v', 1, act=lambda x:x)[:,0]
|
||||
|
||||
self.pdtype = make_pdtype(ac_space)
|
||||
self.pd = self.pdtype.pdfromflat(pi)
|
||||
|
||||
a0 = self.pd.sample()
|
||||
neglogp0 = self.pd.neglogp(a0)
|
||||
self.initial_state = None
|
||||
|
||||
def step(ob, *_args, **_kwargs):
|
||||
a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
|
||||
return a, v, self.initial_state, neglogp
|
||||
|
||||
def value(ob, *_args, **_kwargs):
|
||||
return sess.run(vf, {X:ob})
|
||||
|
||||
self.X = X
|
||||
self.pi = pi
|
||||
self.vf = vf
|
||||
self.step = step
|
||||
self.value = value
|
||||
|
||||
class MlpPolicy(object):
|
||||
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
|
||||
ob_shape = (nbatch,) + ob_space.shape
|
||||
actdim = ac_space.shape[0]
|
||||
X = tf.placeholder(tf.float32, ob_shape, name='Ob') #obs
|
||||
with tf.variable_scope("model", reuse=reuse):
|
||||
h1 = fc(X, 'pi_fc1', nh=64, init_scale=np.sqrt(2), act=tf.tanh)
|
||||
h2 = fc(h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2), act=tf.tanh)
|
||||
pi = fc(h2, 'pi', actdim, act=lambda x:x, init_scale=0.01)
|
||||
h1 = fc(X, 'vf_fc1', nh=64, init_scale=np.sqrt(2), act=tf.tanh)
|
||||
h2 = fc(h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2), act=tf.tanh)
|
||||
vf = fc(h2, 'vf', 1, act=lambda x:x)[:,0]
|
||||
logstd = tf.get_variable(name="logstd", shape=[1, actdim],
|
||||
initializer=tf.zeros_initializer())
|
||||
|
||||
pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)
|
||||
|
||||
self.pdtype = make_pdtype(ac_space)
|
||||
self.pd = self.pdtype.pdfromflat(pdparam)
|
||||
|
||||
a0 = self.pd.sample()
|
||||
neglogp0 = self.pd.neglogp(a0)
|
||||
self.initial_state = None
|
||||
|
||||
def step(ob, *_args, **_kwargs):
|
||||
a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
|
||||
return a, v, self.initial_state, neglogp
|
||||
|
||||
def value(ob, *_args, **_kwargs):
|
||||
return sess.run(vf, {X:ob})
|
||||
|
||||
self.X = X
|
||||
self.pi = pi
|
||||
self.vf = vf
|
||||
self.step = step
|
||||
self.value = value
|
baselines/ppo2/ppo2.py (new file, 244 lines)
@@ -0,0 +1,244 @@
|
||||
import os
|
||||
import time
|
||||
import joblib
|
||||
import numpy as np
|
||||
import os.path as osp
|
||||
import tensorflow as tf
|
||||
from baselines import logger
|
||||
from collections import deque
|
||||
from baselines.common import explained_variance
|
||||
|
||||
class Model(object):
|
||||
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
|
||||
nsteps, ent_coef, vf_coef, max_grad_norm):
|
||||
sess = tf.get_default_session()
|
||||
|
||||
act_model = policy(sess, ob_space, ac_space, nbatch_act, 1, reuse=False)
|
||||
train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps, reuse=True)
|
||||
|
||||
A = train_model.pdtype.sample_placeholder([None])
|
||||
ADV = tf.placeholder(tf.float32, [None])
|
||||
R = tf.placeholder(tf.float32, [None])
|
||||
OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
|
||||
OLDVPRED = tf.placeholder(tf.float32, [None])
|
||||
LR = tf.placeholder(tf.float32, [])
|
||||
CLIPRANGE = tf.placeholder(tf.float32, [])
|
||||
|
||||
neglogpac = train_model.pd.neglogp(A)
|
||||
entropy = tf.reduce_mean(train_model.pd.entropy())
|
||||
|
||||
vpred = train_model.vf
|
||||
vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE)
|
||||
vf_losses1 = tf.square(vpred - R)
|
||||
vf_losses2 = tf.square(vpredclipped - R)
|
||||
vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
|
||||
ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
|
||||
pg_losses = -ADV * ratio
|
||||
pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)
|
||||
pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
|
||||
approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
|
||||
clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
|
||||
loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
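# Restating the objective above: with probability ratio r_t = pi_new(a_t|s_t) / pi_old(a_t|s_t)
# (computed as exp(OLDNEGLOGPAC - neglogpac)), the policy term is the PPO clipped surrogate
#   L_pg = E_t[ max(-A_t * r_t, -A_t * clip(r_t, 1 - eps, 1 + eps)) ]
# and the value loss is clipped analogously around the old value prediction OLDVPRED.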
|
||||
with tf.variable_scope('model'):
|
||||
params = tf.trainable_variables()
|
||||
grads = tf.gradients(loss, params)
|
||||
if max_grad_norm is not None:
|
||||
grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
|
||||
grads = list(zip(grads, params))
|
||||
trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
|
||||
_train = trainer.apply_gradients(grads)
|
||||
|
||||
def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
|
||||
advs = returns - values
|
||||
advs = (advs - advs.mean()) / (advs.std() + 1e-8)
|
||||
td_map = {train_model.X:obs, A:actions, ADV:advs, R:returns, LR:lr,
|
||||
CLIPRANGE:cliprange, OLDNEGLOGPAC:neglogpacs, OLDVPRED:values}
|
||||
if states is not None:
|
||||
td_map[train_model.S] = states
|
||||
td_map[train_model.M] = masks
|
||||
return sess.run(
|
||||
[pg_loss, vf_loss, entropy, approxkl, clipfrac, _train],
|
||||
td_map
|
||||
)[:-1]
|
||||
self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
|
||||
|
||||
def save(save_path):
|
||||
ps = sess.run(params)
|
||||
joblib.dump(ps, save_path)
|
||||
|
||||
def load(load_path):
|
||||
loaded_params = joblib.load(load_path)
|
||||
restores = []
|
||||
for p, loaded_p in zip(params, loaded_params):
|
||||
restores.append(p.assign(loaded_p))
|
||||
sess.run(restores)
|
||||
|
||||
self.train = train
|
||||
self.train_model = train_model
|
||||
self.act_model = act_model
|
||||
self.step = act_model.step
|
||||
self.value = act_model.value
|
||||
self.initial_state = act_model.initial_state
|
||||
self.save = save
|
||||
self.load = load
|
||||
tf.global_variables_initializer().run(session=sess) #pylint: disable=E1101
|
||||
|
||||
class Runner(object):
|
||||
|
||||
def __init__(self, *, env, model, nsteps, gamma, lam):
|
||||
self.env = env
|
||||
self.model = model
|
||||
nenv = env.num_envs
|
||||
self.obs = np.zeros((nenv,) + env.observation_space.shape, dtype=model.train_model.X.dtype.name)
|
||||
self.obs[:] = env.reset()
|
||||
self.gamma = gamma
|
||||
self.lam = lam
|
||||
self.nsteps = nsteps
|
||||
self.states = model.initial_state
|
||||
self.dones = [False for _ in range(nenv)]
|
||||
|
||||
def run(self):
|
||||
mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [],[],[],[],[],[]
|
||||
mb_states = self.states
|
||||
epinfos = []
|
||||
for _ in range(self.nsteps):
|
||||
actions, values, self.states, neglogpacs = self.model.step(self.obs, self.states, self.dones)
|
||||
mb_obs.append(self.obs.copy())
|
||||
mb_actions.append(actions)
|
||||
mb_values.append(values)
|
||||
mb_neglogpacs.append(neglogpacs)
|
||||
mb_dones.append(self.dones)
|
||||
self.obs[:], rewards, self.dones, infos = self.env.step(actions)
|
||||
for info in infos:
|
||||
maybeepinfo = info.get('episode')
|
||||
if maybeepinfo: epinfos.append(maybeepinfo)
|
||||
mb_rewards.append(rewards)
|
||||
#batch of steps to batch of rollouts
|
||||
mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
|
||||
mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
|
||||
mb_actions = np.asarray(mb_actions)
|
||||
mb_values = np.asarray(mb_values, dtype=np.float32)
|
||||
mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
|
||||
mb_dones = np.asarray(mb_dones, dtype=np.bool)
|
||||
last_values = self.model.value(self.obs, self.states, self.dones)
|
||||
#discount/bootstrap off value fn
|
||||
mb_returns = np.zeros_like(mb_rewards)
|
||||
mb_advs = np.zeros_like(mb_rewards)
|
||||
lastgaelam = 0
|
||||
for t in reversed(range(self.nsteps)):
|
||||
if t == self.nsteps - 1:
|
||||
nextnonterminal = 1.0 - self.dones
|
||||
nextvalues = last_values
|
||||
else:
|
||||
nextnonterminal = 1.0 - mb_dones[t+1]
|
||||
nextvalues = mb_values[t+1]
|
||||
delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t]
|
||||
mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam
|
||||
mb_returns = mb_advs + mb_values
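# The loop above is Generalized Advantage Estimation (GAE):
#   delta_t = r_t + gamma * V(s_{t+1}) * nonterminal - V(s_t)
#   A_t     = delta_t + gamma * lam * nonterminal * A_{t+1}
# and the value-function targets are reconstructed as R_t = A_t + V(s_t).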
|
||||
return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs)),
|
||||
mb_states, epinfos)
|
||||
# obs, returns, masks, actions, values, neglogpacs, states = runner.run()
|
||||
def sf01(arr):
|
||||
"""
|
||||
swap and then flatten axes 0 and 1
|
||||
"""
|
||||
s = arr.shape
|
||||
return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])
|
||||
|
||||
def constfn(val):
|
||||
def f(_):
|
||||
return val
|
||||
return f
|
||||
|
||||
def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr,
|
||||
vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95,
|
||||
log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
|
||||
save_interval=0):
|
||||
|
||||
if isinstance(lr, float): lr = constfn(lr)
|
||||
else: assert callable(lr)
|
||||
if isinstance(cliprange, float): cliprange = constfn(cliprange)
|
||||
else: assert callable(cliprange)
|
||||
total_timesteps = int(total_timesteps)
|
||||
|
||||
nenvs = env.num_envs
|
||||
ob_space = env.observation_space
|
||||
ac_space = env.action_space
|
||||
nbatch = nenvs * nsteps
|
||||
nbatch_train = nbatch // nminibatches
|
||||
|
||||
make_model = lambda : Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
|
||||
nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
|
||||
max_grad_norm=max_grad_norm)
|
||||
if save_interval and logger.get_dir():
|
||||
import cloudpickle
|
||||
with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
|
||||
fh.write(cloudpickle.dumps(make_model))
|
||||
model = make_model()
|
||||
runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
|
||||
|
||||
epinfobuf = deque(maxlen=100)
|
||||
tfirststart = time.time()
|
||||
|
||||
nupdates = total_timesteps//nbatch
|
||||
for update in range(1, nupdates+1):
|
||||
assert nbatch % nminibatches == 0
|
||||
nbatch_train = nbatch // nminibatches
|
||||
tstart = time.time()
|
||||
frac = 1.0 - (update - 1.0) / nupdates
|
||||
lrnow = lr(frac)
|
||||
cliprangenow = cliprange(frac)
|
||||
obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run() #pylint: disable=E0632
|
||||
epinfobuf.extend(epinfos)
|
||||
mblossvals = []
|
||||
if states is None: # nonrecurrent version
|
||||
inds = np.arange(nbatch)
|
||||
for _ in range(noptepochs):
|
||||
np.random.shuffle(inds)
|
||||
for start in range(0, nbatch, nbatch_train):
|
||||
end = start + nbatch_train
|
||||
mbinds = inds[start:end]
|
||||
slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
|
||||
mblossvals.append(model.train(lrnow, cliprangenow, *slices))
|
||||
else: # recurrent version
|
||||
assert nenvs % nminibatches == 0
|
||||
envsperbatch = nenvs // nminibatches
|
||||
envinds = np.arange(nenvs)
|
||||
flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
|
||||
envsperbatch = nbatch_train // nsteps
|
||||
for _ in range(noptepochs):
|
||||
np.random.shuffle(envinds)
|
||||
for start in range(0, nenvs, envsperbatch):
|
||||
end = start + envsperbatch
|
||||
mbenvinds = envinds[start:end]
|
||||
mbflatinds = flatinds[mbenvinds].ravel()
|
||||
slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
|
||||
mbstates = states[mbenvinds]
|
||||
mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))
|
||||
|
||||
lossvals = np.mean(mblossvals, axis=0)
|
||||
tnow = time.time()
|
||||
fps = int(nbatch / (tnow - tstart))
|
||||
if update % log_interval == 0 or update == 1:
|
||||
ev = explained_variance(values, returns)
|
||||
logger.logkv("serial_timesteps", update*nsteps)
|
||||
logger.logkv("nupdates", update)
|
||||
logger.logkv("total_timesteps", update*nbatch)
|
||||
logger.logkv("fps", fps)
|
||||
logger.logkv("explained_variance", float(ev))
|
||||
logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
|
||||
logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
|
||||
logger.logkv('time_elapsed', tnow - tfirststart)
|
||||
for (lossval, lossname) in zip(lossvals, model.loss_names):
|
||||
logger.logkv(lossname, lossval)
|
||||
logger.dumpkvs()
|
||||
if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
|
||||
checkdir = osp.join(logger.get_dir(), 'checkpoints')
|
||||
os.makedirs(checkdir, exist_ok=True)
|
||||
savepath = osp.join(checkdir, '%.5i'%update)
|
||||
print('Saving to', savepath)
|
||||
model.save(savepath)
|
||||
env.close()
|
||||
|
||||
def safemean(xs):
|
||||
return np.nan if len(xs) == 0 else np.mean(xs)
|
baselines/ppo2/run_atari.py (new file, 58 lines)
@@ -0,0 +1,58 @@
|
||||
#!/usr/bin/env python
|
||||
import sys
|
||||
import argparse
|
||||
from baselines import bench, logger
|
||||
|
||||
def train(env_id, num_timesteps, seed, policy):
|
||||
from baselines.common import set_global_seeds
|
||||
from baselines.common.atari_wrappers import make_atari, wrap_deepmind
|
||||
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
|
||||
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
|
||||
from baselines.ppo2 import ppo2
|
||||
from baselines.ppo2.policies import CnnPolicy, LstmPolicy, LnLstmPolicy
|
||||
import gym
|
||||
import logging
|
||||
import multiprocessing
|
||||
import os.path as osp
|
||||
import tensorflow as tf
|
||||
ncpu = multiprocessing.cpu_count()
|
||||
if sys.platform == 'darwin': ncpu //= 2
|
||||
config = tf.ConfigProto(allow_soft_placement=True,
|
||||
intra_op_parallelism_threads=ncpu,
|
||||
inter_op_parallelism_threads=ncpu)
|
||||
config.gpu_options.allow_growth = True #pylint: disable=E1101
|
||||
gym.logger.setLevel(logging.WARN)
|
||||
tf.Session(config=config).__enter__()
|
||||
|
||||
def make_env(rank):
|
||||
def env_fn():
|
||||
env = make_atari(env_id)
|
||||
env.seed(seed + rank)
|
||||
env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
|
||||
return wrap_deepmind(env)
|
||||
return env_fn
|
||||
nenvs = 8
|
||||
env = SubprocVecEnv([make_env(i) for i in range(nenvs)])
|
||||
set_global_seeds(seed)
|
||||
env = VecFrameStack(env, 4)
|
||||
policy = {'cnn' : CnnPolicy, 'lstm' : LstmPolicy, 'lnlstm' : LnLstmPolicy}[policy]
|
||||
ppo2.learn(policy=policy, env=env, nsteps=128, nminibatches=4,
|
||||
lam=0.95, gamma=0.99, noptepochs=4, log_interval=1,
|
||||
ent_coef=.01,
|
||||
lr=lambda f : f * 2.5e-4,
|
||||
cliprange=lambda f : f * 0.1,
|
||||
total_timesteps=int(num_timesteps * 1.1))
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
|
||||
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
|
||||
parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm'], default='cnn')
|
||||
parser.add_argument('--num-timesteps', type=int, default=int(10e6))
|
||||
args = parser.parse_args()
|
||||
logger.configure()
|
||||
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed,
|
||||
policy=args.policy)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
baselines/ppo2/run_mujoco.py (new file, 47 lines)
@@ -0,0 +1,47 @@
|
||||
#!/usr/bin/env python
|
||||
import argparse
|
||||
from baselines import bench, logger
|
||||
|
||||
def train(env_id, num_timesteps, seed):
|
||||
from baselines.common import set_global_seeds
|
||||
from baselines.common.vec_env.vec_normalize import VecNormalize
|
||||
from baselines.ppo2 import ppo2
|
||||
from baselines.ppo2.policies import MlpPolicy
|
||||
import gym
|
||||
import tensorflow as tf
|
||||
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
|
||||
ncpu = 1
|
||||
config = tf.ConfigProto(allow_soft_placement=True,
|
||||
intra_op_parallelism_threads=ncpu,
|
||||
inter_op_parallelism_threads=ncpu)
|
||||
tf.Session(config=config).__enter__()
|
||||
def make_env():
|
||||
env = gym.make(env_id)
|
||||
env = bench.Monitor(env, logger.get_dir())
|
||||
return env
|
||||
env = DummyVecEnv([make_env])
|
||||
env = VecNormalize(env)
|
||||
|
||||
set_global_seeds(seed)
|
||||
policy = MlpPolicy
|
||||
ppo2.learn(policy=policy, env=env, nsteps=2048, nminibatches=32,
|
||||
lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
|
||||
ent_coef=0.0,
|
||||
lr=3e-4,
|
||||
cliprange=0.2,
|
||||
total_timesteps=num_timesteps)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
parser.add_argument('--env', help='environment ID', default='Hopper-v1')
|
||||
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
|
||||
parser.add_argument('--num-timesteps', type=int, default=int(1e6))
|
||||
args = parser.parse_args()
|
||||
logger.configure()
|
||||
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
baselines/results_plotter.py (new file, 87 lines)
@@ -0,0 +1,87 @@
|
||||
import numpy as np
|
||||
import matplotlib
|
||||
matplotlib.use('TkAgg') # Can change to 'Agg' for non-interactive mode
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
plt.rcParams['svg.fonttype'] = 'none'
|
||||
|
||||
from baselines.bench.monitor import load_results
|
||||
|
||||
X_TIMESTEPS = 'timesteps'
|
||||
X_EPISODES = 'episodes'
|
||||
X_WALLTIME = 'walltime_hrs'
|
||||
POSSIBLE_X_AXES = [X_TIMESTEPS, X_EPISODES, X_WALLTIME]
|
||||
EPISODES_WINDOW = 100
|
||||
COLORS = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink',
|
||||
'brown', 'orange', 'teal', 'coral', 'lightblue', 'lime', 'lavender', 'turquoise',
|
||||
'darkgreen', 'tan', 'salmon', 'gold', 'lightpurple', 'darkred', 'darkblue']
|
||||
|
||||
def rolling_window(a, window):
|
||||
shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
|
||||
strides = a.strides + (a.strides[-1],)
|
||||
return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
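# rolling_window returns a zero-copy strided view of shape (..., len(a) - window + 1, window);
# window_func below uses it to apply `func` over every length-`window` slice of y.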
|
||||
|
||||
def window_func(x, y, window, func):
|
||||
yw = rolling_window(y, window)
|
||||
yw_func = func(yw, axis=-1)
|
||||
return x[window-1:], yw_func
|
||||
|
||||
def ts2xy(ts, xaxis):
|
||||
if xaxis == X_TIMESTEPS:
|
||||
x = np.cumsum(ts.l.values)
|
||||
y = ts.r.values
|
||||
elif xaxis == X_EPISODES:
|
||||
x = np.arange(len(ts))
|
||||
y = ts.r.values
|
||||
elif xaxis == X_WALLTIME:
|
||||
x = ts.t.values / 3600.
|
||||
y = ts.r.values
|
||||
else:
|
||||
raise NotImplementedError
|
||||
return x, y
|
||||
|
||||
def plot_curves(xy_list, xaxis, title):
|
||||
plt.figure(figsize=(8,2))
|
||||
maxx = max(xy[0][-1] for xy in xy_list)
|
||||
minx = 0
|
||||
for (i, (x, y)) in enumerate(xy_list):
|
||||
color = COLORS[i]
|
||||
plt.scatter(x, y, s=2)
|
||||
x, y_mean = window_func(x, y, EPISODES_WINDOW, np.mean) # So returns average of last EPISODES_WINDOW episodes
|
||||
plt.plot(x, y_mean, color=color)
|
||||
plt.xlim(minx, maxx)
|
||||
plt.title(title)
|
||||
plt.xlabel(xaxis)
|
||||
plt.ylabel("Episode Rewards")
|
||||
plt.tight_layout()
|
||||
|
||||
def plot_results(dirs, num_timesteps, xaxis, task_name):
|
||||
tslist = []
|
||||
for dir in dirs:
|
||||
ts = load_results(dir)
|
||||
ts = ts[ts.l.cumsum() <= num_timesteps]
|
||||
tslist.append(ts)
|
||||
xy_list = [ts2xy(ts, xaxis) for ts in tslist]
|
||||
plot_curves(xy_list, xaxis, task_name)
|
||||
|
||||
# Example usage in jupyter-notebook
|
||||
# from baselines import results_plotter
|
||||
# %matplotlib inline
|
||||
# results_plotter.plot_results(["./log"], 10e6, results_plotter.X_TIMESTEPS, "Breakout")
|
||||
# Here ./log is a directory containing the monitor.csv files
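# The same plot can be produced from the command line via the entry point below, e.g.:
#   python -m baselines.results_plotter --dirs ./log --num_timesteps 10000000 --xaxis timesteps --task_name Breakout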
|
||||
|
||||
def main():
|
||||
import argparse
|
||||
import os
|
||||
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
parser.add_argument('--dirs', help='List of log directories', nargs = '*', default=['./log'])
|
||||
parser.add_argument('--num_timesteps', type=int, default=int(10e6))
|
||||
parser.add_argument('--xaxis', help = 'Variable on X-axis', default = X_TIMESTEPS)
|
||||
parser.add_argument('--task_name', help = 'Title of plot', default = 'Breakout')
|
||||
args = parser.parse_args()
|
||||
args.dirs = [os.path.abspath(dir) for dir in args.dirs]
|
||||
plot_results(args.dirs, args.num_timesteps, args.xaxis, args.task_name)
|
||||
plt.show()
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|