microbatches in ppo2, custom frame size in WarpFrame, matching fc layer only when needed (#707)

* joshim5 changes (width and height to WarpFrame wrapper)

* match network output with action distribution via a linear layer only if necessary (#167)

* support color vs. grayscale option in WarpFrame wrapper (#166)

* support color vs. grayscale option in WarpFrame wrapper

* Support color in other wrappers

* Updated per Peters suggestions

* fixing test failures

* ppo2 with microbatches (#168)

* pass microbatch_size to the model during construction

* microbatch fixes and test (#169)

* microbatch fixes and test

* tiny cleanup

* added assertions to the test

* vpg-related fix

* Peterz joshim5 subclass ppo2 model (#170)

* microbatch fixes and test

* tiny cleanup

* added assertions to the test

* vpg-related fix

* subclassing the model to make microbatched version of model WIP

* made microbatched model a subclass of ppo2 Model

* flake8 complaint

* mpi-less ppo2 (resolving merge conflict)

* flake8 and mpi4py imports in ppo2/model.py

* more un-mpying
This commit is contained in:
pzhokhov
2018-11-09 11:18:05 -08:00
committed by GitHub
parent d80acbb4d1
commit 52255beda5
6 changed files with 353 additions and 217 deletions

View File

@@ -103,9 +103,9 @@ def test_coexistence(learn_fn, network_fn):
kwargs.update(learn_kwargs[learn_fn])
learn = partial(learn, env=env, network=network_fn, total_timesteps=0, **kwargs)
make_session(make_default=True, graph=tf.Graph());
make_session(make_default=True, graph=tf.Graph())
model1 = learn(seed=1)
make_session(make_default=True, graph=tf.Graph());
make_session(make_default=True, graph=tf.Graph())
model2 = learn(seed=2)
model1.step(env.observation_space.sample())

View File

@@ -0,0 +1,76 @@
import tensorflow as tf
import numpy as np
from baselines.ppo2.model import Model
class MicrobatchedModel(Model):
"""
Model that does training one microbatch at a time - when gradient computation
on the entire minibatch causes some overflow
"""
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
nsteps, ent_coef, vf_coef, max_grad_norm, microbatch_size):
self.nmicrobatches = nbatch_train // microbatch_size
self.microbatch_size = microbatch_size
assert nbatch_train % microbatch_size == 0, 'microbatch_size ({}) should divide nbatch_train ({}) evenly'.format(microbatch_size, nbatch_train)
super().__init__(
policy=policy,
ob_space=ob_space,
ac_space=ac_space,
nbatch_act=nbatch_act,
nbatch_train=microbatch_size,
nsteps=nsteps,
ent_coef=ent_coef,
vf_coef=vf_coef,
max_grad_norm=max_grad_norm)
self.grads_ph = [tf.placeholder(dtype=g.dtype, shape=g.shape) for g in self.grads]
grads_ph_and_vars = list(zip(self.grads_ph, self.var))
self._apply_gradients_op = self.trainer.apply_gradients(grads_ph_and_vars)
def train(self, lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
assert states is None, "microbatches with recurrent models are not supported yet"
# Here we calculate advantage A(s,a) = R + yV(s') - V(s)
# Returns = R + yV(s')
advs = returns - values
# Normalize the advantages
advs = (advs - advs.mean()) / (advs.std() + 1e-8)
# Initialize empty list for per-microbatch stats like pg_loss, vf_loss, entropy, approxkl (whatever is in self.stats_list)
stats_vs = []
for microbatch_idx in range(self.nmicrobatches):
_sli = range(microbatch_idx * self.microbatch_size, (microbatch_idx+1) * self.microbatch_size)
td_map = {
self.train_model.X: obs[_sli],
self.A:actions[_sli],
self.ADV:advs[_sli],
self.R:returns[_sli],
self.CLIPRANGE:cliprange,
self.OLDNEGLOGPAC:neglogpacs[_sli],
self.OLDVPRED:values[_sli]
}
# Compute gradient on a microbatch (note that variables do not change here) ...
grad_v, stats_v = self.sess.run([self.grads, self.stats_list], td_map)
if microbatch_idx == 0:
sum_grad_v = grad_v
else:
# .. and add to the total of the gradients
for i, g in enumerate(grad_v):
sum_grad_v[i] += g
stats_vs.append(stats_v)
feed_dict = {ph: sum_g / self.nmicrobatches for ph, sum_g in zip(self.grads_ph, sum_grad_v)}
feed_dict[self.LR] = lr
# Update variables using average of the gradients
self.sess.run(self._apply_gradients_op, feed_dict)
# Return average of the stats
return np.mean(np.array(stats_vs), axis=0).tolist()

157
baselines/ppo2/model.py Normal file
View File

@@ -0,0 +1,157 @@
import tensorflow as tf
import functools
from baselines.common.tf_util import get_session, save_variables, load_variables
from baselines.common.tf_util import initialize
try:
from baselines.common.mpi_adam_optimizer import MpiAdamOptimizer
from mpi4py import MPI
from baselines.common.mpi_util import sync_from_root
except ImportError:
MPI = None
class Model(object):
"""
We use this object to :
__init__:
- Creates the step_model
- Creates the train_model
train():
- Make the training part (feedforward and retropropagation of gradients)
save/load():
- Save load the model
"""
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
nsteps, ent_coef, vf_coef, max_grad_norm, microbatch_size=None):
self.sess = sess = get_session()
with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
# CREATE OUR TWO MODELS
# act_model that is used for sampling
act_model = policy(nbatch_act, 1, sess)
# Train model for training
if microbatch_size is None:
train_model = policy(nbatch_train, nsteps, sess)
else:
train_model = policy(microbatch_size, nsteps, sess)
# CREATE THE PLACEHOLDERS
self.A = A = train_model.pdtype.sample_placeholder([None])
self.ADV = ADV = tf.placeholder(tf.float32, [None])
self.R = R = tf.placeholder(tf.float32, [None])
# Keep track of old actor
self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
# Keep track of old critic
self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
self.LR = LR = tf.placeholder(tf.float32, [])
# Cliprange
self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])
neglogpac = train_model.pd.neglogp(A)
# Calculate the entropy
# Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
entropy = tf.reduce_mean(train_model.pd.entropy())
# CALCULATE THE LOSS
# Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss
# Clip the value to reduce variability during Critic training
# Get the predicted value
vpred = train_model.vf
vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE)
# Unclipped value
vf_losses1 = tf.square(vpred - R)
# Clipped value
vf_losses2 = tf.square(vpredclipped - R)
vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
# Calculate ratio (pi current policy / pi old policy)
ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
# Defining Loss = - J is equivalent to max J
pg_losses = -ADV * ratio
pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)
# Final PG loss
pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
# Total loss
loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
# UPDATE THE PARAMETERS USING LOSS
# 1. Get the model parameters
params = tf.trainable_variables('ppo2_model')
# 2. Build our trainer
if MPI is not None:
self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5)
else:
self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
# 3. Calculate the gradients
grads_and_var = self.trainer.compute_gradients(loss, params)
grads, var = zip(*grads_and_var)
if max_grad_norm is not None:
# Clip the gradients (normalize)
grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
grads_and_var = list(zip(grads, var))
# zip aggregate each gradient with parameters associated
# For instance zip(ABCD, xyza) => Ax, By, Cz, Da
self.grads = grads
self.var = var
self._train_op = self.trainer.apply_gradients(grads_and_var)
self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]
self.train_model = train_model
self.act_model = act_model
self.step = act_model.step
self.value = act_model.value
self.initial_state = act_model.initial_state
self.save = functools.partial(save_variables, sess=sess)
self.load = functools.partial(load_variables, sess=sess)
if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
initialize()
else:
global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")
sync_from_root(sess, global_variables) #pylint: disable=E1101
def train(self, lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
# Here we calculate advantage A(s,a) = R + yV(s') - V(s)
# Returns = R + yV(s')
advs = returns - values
# Normalize the advantages
advs = (advs - advs.mean()) / (advs.std() + 1e-8)
td_map = {
self.train_model.X : obs,
self.A : actions,
self.ADV : advs,
self.R : returns,
self.LR : lr,
self.CLIPRANGE : cliprange,
self.OLDNEGLOGPAC : neglogpacs,
self.OLDVPRED : values
}
if states is not None:
td_map[self.train_model.S] = states
td_map[self.train_model.M] = masks
return self.sess.run(
self.stats_list + [self._train_op],
td_map
)[:-1]

View File

@@ -1,226 +1,17 @@
import os
import time
import functools
import numpy as np
import os.path as osp
import tensorflow as tf
from baselines import logger
from collections import deque
from baselines.common import explained_variance, set_global_seeds
from baselines.common.policies import build_policy
from baselines.common.runners import AbstractEnvRunner
from baselines.common.tf_util import get_session, save_variables, load_variables
try:
from baselines.common.mpi_adam_optimizer import MpiAdamOptimizer
from mpi4py import MPI
from baselines.common.mpi_util import sync_from_root
except ImportError:
MPI = None
from baselines.ppo2.runner import Runner
from baselines.common.tf_util import initialize
class Model(object):
"""
We use this object to :
__init__:
- Creates the step_model
- Creates the train_model
train():
- Make the training part (feedforward and retropropagation of gradients)
save/load():
- Save load the model
"""
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
nsteps, ent_coef, vf_coef, max_grad_norm):
sess = get_session()
with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
# CREATE OUR TWO MODELS
# act_model that is used for sampling
act_model = policy(nbatch_act, 1, sess)
# Train model for training
train_model = policy(nbatch_train, nsteps, sess)
# CREATE THE PLACEHOLDERS
A = train_model.pdtype.sample_placeholder([None])
ADV = tf.placeholder(tf.float32, [None])
R = tf.placeholder(tf.float32, [None])
# Keep track of old actor
OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
# Keep track of old critic
OLDVPRED = tf.placeholder(tf.float32, [None])
LR = tf.placeholder(tf.float32, [])
# Cliprange
CLIPRANGE = tf.placeholder(tf.float32, [])
neglogpac = train_model.pd.neglogp(A)
# Calculate the entropy
# Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
entropy = tf.reduce_mean(train_model.pd.entropy())
# CALCULATE THE LOSS
# Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss
# Clip the value to reduce variability during Critic training
# Get the predicted value
vpred = train_model.vf
vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE)
# Unclipped value
vf_losses1 = tf.square(vpred - R)
# Clipped value
vf_losses2 = tf.square(vpredclipped - R)
vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
# Calculate ratio (pi current policy / pi old policy)
ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
# Defining Loss = - J is equivalent to max J
pg_losses = -ADV * ratio
pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)
# Final PG loss
pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
# Total loss
loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
# UPDATE THE PARAMETERS USING LOSS
# 1. Get the model parameters
params = tf.trainable_variables('ppo2_model')
# 2. Build our trainer
if MPI is not None:
trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5)
else:
trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
# 3. Calculate the gradients
grads_and_var = trainer.compute_gradients(loss, params)
grads, var = zip(*grads_and_var)
if max_grad_norm is not None:
# Clip the gradients (normalize)
grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
grads_and_var = list(zip(grads, var))
# zip aggregate each gradient with parameters associated
# For instance zip(ABCD, xyza) => Ax, By, Cz, Da
_train = trainer.apply_gradients(grads_and_var)
def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
# Here we calculate advantage A(s,a) = R + yV(s') - V(s)
# Returns = R + yV(s')
advs = returns - values
# Normalize the advantages
advs = (advs - advs.mean()) / (advs.std() + 1e-8)
td_map = {train_model.X:obs, A:actions, ADV:advs, R:returns, LR:lr,
CLIPRANGE:cliprange, OLDNEGLOGPAC:neglogpacs, OLDVPRED:values}
if states is not None:
td_map[train_model.S] = states
td_map[train_model.M] = masks
return sess.run(
[pg_loss, vf_loss, entropy, approxkl, clipfrac, _train],
td_map
)[:-1]
self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
self.train = train
self.train_model = train_model
self.act_model = act_model
self.step = act_model.step
self.value = act_model.value
self.initial_state = act_model.initial_state
self.save = functools.partial(save_variables, sess=sess)
self.load = functools.partial(load_variables, sess=sess)
if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
initialize()
global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")
if MPI is not None:
sync_from_root(sess, global_variables) #pylint: disable=E1101
class Runner(AbstractEnvRunner):
"""
We use this object to make a mini batch of experiences
__init__:
- Initialize the runner
run():
- Make a mini batch
"""
def __init__(self, *, env, model, nsteps, gamma, lam):
super().__init__(env=env, model=model, nsteps=nsteps)
# Lambda used in GAE (General Advantage Estimation)
self.lam = lam
# Discount rate
self.gamma = gamma
def run(self):
# Here, we init the lists that will contain the mb of experiences
mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [],[],[],[],[],[]
mb_states = self.states
epinfos = []
# For n in range number of steps
for _ in range(self.nsteps):
# Given observations, get action value and neglopacs
# We already have self.obs because Runner superclass run self.obs[:] = env.reset() on init
actions, values, self.states, neglogpacs = self.model.step(self.obs, S=self.states, M=self.dones)
mb_obs.append(self.obs.copy())
mb_actions.append(actions)
mb_values.append(values)
mb_neglogpacs.append(neglogpacs)
mb_dones.append(self.dones)
# Take actions in env and look the results
# Infos contains a ton of useful informations
self.obs[:], rewards, self.dones, infos = self.env.step(actions)
for info in infos:
maybeepinfo = info.get('episode')
if maybeepinfo: epinfos.append(maybeepinfo)
mb_rewards.append(rewards)
#batch of steps to batch of rollouts
mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
mb_actions = np.asarray(mb_actions)
mb_values = np.asarray(mb_values, dtype=np.float32)
mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
mb_dones = np.asarray(mb_dones, dtype=np.bool)
last_values = self.model.value(self.obs, S=self.states, M=self.dones)
# discount/bootstrap off value fn
mb_returns = np.zeros_like(mb_rewards)
mb_advs = np.zeros_like(mb_rewards)
lastgaelam = 0
for t in reversed(range(self.nsteps)):
if t == self.nsteps - 1:
nextnonterminal = 1.0 - self.dones
nextvalues = last_values
else:
nextnonterminal = 1.0 - mb_dones[t+1]
nextvalues = mb_values[t+1]
delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t]
mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam
mb_returns = mb_advs + mb_values
return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs)),
mb_states, epinfos)
# obs, returns, masks, actions, values, neglogpacs, states = runner.run()
def sf01(arr):
"""
swap and then flatten axes 0 and 1
"""
s = arr.shape
return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])
def constfn(val):
def f(_):
@@ -230,7 +21,7 @@ def constfn(val):
def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4,
vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95,
log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
save_interval=0, load_path=None, **network_kwargs):
save_interval=0, load_path=None, model_fn=None, **network_kwargs):
'''
Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)
@@ -308,10 +99,14 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2
nbatch_train = nbatch // nminibatches
# Instantiate the model object (that creates act_model and train_model)
make_model = lambda : Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
if model_fn is None:
from baselines.ppo2.model import Model
model_fn = Model
model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
max_grad_norm=max_grad_norm)
model = make_model()
if load_path is not None:
model.load(load_path)
# Instantiate the runner object
@@ -319,8 +114,6 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2
if eval_env is not None:
eval_runner = Runner(env = eval_env, model = model, nsteps = nsteps, gamma = gamma, lam= lam)
epinfobuf = deque(maxlen=100)
if eval_env is not None:
eval_epinfobuf = deque(maxlen=100)

76
baselines/ppo2/runner.py Normal file
View File

@@ -0,0 +1,76 @@
import numpy as np
from baselines.common.runners import AbstractEnvRunner
class Runner(AbstractEnvRunner):
"""
We use this object to make a mini batch of experiences
__init__:
- Initialize the runner
run():
- Make a mini batch
"""
def __init__(self, *, env, model, nsteps, gamma, lam):
super().__init__(env=env, model=model, nsteps=nsteps)
# Lambda used in GAE (General Advantage Estimation)
self.lam = lam
# Discount rate
self.gamma = gamma
def run(self):
# Here, we init the lists that will contain the mb of experiences
mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [],[],[],[],[],[]
mb_states = self.states
epinfos = []
# For n in range number of steps
for _ in range(self.nsteps):
# Given observations, get action value and neglopacs
# We already have self.obs because Runner superclass run self.obs[:] = env.reset() on init
actions, values, self.states, neglogpacs = self.model.step(self.obs, S=self.states, M=self.dones)
mb_obs.append(self.obs.copy())
mb_actions.append(actions)
mb_values.append(values)
mb_neglogpacs.append(neglogpacs)
mb_dones.append(self.dones)
# Take actions in env and look the results
# Infos contains a ton of useful informations
self.obs[:], rewards, self.dones, infos = self.env.step(actions)
for info in infos:
maybeepinfo = info.get('episode')
if maybeepinfo: epinfos.append(maybeepinfo)
mb_rewards.append(rewards)
#batch of steps to batch of rollouts
mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
mb_actions = np.asarray(mb_actions)
mb_values = np.asarray(mb_values, dtype=np.float32)
mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
mb_dones = np.asarray(mb_dones, dtype=np.bool)
last_values = self.model.value(self.obs, S=self.states, M=self.dones)
# discount/bootstrap off value fn
mb_returns = np.zeros_like(mb_rewards)
mb_advs = np.zeros_like(mb_rewards)
lastgaelam = 0
for t in reversed(range(self.nsteps)):
if t == self.nsteps - 1:
nextnonterminal = 1.0 - self.dones
nextvalues = last_values
else:
nextnonterminal = 1.0 - mb_dones[t+1]
nextvalues = mb_values[t+1]
delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t]
mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam
mb_returns = mb_advs + mb_values
return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs)),
mb_states, epinfos)
# obs, returns, masks, actions, values, neglogpacs, states = runner.run()
def sf01(arr):
"""
swap and then flatten axes 0 and 1
"""
s = arr.shape
return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])

View File

@@ -0,0 +1,34 @@
import gym
import tensorflow as tf
import numpy as np
from functools import partial
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.tf_util import make_session
from baselines.ppo2.ppo2 import learn
from baselines.ppo2.microbatched_model import MicrobatchedModel
def test_microbatches():
def env_fn():
env = gym.make('CartPole-v0')
env.seed(0)
return env
learn_fn = partial(learn, network='mlp', nsteps=32, total_timesteps=32, seed=0)
env_ref = DummyVecEnv([env_fn])
sess_ref = make_session(make_default=True, graph=tf.Graph())
learn_fn(env=env_ref)
vars_ref = {v.name: sess_ref.run(v) for v in tf.trainable_variables()}
env_test = DummyVecEnv([env_fn])
sess_test = make_session(make_default=True, graph=tf.Graph())
learn_fn(env=env_test, model_fn=partial(MicrobatchedModel, microbatch_size=2))
vars_test = {v.name: sess_test.run(v) for v in tf.trainable_variables()}
for v in vars_ref:
np.testing.assert_allclose(vars_ref[v], vars_test[v], atol=1e-3)
if __name__ == '__main__':
test_microbatches()