ACKTR + A2C
@@ -9,10 +9,14 @@ These algorithms will make it easier for the research community to replicate, re

You can install it by typing:

```bash
pip install baselines
git clone https://github.com/openai/baselines.git
cd baselines
pip install -e .
```

- [A2C](baselines/a2c)
- [ACKTR](baselines/acktr)
- [DDPG](baselines/ddpg)
- [DQN](baselines/deepq)
- [PPO](baselines/pposgd)
- [TRPO](baselines/trpo_mpi)
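For reference, the new A2C trainer added in this commit ships with `baselines/a2c/run_atari.py`; the snippet below is a minimal usage sketch (not part of the commit) that simply calls its `train` helper with the same arguments as its `main()`.

```python
# Minimal usage sketch: drive the new A2C Atari trainer via the train()
# helper defined in baselines/a2c/run_atari.py (added below).
from baselines.a2c.run_atari import train

train('BreakoutNoFrameskip-v4', num_timesteps=int(40e6), seed=0,
      policy='cnn', lrschedule='linear', num_cpu=16)
```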
baselines/a2c/__init__.py (new file, 0 lines)

baselines/a2c/a2c.py (new file, 208 lines)

@@ -0,0 +1,208 @@
```python
import os.path as osp
import gym
import time
import joblib
import logging
import numpy as np
import tensorflow as tf
from baselines import logger

from baselines.common import set_global_seeds, explained_variance
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
from baselines.common.atari_wrappers import wrap_deepmind

from baselines.a2c.utils import discount_with_dones
from baselines.a2c.utils import Scheduler, make_path, find_trainable_variables
from baselines.a2c.policies import CnnPolicy
from baselines.a2c.utils import cat_entropy, mse

class Model(object):

    def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs,
            ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
            alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=num_procs,
                                inter_op_parallelism_threads=num_procs)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        nact = ac_space.n
        nbatch = nenvs*nsteps

        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
        train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr}
            if states != []:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train],
                td_map
            )
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            make_path(save_path)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)

class Runner(object):

    def __init__(self, env, model, nsteps=5, nstack=4, gamma=0.99):
        self.env = env
        self.model = model
        nh, nw, nc = env.observation_space.shape
        nenv = env.num_envs
        self.batch_ob_shape = (nenv*nsteps, nh, nw, nc*nstack)
        self.obs = np.zeros((nenv, nh, nw, nc*nstack), dtype=np.uint8)
        obs = env.reset()
        self.update_obs(obs)
        self.gamma = gamma
        self.nsteps = nsteps
        self.states = model.initial_state
        self.dones = [False for _ in range(nenv)]

    def update_obs(self, obs):
        # Do frame-stacking here instead of the FrameStack wrapper to reduce
        # IPC overhead
        self.obs = np.roll(self.obs, shift=-1, axis=3)
        self.obs[:, :, :, -1] = obs[:, :, :, 0]

    def run(self):
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
        mb_states = self.states
        for n in range(self.nsteps):
            actions, values, states = self.model.step(self.obs, self.states, self.dones)
            mb_obs.append(np.copy(self.obs))
            mb_actions.append(actions)
            mb_values.append(values)
            mb_dones.append(self.dones)
            obs, rewards, dones, _ = self.env.step(actions)
            self.states = states
            self.dones = dones
            for n, done in enumerate(dones):
                if done:
                    self.obs[n] = self.obs[n]*0
            self.update_obs(obs)
            mb_rewards.append(rewards)
        mb_dones.append(self.dones)
        #batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
        mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
        mb_masks = mb_dones[:, :-1]
        mb_dones = mb_dones[:, 1:]
        last_values = self.model.value(self.obs, self.states, self.dones).tolist()
        #discount/bootstrap off value fn
        for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
            rewards = rewards.tolist()
            dones = dones.tolist()
            if dones[-1] == 0:
                rewards = discount_with_dones(rewards+[value], dones+[0], self.gamma)[:-1]
            else:
                rewards = discount_with_dones(rewards, dones, self.gamma)
            mb_rewards[n] = rewards
        mb_rewards = mb_rewards.flatten()
        mb_actions = mb_actions.flatten()
        mb_values = mb_values.flatten()
        mb_masks = mb_masks.flatten()
        return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values

def learn(policy, env, seed, nsteps=5, nstack=4, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100):
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    num_procs = len(env.remotes) # HACK
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, nstack=nstack, num_procs=num_procs, ent_coef=ent_coef, vf_coef=vf_coef,
        max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule)
    runner = Runner(env, model, nsteps=nsteps, nstack=nstack, gamma=gamma)

    nbatch = nenvs*nsteps
    tstart = time.time()
    for update in range(1, total_timesteps//nbatch+1):
        obs, states, rewards, masks, actions, values = runner.run()
        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
        nseconds = time.time()-tstart
        fps = int((update*nbatch)/nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update*nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.dump_tabular()
    env.close()

def main():
    env_id = 'SpaceInvaders'
    seed = 42
    nenvs = 4

    def make_env(rank):
        def env_fn():
            env = gym.make('{}NoFrameskip-v4'.format(env_id))
            env.seed(seed + rank)
            if logger.get_dir():
                from baselines import bench
                env = bench.Monitor(env, osp.join(logger.get_dir(), "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return env_fn

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(nenvs)])
    policy = CnnPolicy
    learn(policy, env, seed)

if __name__ == '__main__':
    main()
```
baselines/a2c/policies.py (new file, 207 lines)

@@ -0,0 +1,207 @@
```python
import numpy as np
import tensorflow as tf
from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm, sample, check_shape
from baselines.common.distributions import make_pdtype
import baselines.common.tf_util as U
import gym

class LnLstmPolicy(object):
    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=256, reuse=False):
        nbatch = nenv*nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc*nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape) #obs
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=reuse):
            h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
            h3 = conv_to_fc(h3)
            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
            xs = batch_to_seq(h4, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            pi = fc(h5, 'pi', nact, act=lambda x:x)
            vf = fc(h5, 'v', 1, act=lambda x:x)

        v0 = vf[:, 0]
        a0 = sample(pi)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            a, v, s = sess.run([a0, v0, snew], {X:ob, S:state, M:mask})
            return a, v, s

        def value(ob, state, mask):
            return sess.run(v0, {X:ob, S:state, M:mask})

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value

class LstmPolicy(object):

    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=256, reuse=False):
        nbatch = nenv*nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc*nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape) #obs
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=reuse):
            h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
            h3 = conv_to_fc(h3)
            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
            xs = batch_to_seq(h4, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            pi = fc(h5, 'pi', nact, act=lambda x:x)
            vf = fc(h5, 'v', 1, act=lambda x:x)

        v0 = vf[:, 0]
        a0 = sample(pi)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            a, v, s = sess.run([a0, v0, snew], {X:ob, S:state, M:mask})
            return a, v, s

        def value(ob, state, mask):
            return sess.run(v0, {X:ob, S:state, M:mask})

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value

class CnnPolicy(object):

    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
        nbatch = nenv*nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc*nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape) #obs
        with tf.variable_scope("model", reuse=reuse):
            h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
            h3 = conv_to_fc(h3)
            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
            pi = fc(h4, 'pi', nact, act=lambda x:x)
            vf = fc(h4, 'v', 1, act=lambda x:x)

        v0 = vf[:, 0]
        a0 = sample(pi)
        self.initial_state = [] #not stateful

        def step(ob, *_args, **_kwargs):
            a, v = sess.run([a0, v0], {X:ob})
            return a, v, [] #dummy state

        def value(ob, *_args, **_kwargs):
            return sess.run(v0, {X:ob})

        self.X = X
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value

class AcerCnnPolicy(object):

    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
        nbatch = nenv * nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc * nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape)  # obs
        with tf.variable_scope("model", reuse=reuse):
            h = conv(tf.cast(X, tf.float32) / 255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
            h3 = conv_to_fc(h3)
            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
            pi_logits = fc(h4, 'pi', nact, act=lambda x: x, init_scale=0.01)
            pi = tf.nn.softmax(pi_logits)
            q = fc(h4, 'q', nact, act=lambda x: x)

        a = sample(pi_logits)  # could change this to use self.pi instead
        self.initial_state = []  # not stateful
        self.X = X
        self.pi = pi  # actual policy params now
        self.q = q

        def step(ob, *args, **kwargs):
            # returns actions, mus, states
            a0, pi0 = sess.run([a, pi], {X: ob})
            return a0, pi0, []  # dummy state

        def out(ob, *args, **kwargs):
            pi0, q0 = sess.run([pi, q], {X: ob})
            return pi0, q0

        def act(ob, *args, **kwargs):
            return sess.run(a, {X: ob})

        self.step = step
        self.out = out
        self.act = act

class AcerLstmPolicy(object):

    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256):
        nbatch = nenv * nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc * nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape)  # obs
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=reuse):
            h = conv(tf.cast(X, tf.float32) / 255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
            h3 = conv_to_fc(h3)
            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))

            # lstm
            xs = batch_to_seq(h4, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)

            pi_logits = fc(h5, 'pi', nact, act=lambda x: x, init_scale=0.01)
            pi = tf.nn.softmax(pi_logits)
            q = fc(h5, 'q', nact, act=lambda x: x)

        a = sample(pi_logits)  # could change this to use self.pi instead
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
        self.X = X
        self.M = M
        self.S = S
        self.pi = pi  # actual policy params now
        self.q = q

        def step(ob, state, mask, *args, **kwargs):
            # returns actions, mus, states
            a0, pi0, s = sess.run([a, pi, snew], {X: ob, S: state, M: mask})
            return a0, pi0, s

        self.step = step

# For Mujoco. Taken from PPOSGD
```
baselines/a2c/run_atari.py (new file, 41 lines)

@@ -0,0 +1,41 @@
```python
#!/usr/bin/env python
import os, logging, gym
from baselines import logger
from baselines.common import set_global_seeds
from baselines import bench
from baselines.a2c.a2c import learn
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
from baselines.common.atari_wrappers import wrap_deepmind
from baselines.a2c.policies import CnnPolicy, LstmPolicy, LnLstmPolicy

def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu):
    num_timesteps //= 4

    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(env, os.path.join(logger.get_dir(), "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])

    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    learn(policy_fn, env, seed, total_timesteps=num_timesteps, lrschedule=lrschedule)
    env.close()


def main():
    train('BreakoutNoFrameskip-v4', num_timesteps=int(40e6), seed=0, policy='cnn', lrschedule='linear', num_cpu=16)


if __name__ == '__main__':
    main()
```
baselines/a2c/utils.py (new file, 255 lines)

@@ -0,0 +1,255 @@
```python
import os
import gym
import numpy as np
import tensorflow as tf
from gym import spaces
from collections import deque

def sample(logits):
    noise = tf.random_uniform(tf.shape(logits))
    return tf.argmax(logits - tf.log(-tf.log(noise)), 1)

def cat_entropy(logits):
    a0 = logits - tf.reduce_max(logits, 1, keep_dims=True)
    ea0 = tf.exp(a0)
    z0 = tf.reduce_sum(ea0, 1, keep_dims=True)
    p0 = ea0 / z0
    return tf.reduce_sum(p0 * (tf.log(z0) - a0), 1)

def cat_entropy_softmax(p0):
    return - tf.reduce_sum(p0 * tf.log(p0 + 1e-6), axis = 1)

def mse(pred, target):
    return tf.square(pred-target)/2.

def ortho_init(scale=1.0):
    def _ortho_init(shape, dtype, partition_info=None):
        #lasagne ortho init for tf
        shape = tuple(shape)
        if len(shape) == 2:
            flat_shape = shape
        elif len(shape) == 4: # assumes NHWC
            flat_shape = (np.prod(shape[:-1]), shape[-1])
        else:
            raise NotImplementedError
        a = np.random.normal(0.0, 1.0, flat_shape)
        u, _, v = np.linalg.svd(a, full_matrices=False)
        q = u if u.shape == flat_shape else v # pick the one with the correct shape
        q = q.reshape(shape)
        return (scale * q[:shape[0], :shape[1]]).astype(np.float32)
    return _ortho_init

def conv(x, scope, nf, rf, stride, pad='VALID', act=tf.nn.relu, init_scale=1.0):
    with tf.variable_scope(scope):
        nin = x.get_shape()[3].value
        w = tf.get_variable("w", [rf, rf, nin, nf], initializer=ortho_init(init_scale))
        b = tf.get_variable("b", [nf], initializer=tf.constant_initializer(0.0))
        z = tf.nn.conv2d(x, w, strides=[1, stride, stride, 1], padding=pad)+b
        h = act(z)
        return h

def fc(x, scope, nh, act=tf.nn.relu, init_scale=1.0):
    with tf.variable_scope(scope):
        nin = x.get_shape()[1].value
        w = tf.get_variable("w", [nin, nh], initializer=ortho_init(init_scale))
        b = tf.get_variable("b", [nh], initializer=tf.constant_initializer(0.0))
        z = tf.matmul(x, w)+b
        h = act(z)
        return h

def batch_to_seq(h, nbatch, nsteps, flat=False):
    if flat:
        h = tf.reshape(h, [nbatch, nsteps])
    else:
        h = tf.reshape(h, [nbatch, nsteps, -1])
    return [tf.squeeze(v, [1]) for v in tf.split(axis=1, num_or_size_splits=nsteps, value=h)]

def seq_to_batch(h, flat = False):
    shape = h[0].get_shape().as_list()
    if not flat:
        assert(len(shape) > 1)
        nh = h[0].get_shape()[-1].value
        return tf.reshape(tf.concat(axis=1, values=h), [-1, nh])
    else:
        return tf.reshape(tf.stack(values=h, axis=1), [-1])

def lstm(xs, ms, s, scope, nh, init_scale=1.0):
    nbatch, nin = [v.value for v in xs[0].get_shape()]
    nsteps = len(xs)
    with tf.variable_scope(scope):
        wx = tf.get_variable("wx", [nin, nh*4], initializer=ortho_init(init_scale))
        wh = tf.get_variable("wh", [nh, nh*4], initializer=ortho_init(init_scale))
        b = tf.get_variable("b", [nh*4], initializer=tf.constant_initializer(0.0))

    c, h = tf.split(axis=1, num_or_size_splits=2, value=s)
    for idx, (x, m) in enumerate(zip(xs, ms)):
        c = c*(1-m)
        h = h*(1-m)
        z = tf.matmul(x, wx) + tf.matmul(h, wh) + b
        i, f, o, u = tf.split(axis=1, num_or_size_splits=4, value=z)
        i = tf.nn.sigmoid(i)
        f = tf.nn.sigmoid(f)
        o = tf.nn.sigmoid(o)
        u = tf.tanh(u)
        c = f*c + i*u
        h = o*tf.tanh(c)
        xs[idx] = h
    s = tf.concat(axis=1, values=[c, h])
    return xs, s

def _ln(x, g, b, e=1e-5, axes=[1]):
    u, s = tf.nn.moments(x, axes=axes, keep_dims=True)
    x = (x-u)/tf.sqrt(s+e)
    x = x*g+b
    return x

def lnlstm(xs, ms, s, scope, nh, init_scale=1.0):
    nbatch, nin = [v.value for v in xs[0].get_shape()]
    nsteps = len(xs)
    with tf.variable_scope(scope):
        wx = tf.get_variable("wx", [nin, nh*4], initializer=ortho_init(init_scale))
        gx = tf.get_variable("gx", [nh*4], initializer=tf.constant_initializer(1.0))
        bx = tf.get_variable("bx", [nh*4], initializer=tf.constant_initializer(0.0))

        wh = tf.get_variable("wh", [nh, nh*4], initializer=ortho_init(init_scale))
        gh = tf.get_variable("gh", [nh*4], initializer=tf.constant_initializer(1.0))
        bh = tf.get_variable("bh", [nh*4], initializer=tf.constant_initializer(0.0))

        b = tf.get_variable("b", [nh*4], initializer=tf.constant_initializer(0.0))

        gc = tf.get_variable("gc", [nh], initializer=tf.constant_initializer(1.0))
        bc = tf.get_variable("bc", [nh], initializer=tf.constant_initializer(0.0))

    c, h = tf.split(axis=1, num_or_size_splits=2, value=s)
    for idx, (x, m) in enumerate(zip(xs, ms)):
        c = c*(1-m)
        h = h*(1-m)
        z = _ln(tf.matmul(x, wx), gx, bx) + _ln(tf.matmul(h, wh), gh, bh) + b
        i, f, o, u = tf.split(axis=1, num_or_size_splits=4, value=z)
        i = tf.nn.sigmoid(i)
        f = tf.nn.sigmoid(f)
        o = tf.nn.sigmoid(o)
        u = tf.tanh(u)
        c = f*c + i*u
        h = o*tf.tanh(_ln(c, gc, bc))
        xs[idx] = h
    s = tf.concat(axis=1, values=[c, h])
    return xs, s

def conv_to_fc(x):
    nh = np.prod([v.value for v in x.get_shape()[1:]])
    x = tf.reshape(x, [-1, nh])
    return x

def discount_with_dones(rewards, dones, gamma):
    discounted = []
    r = 0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        r = reward + gamma*r*(1.-done) # fixed off by one bug
        discounted.append(r)
    return discounted[::-1]

def find_trainable_variables(key):
    with tf.variable_scope(key):
        return tf.trainable_variables()

def make_path(f):
    return os.makedirs(f, exist_ok=True)

def constant(p):
    return 1

def linear(p):
    return 1-p

schedules = {
    'linear':linear,
    'constant':constant
}

class Scheduler(object):

    def __init__(self, v, nvalues, schedule):
        self.n = 0.
        self.v = v
        self.nvalues = nvalues
        self.schedule = schedules[schedule]

    def value(self):
        current_value = self.v*self.schedule(self.n/self.nvalues)
        self.n += 1.
        return current_value

    def value_steps(self, steps):
        return self.v*self.schedule(steps/self.nvalues)


class EpisodeStats:
    def __init__(self, nsteps, nenvs):
        self.episode_rewards = []
        for i in range(nenvs):
            self.episode_rewards.append([])
        self.lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
        self.rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
        self.nsteps = nsteps
        self.nenvs = nenvs

    def feed(self, rewards, masks):
        rewards = np.reshape(rewards, [self.nenvs, self.nsteps])
        masks = np.reshape(masks, [self.nenvs, self.nsteps])
        for i in range(0, self.nenvs):
            for j in range(0, self.nsteps):
                self.episode_rewards[i].append(rewards[i][j])
                if masks[i][j]:
                    l = len(self.episode_rewards[i])
                    s = sum(self.episode_rewards[i])
                    self.lenbuffer.append(l)
                    self.rewbuffer.append(s)
                    self.episode_rewards[i] = []

    def mean_length(self):
        if self.lenbuffer:
            return np.mean(self.lenbuffer)
        else:
            return 0  # on the first params dump, no episodes are finished

    def mean_reward(self):
        if self.rewbuffer:
            return np.mean(self.rewbuffer)
        else:
            return 0


# For ACER
def get_by_index(x, idx):
    assert(len(x.get_shape()) == 2)
    assert(len(idx.get_shape()) == 1)
    idx_flattened = tf.range(0, x.shape[0]) * x.shape[1] + idx
    y = tf.gather(tf.reshape(x, [-1]),  # flatten input
                  idx_flattened)  # use flattened indices
    return y

def check_shape(ts, shapes):
    i = 0
    for (t, shape) in zip(ts, shapes):
        assert t.get_shape().as_list()==shape, "id " + str(i) + " shape " + str(t.get_shape()) + str(shape)
        i += 1

def avg_norm(t):
    return tf.reduce_mean(tf.sqrt(tf.reduce_sum(tf.square(t), axis=-1)))

def myadd(g1, g2, param):
    print([g1, g2, param.name])
    assert (not (g1 is None and g2 is None)), param.name
    if g1 is None:
        return g2
    elif g2 is None:
        return g1
    else:
        return g1 + g2

def my_explained_variance(qpred, q):
    _, vary = tf.nn.moments(q, axes=[0, 1])
    _, varpred = tf.nn.moments(q - qpred, axes=[0, 1])
    check_shape([vary, varpred], [[]] * 2)
    return 1.0 - (varpred / vary)
```
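As a quick illustration of how `discount_with_dones` above handles episode boundaries, the sketch below (not part of the commit) discounts a short reward sequence; the running return is reset wherever `dones` is 1, so rewards are never bootstrapped across an episode end.

```python
# Illustration only: returns from discount_with_dones with gamma = 0.99.
from baselines.a2c.utils import discount_with_dones

rewards = [1., 1., 1., 1.]
dones = [0, 0, 1, 0]  # the third step ends an episode, so discounting restarts there
print(discount_with_dones(rewards, dones, 0.99))
# -> [2.9701, 1.99, 1.0, 1.0]
```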
baselines/acktr/__init__.py (new file, 0 lines)

baselines/acktr/acktr_cont.py (new file, 138 lines)

@@ -0,0 +1,138 @@
```python
import numpy as np
import tensorflow as tf
from baselines import logger
from baselines import common
from baselines.common import tf_util as U
from baselines.acktr import kfac
from baselines.acktr.filters import ZFilter

def pathlength(path):
    return path["reward"].shape[0]# Loss function that we'll differentiate to get the policy gradient

def rollout(env, policy, max_pathlength, animate=False, obfilter=None):
    """
    Simulate the env and policy for max_pathlength steps
    """
    ob = env.reset()
    prev_ob = np.float32(np.zeros(ob.shape))
    if obfilter: ob = obfilter(ob)
    terminated = False

    obs = []
    acs = []
    ac_dists = []
    logps = []
    rewards = []
    for _ in range(max_pathlength):
        if animate:
            env.render()
        state = np.concatenate([ob, prev_ob], -1)
        obs.append(state)
        ac, ac_dist, logp = policy.act(state)
        acs.append(ac)
        ac_dists.append(ac_dist)
        logps.append(logp)
        prev_ob = np.copy(ob)
        scaled_ac = env.action_space.low + (ac + 1.) * 0.5 * (env.action_space.high - env.action_space.low)
        scaled_ac = np.clip(scaled_ac, env.action_space.low, env.action_space.high)
        ob, rew, done, _ = env.step(scaled_ac)
        if obfilter: ob = obfilter(ob)
        rewards.append(rew)
        if done:
            terminated = True
            break
    return {"observation" : np.array(obs), "terminated" : terminated,
            "reward" : np.array(rewards), "action" : np.array(acs),
            "action_dist": np.array(ac_dists), "logp" : np.array(logps)}

def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps,
    animate=False, callback=None, optimizer="adam", desired_kl=0.002):

    obfilter = ZFilter(env.observation_space.shape)

    max_pathlength = env.spec.timestep_limit
    stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize')
    inputs, loss, loss_sampled = policy.update_info
    optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=0.9, kfac_update=2,\
                                epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1,
                                weight_decay_dict=policy.wd_dict, max_grad_norm=None)
    pi_var_list = []
    for var in tf.trainable_variables():
        if "pi" in var.name:
            pi_var_list.append(var)

    update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list)
    do_update = U.function(inputs, update_op)
    U.initialize()

    # start queue runners
    enqueue_threads = []
    coord = tf.train.Coordinator()
    for qr in [q_runner, vf.q_runner]:
        assert (qr != None)
        enqueue_threads.extend(qr.create_threads(U.get_session(), coord=coord, start=True))

    i = 0
    timesteps_so_far = 0
    while True:
        if timesteps_so_far > num_timesteps:
            break
        logger.log("********** Iteration %i ************"%i)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            path = rollout(env, policy, max_pathlength, animate=(len(paths)==0 and (i % 10 == 0) and animate), obfilter=obfilter)
            paths.append(path)
            n = pathlength(path)
            timesteps_this_batch += n
            timesteps_so_far += n
            if timesteps_this_batch > timesteps_per_batch:
                break

        # Estimate advantage function
        vtargs = []
        advs = []
        for path in paths:
            rew_t = path["reward"]
            return_t = common.discount(rew_t, gamma)
            vtargs.append(return_t)
            vpred_t = vf.predict(path)
            vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1])
            delta_t = rew_t + gamma*vpred_t[1:] - vpred_t[:-1]
            adv_t = common.discount(delta_t, gamma * lam)
            advs.append(adv_t)
        # Update value function
        vf.fit(paths, vtargs)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        action_na = np.concatenate([path["action"] for path in paths])
        oldac_dist = np.concatenate([path["action_dist"] for path in paths])
        logp_n = np.concatenate([path["logp"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)

        # Policy update
        do_update(ob_no, action_na, standardized_adv_n)

        # Adjust stepsize
        kl = policy.compute_kl(ob_no, oldac_dist)
        if kl > desired_kl * 2:
            logger.log("kl too high")
            U.eval(tf.assign(stepsize, stepsize / 1.5))
        elif kl < desired_kl / 2:
            logger.log("kl too low")
            U.eval(tf.assign(stepsize, stepsize * 1.5))
        else:
            logger.log("kl just right!")

        logger.record_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths]))
        logger.record_tabular("EpRewSEM", np.std([path["reward"].sum()/np.sqrt(len(paths)) for path in paths]))
        logger.record_tabular("EpLenMean", np.mean([pathlength(path) for path in paths]))
        logger.record_tabular("KL", kl)
        if callback:
            callback()
        logger.dump_tabular()
        i += 1
```
baselines/acktr/acktr_disc.py (new file, 214 lines)

@@ -0,0 +1,214 @@
```python
import os.path as osp
import time
import joblib
import numpy as np
import tensorflow as tf
from baselines import logger

from baselines.common import set_global_seeds, explained_variance

from baselines.acktr.utils import discount_with_dones
from baselines.acktr.utils import Scheduler, find_trainable_variables
from baselines.acktr.utils import cat_entropy, mse
from baselines.acktr import kfac


class Model(object):

    def __init__(self, policy, ob_space, ac_space, nenvs,total_timesteps, nprocs=32, nsteps=20,
                 nstack=4, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
                 kfac_clip=0.001, lrschedule='linear'):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=nprocs,
                                inter_op_parallelism_threads=nprocs)
        config.gpu_options.allow_growth = True
        self.sess = sess = tf.Session(config=config)
        nact = ac_space.n
        nbatch = nenvs * nsteps
        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        PG_LR = tf.placeholder(tf.float32, [])
        VF_LR = tf.placeholder(tf.float32, [])

        self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
        self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)

        logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
        self.logits = logits = train_model.pi

        ##training loss
        pg_loss = tf.reduce_mean(ADV*logpac)
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        pg_loss = pg_loss - ent_coef * entropy
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        train_loss = pg_loss + vf_coef * vf_loss


        ##Fisher loss construction
        self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
        sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf))
        self.vf_fisher = vf_fisher_loss = - vf_fisher_coef*tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
        self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss

        self.params=params = find_trainable_variables("model")

        self.grads_check = grads = tf.gradients(train_loss,params)

        with tf.device('/gpu:0'):
            self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\
                momentum=0.9, kfac_update=1, epsilon=0.01,\
                stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)

            update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
            train_op, q_runner = optim.apply_gradients(list(zip(grads,params)))
        self.q_runner = q_runner
        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = self.lr.value()

            td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, PG_LR:cur_lr}
            if states != []:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, train_op],
                td_map
            )
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)



        self.train = train
        self.save = save
        self.load = load
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)

class Runner(object):

    def __init__(self, env, model, nsteps, nstack, gamma):
        self.env = env
        self.model = model
        nh, nw, nc = env.observation_space.shape
        nenv = env.num_envs
        self.batch_ob_shape = (nenv*nsteps, nh, nw, nc*nstack)
        self.obs = np.zeros((nenv, nh, nw, nc*nstack), dtype=np.uint8)
        obs = env.reset()
        self.update_obs(obs)
        self.gamma = gamma
        self.nsteps = nsteps
        self.states = model.initial_state
        self.dones = [False for _ in range(nenv)]

    def update_obs(self, obs):
        self.obs = np.roll(self.obs, shift=-1, axis=3)
        self.obs[:, :, :, -1] = obs[:, :, :, 0]

    def run(self):
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
        mb_states = self.states
        for n in range(self.nsteps):
            actions, values, states = self.model.step(self.obs, self.states, self.dones)
            mb_obs.append(np.copy(self.obs))
            mb_actions.append(actions)
            mb_values.append(values)
            mb_dones.append(self.dones)
            obs, rewards, dones, _ = self.env.step(actions)
            self.states = states
            self.dones = dones
            for n, done in enumerate(dones):
                if done:
                    self.obs[n] = self.obs[n]*0
            self.update_obs(obs)
            mb_rewards.append(rewards)
        mb_dones.append(self.dones)
        #batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
        mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
        mb_masks = mb_dones[:, :-1]
        mb_dones = mb_dones[:, 1:]
        last_values = self.model.value(self.obs, self.states, self.dones).tolist()
        #discount/bootstrap off value fn
        for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
            rewards = rewards.tolist()
            dones = dones.tolist()
            if dones[-1] == 0:
                rewards = discount_with_dones(rewards+[value], dones+[0], self.gamma)[:-1]
            else:
                rewards = discount_with_dones(rewards, dones, self.gamma)
            mb_rewards[n] = rewards
        mb_rewards = mb_rewards.flatten()
        mb_actions = mb_actions.flatten()
        mb_values = mb_values.flatten()
        mb_masks = mb_masks.flatten()
        return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values

def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20,
                 nstack=4, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
                 kfac_clip=0.001, save_interval=None, lrschedule='linear'):
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    make_model = lambda : Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs, nsteps
                                =nsteps, nstack=nstack, ent_coef=ent_coef, vf_coef=vf_coef, vf_fisher_coef=
                                vf_fisher_coef, lr=lr, max_grad_norm=max_grad_norm, kfac_clip=kfac_clip,
                                lrschedule=lrschedule)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()

    runner = Runner(env, model, nsteps=nsteps, nstack=nstack, gamma=gamma)
    nbatch = nenvs*nsteps
    tstart = time.time()
    enqueue_threads = model.q_runner.create_threads(model.sess, coord=tf.train.Coordinator(), start=True)
    for update in range(1, total_timesteps//nbatch+1):
        obs, states, rewards, masks, actions, values = runner.run()
        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
        model.old_obs = obs
        nseconds = time.time()-tstart
        fps = int((update*nbatch)/nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update*nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("policy_loss", float(policy_loss))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.dump_tabular()

        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            savepath = osp.join(logger.get_dir(), 'checkpoint%.5i'%update)
            print('Saving to', savepath)
            model.save(savepath)

    env.close()
```
baselines/acktr/filters.py (new file, 98 lines)

@@ -0,0 +1,98 @@
```python
from baselines.acktr.running_stat import RunningStat
from collections import deque
import numpy as np

class Filter(object):
    def __call__(self, x, update=True):
        raise NotImplementedError
    def reset(self):
        pass

class IdentityFilter(Filter):
    def __call__(self, x, update=True):
        return x

class CompositionFilter(Filter):
    def __init__(self, fs):
        self.fs = fs
    def __call__(self, x, update=True):
        for f in self.fs:
            x = f(x)
        return x
    def output_shape(self, input_space):
        out = input_space.shape
        for f in self.fs:
            out = f.output_shape(out)
        return out

class ZFilter(Filter):
    """
    y = (x-mean)/std
    using running estimates of mean,std
    """

    def __init__(self, shape, demean=True, destd=True, clip=10.0):
        self.demean = demean
        self.destd = destd
        self.clip = clip

        self.rs = RunningStat(shape)

    def __call__(self, x, update=True):
        if update: self.rs.push(x)
        if self.demean:
            x = x - self.rs.mean
        if self.destd:
            x = x / (self.rs.std+1e-8)
        if self.clip:
            x = np.clip(x, -self.clip, self.clip)
        return x
    def output_shape(self, input_space):
        return input_space.shape

class AddClock(Filter):
    def __init__(self):
        self.count = 0
    def reset(self):
        self.count = 0
    def __call__(self, x, update=True):
        return np.append(x, self.count/100.0)
    def output_shape(self, input_space):
        return (input_space.shape[0]+1,)

class FlattenFilter(Filter):
    def __call__(self, x, update=True):
        return x.ravel()
    def output_shape(self, input_space):
        return (int(np.prod(input_space.shape)),)

class Ind2OneHotFilter(Filter):
    def __init__(self, n):
        self.n = n
    def __call__(self, x, update=True):
        out = np.zeros(self.n)
        out[x] = 1
        return out
    def output_shape(self, input_space):
        return (input_space.n,)

class DivFilter(Filter):
    def __init__(self, divisor):
        self.divisor = divisor
    def __call__(self, x, update=True):
        return x / self.divisor
    def output_shape(self, input_space):
        return input_space.shape

class StackFilter(Filter):
    def __init__(self, length):
        self.stack = deque(maxlen=length)
    def reset(self):
        self.stack.clear()
    def __call__(self, x, update=True):
        self.stack.append(x)
        while len(self.stack) < self.stack.maxlen:
            self.stack.append(x)
        return np.concatenate(self.stack, axis=-1)
    def output_shape(self, input_space):
        return input_space.shape[:-1] + (input_space.shape[-1] * self.stack.maxlen,)
```
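A small usage sketch for `ZFilter` (illustrative, not part of the commit): it keeps running mean/std estimates via `RunningStat` and standardizes, then clips, each observation it is called on.

```python
# Illustration only: standardize observations with running statistics.
import numpy as np
from baselines.acktr.filters import ZFilter

obfilter = ZFilter((3,), clip=10.0)
for _ in range(1000):
    obfilter(np.random.randn(3) * 5.0 + 2.0)  # update running mean/std
print(obfilter(np.array([2.0, 2.0, 2.0]), update=False))  # roughly zero-centered
```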
926
baselines/acktr/kfac.py
Normal file
926
baselines/acktr/kfac.py
Normal file
@@ -0,0 +1,926 @@
|
||||
import tensorflow as tf
|
||||
import numpy as np
|
||||
import re
|
||||
from baselines.acktr.kfac_utils import *
|
||||
from functools import reduce
|
||||
|
||||
KFAC_OPS = ['MatMul', 'Conv2D', 'BiasAdd']
|
||||
KFAC_DEBUG = False
|
||||
|
||||
|
||||
class KfacOptimizer():
|
||||
|
||||
def __init__(self, learning_rate=0.01, momentum=0.9, clip_kl=0.01, kfac_update=2, stats_accum_iter=60, full_stats_init=False, cold_iter=100, cold_lr=None, async=False, async_stats=False, epsilon=1e-2, stats_decay=0.95, blockdiag_bias=False, channel_fac=False, factored_damping=False, approxT2=False, use_float64=False, weight_decay_dict={},max_grad_norm=0.5):
|
||||
self.max_grad_norm = max_grad_norm
|
||||
self._lr = learning_rate
|
||||
self._momentum = momentum
|
||||
self._clip_kl = clip_kl
|
||||
self._channel_fac = channel_fac
|
||||
self._kfac_update = kfac_update
|
||||
self._async = async
|
||||
self._async_stats = async_stats
|
||||
self._epsilon = epsilon
|
||||
self._stats_decay = stats_decay
|
||||
self._blockdiag_bias = blockdiag_bias
|
||||
self._approxT2 = approxT2
|
||||
self._use_float64 = use_float64
|
||||
self._factored_damping = factored_damping
|
||||
self._cold_iter = cold_iter
|
||||
if cold_lr == None:
|
||||
# good heuristics
|
||||
self._cold_lr = self._lr# * 3.
|
||||
else:
|
||||
self._cold_lr = cold_lr
|
||||
self._stats_accum_iter = stats_accum_iter
|
||||
self._weight_decay_dict = weight_decay_dict
|
||||
self._diag_init_coeff = 0.
|
||||
self._full_stats_init = full_stats_init
|
||||
if not self._full_stats_init:
|
||||
self._stats_accum_iter = self._cold_iter
|
||||
|
||||
self.sgd_step = tf.Variable(0, name='KFAC/sgd_step', trainable=False)
|
||||
self.global_step = tf.Variable(
|
||||
0, name='KFAC/global_step', trainable=False)
|
||||
self.cold_step = tf.Variable(0, name='KFAC/cold_step', trainable=False)
|
||||
self.factor_step = tf.Variable(
|
||||
0, name='KFAC/factor_step', trainable=False)
|
||||
self.stats_step = tf.Variable(
|
||||
0, name='KFAC/stats_step', trainable=False)
|
||||
self.vFv = tf.Variable(0., name='KFAC/vFv', trainable=False)
|
||||
|
||||
self.factors = {}
|
||||
self.param_vars = []
|
||||
self.stats = {}
|
||||
self.stats_eigen = {}
|
||||
|
||||
def getFactors(self, g, varlist):
|
||||
graph = tf.get_default_graph()
|
||||
factorTensors = {}
|
||||
fpropTensors = []
|
||||
bpropTensors = []
|
||||
opTypes = []
|
||||
fops = []
|
||||
|
||||
def searchFactors(gradient, graph):
|
||||
# hard coded search stratergy
|
||||
bpropOp = gradient.op
|
||||
bpropOp_name = bpropOp.name
|
||||
|
||||
bTensors = []
|
||||
fTensors = []
|
||||
|
||||
# combining additive gradient, assume they are the same op type and
|
||||
# indepedent
|
||||
if 'AddN' in bpropOp_name:
|
||||
factors = []
|
||||
for g in gradient.op.inputs:
|
||||
factors.append(searchFactors(g, graph))
|
||||
op_names = [item['opName'] for item in factors]
|
||||
# TO-DO: need to check all the attribute of the ops as well
|
||||
print (gradient.name)
|
||||
print (op_names)
|
||||
print (len(np.unique(op_names)))
|
||||
assert len(np.unique(op_names)) == 1, gradient.name + \
|
||||
' is shared among different computation OPs'
|
||||
|
||||
bTensors = reduce(lambda x, y: x + y,
|
||||
[item['bpropFactors'] for item in factors])
|
||||
if len(factors[0]['fpropFactors']) > 0:
|
||||
fTensors = reduce(
|
||||
lambda x, y: x + y, [item['fpropFactors'] for item in factors])
|
||||
fpropOp_name = op_names[0]
|
||||
fpropOp = factors[0]['op']
|
||||
else:
|
||||
fpropOp_name = re.search(
|
||||
'gradientsSampled(_[0-9]+|)/(.+?)_grad', bpropOp_name).group(2)
|
||||
fpropOp = graph.get_operation_by_name(fpropOp_name)
|
||||
if fpropOp.op_def.name in KFAC_OPS:
|
||||
# Known OPs
|
||||
###
|
||||
bTensor = [
|
||||
i for i in bpropOp.inputs if 'gradientsSampled' in i.name][-1]
|
||||
bTensorShape = fpropOp.outputs[0].get_shape()
|
||||
if bTensor.get_shape()[0].value == None:
|
||||
bTensor.set_shape(bTensorShape)
|
||||
bTensors.append(bTensor)
|
||||
###
|
||||
if fpropOp.op_def.name == 'BiasAdd':
|
||||
fTensors = []
|
||||
else:
|
||||
fTensors.append(
|
||||
[i for i in fpropOp.inputs if param.op.name not in i.name][0])
|
||||
fpropOp_name = fpropOp.op_def.name
|
||||
else:
|
||||
# unknown OPs, block approximation used
|
||||
bInputsList = [i for i in bpropOp.inputs[
|
||||
0].op.inputs if 'gradientsSampled' in i.name if 'Shape' not in i.name]
|
||||
if len(bInputsList) > 0:
|
||||
bTensor = bInputsList[0]
|
||||
bTensorShape = fpropOp.outputs[0].get_shape()
|
||||
if len(bTensor.get_shape()) > 0 and bTensor.get_shape()[0].value == None:
|
||||
bTensor.set_shape(bTensorShape)
|
||||
bTensors.append(bTensor)
|
||||
fpropOp_name = opTypes.append('UNK-' + fpropOp.op_def.name)
|
||||
|
||||
return {'opName': fpropOp_name, 'op': fpropOp, 'fpropFactors': fTensors, 'bpropFactors': bTensors}
|
||||
|
||||
for t, param in zip(g, varlist):
|
||||
if KFAC_DEBUG:
|
||||
print(('get factor for '+param.name))
|
||||
factors = searchFactors(t, graph)
|
||||
factorTensors[param] = factors
|
||||
|
||||
########
|
||||
# check associated weights and bias for homogeneous coordinate representation
|
||||
# and check redundent factors
|
||||
# TO-DO: there may be a bug to detect associate bias and weights for
|
||||
# forking layer, e.g. in inception models.
|
||||
for param in varlist:
|
||||
factorTensors[param]['assnWeights'] = None
|
||||
factorTensors[param]['assnBias'] = None
|
||||
for param in varlist:
|
||||
if factorTensors[param]['opName'] == 'BiasAdd':
|
||||
factorTensors[param]['assnWeights'] = None
|
||||
for item in varlist:
|
||||
if len(factorTensors[item]['bpropFactors']) > 0:
|
||||
if (set(factorTensors[item]['bpropFactors']) == set(factorTensors[param]['bpropFactors'])) and (len(factorTensors[item]['fpropFactors']) > 0):
|
||||
factorTensors[param]['assnWeights'] = item
|
||||
factorTensors[item]['assnBias'] = param
|
||||
factorTensors[param]['bpropFactors'] = factorTensors[
|
||||
item]['bpropFactors']
|
||||
|
||||
########
|
||||
|
||||
########
|
||||
# concatenate the additive gradients along the batch dimension, i.e.
|
||||
# assuming independence structure
|
||||
for key in ['fpropFactors', 'bpropFactors']:
|
||||
for i, param in enumerate(varlist):
|
||||
if len(factorTensors[param][key]) > 0:
|
||||
if (key + '_concat') not in factorTensors[param]:
|
||||
name_scope = factorTensors[param][key][0].name.split(':')[
|
||||
0]
|
||||
with tf.name_scope(name_scope):
|
||||
factorTensors[param][
|
||||
key + '_concat'] = tf.concat(factorTensors[param][key], 0)
|
||||
else:
|
||||
factorTensors[param][key + '_concat'] = None
|
||||
for j, param2 in enumerate(varlist[(i + 1):]):
|
||||
if (len(factorTensors[param][key]) > 0) and (set(factorTensors[param2][key]) == set(factorTensors[param][key])):
|
||||
factorTensors[param2][key] = factorTensors[param][key]
|
||||
factorTensors[param2][
|
||||
key + '_concat'] = factorTensors[param][key + '_concat']
|
||||
########
|
||||
|
||||
if KFAC_DEBUG:
|
||||
for items in zip(varlist, fpropTensors, bpropTensors, opTypes):
|
||||
print((items[0].name, factorTensors[item]))
|
||||
self.factors = factorTensors
|
||||
return factorTensors
|
||||
|
||||
def getStats(self, factors, varlist):
|
||||
if len(self.stats) == 0:
|
||||
# initialize stats variables on CPU because eigen decomp is
|
||||
# computed on CPU
|
||||
with tf.device('/cpu'):
|
||||
tmpStatsCache = {}
|
||||
|
||||
# search for tensor factors and
|
||||
# use block diag approx for the bias units
|
||||
for var in varlist:
|
||||
fpropFactor = factors[var]['fpropFactors_concat']
|
||||
bpropFactor = factors[var]['bpropFactors_concat']
|
||||
opType = factors[var]['opName']
|
||||
if opType == 'Conv2D':
|
||||
Kh = var.get_shape()[0]
|
||||
Kw = var.get_shape()[1]
|
||||
C = fpropFactor.get_shape()[-1]
|
||||
|
||||
Oh = bpropFactor.get_shape()[1]
|
||||
Ow = bpropFactor.get_shape()[2]
|
||||
if Oh == 1 and Ow == 1 and self._channel_fac:
|
||||
# factorization along the channels do not support
|
||||
# homogeneous coordinate
|
||||
var_assnBias = factors[var]['assnBias']
|
||||
if var_assnBias:
|
||||
factors[var]['assnBias'] = None
|
||||
factors[var_assnBias]['assnWeights'] = None
|
||||
##
|
||||
|
||||
for var in varlist:
|
||||
fpropFactor = factors[var]['fpropFactors_concat']
|
||||
bpropFactor = factors[var]['bpropFactors_concat']
|
||||
opType = factors[var]['opName']
|
||||
self.stats[var] = {'opName': opType,
|
||||
'fprop_concat_stats': [],
|
||||
'bprop_concat_stats': [],
|
||||
'assnWeights': factors[var]['assnWeights'],
|
||||
'assnBias': factors[var]['assnBias'],
|
||||
}
|
||||
if fpropFactor is not None:
|
||||
if fpropFactor not in tmpStatsCache:
|
||||
if opType == 'Conv2D':
|
||||
Kh = var.get_shape()[0]
|
||||
Kw = var.get_shape()[1]
|
||||
C = fpropFactor.get_shape()[-1]
|
||||
|
||||
Oh = bpropFactor.get_shape()[1]
|
||||
Ow = bpropFactor.get_shape()[2]
|
||||
if Oh == 1 and Ow == 1 and self._channel_fac:
|
||||
# factorization along the channels
|
||||
# assume independence bewteen input channels and spatial
|
||||
# 2K-1 x 2K-1 covariance matrix and C x C covariance matrix
|
||||
# factorization along the channels do not
|
||||
# support homogeneous coordinate, assnBias
|
||||
# is always None
|
||||
fpropFactor2_size = Kh * Kw
|
||||
slot_fpropFactor_stats2 = tf.Variable(tf.diag(tf.ones(
|
||||
[fpropFactor2_size])) * self._diag_init_coeff, name='KFAC_STATS/' + fpropFactor.op.name, trainable=False)
|
||||
self.stats[var]['fprop_concat_stats'].append(
|
||||
slot_fpropFactor_stats2)
|
||||
|
||||
fpropFactor_size = C
|
||||
else:
|
||||
# 2K-1 x 2K-1 x C x C covariance matrix
|
||||
# assume BHWC
|
||||
fpropFactor_size = Kh * Kw * C
|
||||
else:
|
||||
# D x D covariance matrix
|
||||
fpropFactor_size = fpropFactor.get_shape()[-1]
|
||||
|
||||
# use homogeneous coordinate
|
||||
if not self._blockdiag_bias and self.stats[var]['assnBias']:
|
||||
fpropFactor_size += 1
|
||||
|
||||
slot_fpropFactor_stats = tf.Variable(tf.diag(tf.ones(
|
||||
[fpropFactor_size])) * self._diag_init_coeff, name='KFAC_STATS/' + fpropFactor.op.name, trainable=False)
|
||||
self.stats[var]['fprop_concat_stats'].append(
|
||||
slot_fpropFactor_stats)
|
||||
if opType != 'Conv2D':
|
||||
tmpStatsCache[fpropFactor] = self.stats[
|
||||
var]['fprop_concat_stats']
|
||||
else:
|
||||
self.stats[var][
|
||||
'fprop_concat_stats'] = tmpStatsCache[fpropFactor]
|
||||
|
||||
if bpropFactor is not None:
|
||||
# no need to collect backward stats for bias vectors if
|
||||
# using homogeneous coordinates
|
||||
if not((not self._blockdiag_bias) and self.stats[var]['assnWeights']):
|
||||
if bpropFactor not in tmpStatsCache:
|
||||
slot_bpropFactor_stats = tf.Variable(tf.diag(tf.ones([bpropFactor.get_shape(
|
||||
)[-1]])) * self._diag_init_coeff, name='KFAC_STATS/' + bpropFactor.op.name, trainable=False)
|
||||
self.stats[var]['bprop_concat_stats'].append(
|
||||
slot_bpropFactor_stats)
|
||||
tmpStatsCache[bpropFactor] = self.stats[
|
||||
var]['bprop_concat_stats']
|
||||
else:
|
||||
self.stats[var][
|
||||
'bprop_concat_stats'] = tmpStatsCache[bpropFactor]
|
||||
|
||||
return self.stats
|
||||
|
||||
def compute_and_apply_stats(self, loss_sampled, var_list=None):
|
||||
varlist = var_list
|
||||
if varlist is None:
|
||||
varlist = tf.trainable_variables()
|
||||
|
||||
stats = self.compute_stats(loss_sampled, var_list=varlist)
|
||||
return self.apply_stats(stats)
|
||||
|
||||
def compute_stats(self, loss_sampled, var_list=None):
|
||||
varlist = var_list
|
||||
if varlist is None:
|
||||
varlist = tf.trainable_variables()
|
||||
|
||||
gs = tf.gradients(loss_sampled, varlist, name='gradientsSampled')
|
||||
self.gs = gs
|
||||
factors = self.getFactors(gs, varlist)
|
||||
stats = self.getStats(factors, varlist)
|
||||
|
||||
updateOps = []
|
||||
statsUpdates = {}
|
||||
statsUpdates_cache = {}
|
||||
for var in varlist:
|
||||
opType = factors[var]['opName']
|
||||
fops = factors[var]['op']
|
||||
fpropFactor = factors[var]['fpropFactors_concat']
|
||||
fpropStats_vars = stats[var]['fprop_concat_stats']
|
||||
bpropFactor = factors[var]['bpropFactors_concat']
|
||||
bpropStats_vars = stats[var]['bprop_concat_stats']
|
||||
SVD_factors = {}
|
||||
for stats_var in fpropStats_vars:
|
||||
stats_var_dim = int(stats_var.get_shape()[0])
|
||||
if stats_var not in statsUpdates_cache:
|
||||
old_fpropFactor = fpropFactor
|
||||
B = (tf.shape(fpropFactor)[0]) # batch size
|
||||
if opType == 'Conv2D':
|
||||
strides = fops.get_attr("strides")
|
||||
padding = fops.get_attr("padding")
|
||||
convkernel_size = var.get_shape()[0:3]
|
||||
|
||||
KH = int(convkernel_size[0])
|
||||
KW = int(convkernel_size[1])
|
||||
C = int(convkernel_size[2])
|
||||
flatten_size = int(KH * KW * C)
|
||||
|
||||
Oh = int(bpropFactor.get_shape()[1])
|
||||
Ow = int(bpropFactor.get_shape()[2])
|
||||
|
||||
if Oh == 1 and Ow == 1 and self._channel_fac:
|
||||
# factorization along the channels
|
||||
# assume independence among input channels
|
||||
# factor = B x 1 x 1 x (KH x KW x C)
|
||||
# patches = B x Oh x Ow x (KH x KW x C)
|
||||
if len(SVD_factors) == 0:
|
||||
if KFAC_DEBUG:
|
||||
print(('approx %s act factor with rank-1 SVD factors' % (var.name)))
|
||||
# find closest rank-1 approx to the feature map
|
||||
S, U, V = tf.batch_svd(tf.reshape(
|
||||
fpropFactor, [-1, KH * KW, C]))
|
||||
# get rank-1 approx slices
|
||||
sqrtS1 = tf.expand_dims(tf.sqrt(S[:, 0, 0]), 1)
|
||||
patches_k = U[:, :, 0] * sqrtS1 # B x KH*KW
|
||||
full_factor_shape = fpropFactor.get_shape()
|
||||
patches_k.set_shape(
|
||||
[full_factor_shape[0], KH * KW])
|
||||
patches_c = V[:, :, 0] * sqrtS1 # B x C
|
||||
patches_c.set_shape([full_factor_shape[0], C])
|
||||
SVD_factors[C] = patches_c
|
||||
SVD_factors[KH * KW] = patches_k
|
||||
fpropFactor = SVD_factors[stats_var_dim]
|
||||
|
||||
else:
|
||||
# poor mem usage implementation
|
||||
patches = tf.extract_image_patches(fpropFactor, ksizes=[1, convkernel_size[
|
||||
0], convkernel_size[1], 1], strides=strides, rates=[1, 1, 1, 1], padding=padding)
|
||||
|
||||
if self._approxT2:
|
||||
if KFAC_DEBUG:
|
||||
print(('approxT2 act fisher for %s' % (var.name)))
|
||||
# T^2 terms * 1/T^2, size: B x C
|
||||
fpropFactor = tf.reduce_mean(patches, [1, 2])
|
||||
else:
|
||||
# size: (B x Oh x Ow) x C
|
||||
fpropFactor = tf.reshape(
|
||||
patches, [-1, flatten_size]) / Oh / Ow
|
||||
fpropFactor_size = int(fpropFactor.get_shape()[-1])
|
||||
if stats_var_dim == (fpropFactor_size + 1) and not self._blockdiag_bias:
|
||||
if opType == 'Conv2D' and not self._approxT2:
|
||||
# correct padding for numerical stability (we
|
||||
# divided out OhxOw from activations for T1 approx)
|
||||
fpropFactor = tf.concat([fpropFactor, tf.ones(
|
||||
[tf.shape(fpropFactor)[0], 1]) / Oh / Ow], 1)
|
||||
else:
|
||||
# use homogeneous coordinates
|
||||
fpropFactor = tf.concat(
|
||||
[fpropFactor, tf.ones([tf.shape(fpropFactor)[0], 1])], 1)
|
||||
|
||||
# average over the number of data points in a batch
|
||||
# divided by B
|
||||
cov = tf.matmul(fpropFactor, fpropFactor,
|
||||
transpose_a=True) / tf.cast(B, tf.float32)
|
||||
updateOps.append(cov)
|
||||
statsUpdates[stats_var] = cov
|
||||
if opType != 'Conv2D':
|
||||
# HACK: for convolution we recompute fprop stats for
|
||||
# every layer including forking layers
|
||||
statsUpdates_cache[stats_var] = cov
|
||||
|
||||
for stats_var in bpropStats_vars:
|
||||
stats_var_dim = int(stats_var.get_shape()[0])
|
||||
if stats_var not in statsUpdates_cache:
|
||||
old_bpropFactor = bpropFactor
|
||||
bpropFactor_shape = bpropFactor.get_shape()
|
||||
B = tf.shape(bpropFactor)[0] # batch size
|
||||
C = int(bpropFactor_shape[-1]) # num channels
|
||||
if opType == 'Conv2D' or len(bpropFactor_shape) == 4:
|
||||
if fpropFactor is not None:
|
||||
if self._approxT2:
|
||||
if KFAC_DEBUG:
|
||||
print(('approxT2 grad fisher for %s' % (var.name)))
|
||||
bpropFactor = tf.reduce_sum(
|
||||
bpropFactor, [1, 2]) # T^2 terms * 1/T^2
|
||||
else:
|
||||
bpropFactor = tf.reshape(
|
||||
bpropFactor, [-1, C]) * Oh * Ow # T * 1/T terms
|
||||
else:
|
||||
# just doing block diag approx; the spatially independent
|
||||
# structure does not apply here. summing over
|
||||
# spatial locations
|
||||
if KFAC_DEBUG:
|
||||
print(('block diag approx fisher for %s' % (var.name)))
|
||||
bpropFactor = tf.reduce_sum(bpropFactor, [1, 2])
|
||||
|
||||
# assume sampled loss is averaged. TO-DO: figure out a better
|
||||
# way to handle this
|
||||
bpropFactor *= tf.to_float(B)
|
||||
##
|
||||
|
||||
cov_b = tf.matmul(
|
||||
bpropFactor, bpropFactor, transpose_a=True) / tf.to_float(tf.shape(bpropFactor)[0])
|
||||
|
||||
updateOps.append(cov_b)
|
||||
statsUpdates[stats_var] = cov_b
|
||||
statsUpdates_cache[stats_var] = cov_b
|
||||
|
||||
if KFAC_DEBUG:
|
||||
aKey = list(statsUpdates.keys())[0]
|
||||
statsUpdates[aKey] = tf.Print(statsUpdates[aKey],
|
||||
[tf.convert_to_tensor('step:'),
|
||||
self.global_step,
|
||||
tf.convert_to_tensor(
|
||||
'computing stats'),
|
||||
])
|
||||
self.statsUpdates = statsUpdates
|
||||
return statsUpdates
|
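For orientation: the cov ops assembled above are batch-averaged outer products, one from the layer inputs (the fprop factor) and one from the sampled-loss gradients with respect to the layer outputs (the bprop factor). A minimal NumPy sketch for a single dense layer; the shapes and names are illustrative, not taken from the diff:

```python
import numpy as np

# toy sizes: batch of 32 activations of width 64 feeding a dense layer with 10 outputs
B, D_in, D_out = 32, 64, 10
a = np.random.randn(B, D_in)    # layer input, analogue of fpropFactors_concat
g = np.random.randn(B, D_out)   # sampled-loss gradient w.r.t. layer output, analogue of bpropFactors_concat

# Kronecker factors of the Fisher block, matching the cov / cov_b ops above
A = a.T @ a / B                 # D_in x D_in, goes into fprop_concat_stats
G = g.T @ g / B                 # D_out x D_out, goes into bprop_concat_stats

# with the homogeneous coordinate (bias folded into the weight matrix), a column of
# ones is appended to the activations first, as the tf.concat above does
a_h = np.concatenate([a, np.ones((B, 1))], axis=1)
A_h = a_h.T @ a_h / B           # (D_in+1) x (D_in+1)
```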
||||
|
||||
def apply_stats(self, statsUpdates):
|
||||
""" compute stats and update/apply the new stats to the running average
|
||||
"""
|
||||
|
||||
def updateAccumStats():
|
||||
if self._full_stats_init:
|
||||
return tf.cond(tf.greater(self.sgd_step, self._cold_iter), lambda: tf.group(*self._apply_stats(statsUpdates, accumulate=True, accumulateCoeff=1. / self._stats_accum_iter)), tf.no_op)
|
||||
else:
|
||||
return tf.group(*self._apply_stats(statsUpdates, accumulate=True, accumulateCoeff=1. / self._stats_accum_iter))
|
||||
|
||||
def updateRunningAvgStats(statsUpdates, fac_iter=1):
|
||||
# return tf.cond(tf.greater_equal(self.factor_step,
|
||||
# tf.convert_to_tensor(fac_iter)), lambda:
|
||||
# tf.group(*self._apply_stats(stats_list, varlist)), tf.no_op)
|
||||
return tf.group(*self._apply_stats(statsUpdates))
|
||||
|
||||
if self._async_stats:
|
||||
# asynchronous stats update
|
||||
update_stats = self._apply_stats(statsUpdates)
|
||||
|
||||
queue = tf.FIFOQueue(1, [item.dtype for item in update_stats], shapes=[
|
||||
item.get_shape() for item in update_stats])
|
||||
enqueue_op = queue.enqueue(update_stats)
|
||||
|
||||
def dequeue_stats_op():
|
||||
return queue.dequeue()
|
||||
self.qr_stats = tf.train.QueueRunner(queue, [enqueue_op])
|
||||
update_stats_op = tf.cond(tf.equal(queue.size(), tf.convert_to_tensor(
|
||||
0)), tf.no_op, lambda: tf.group(*[dequeue_stats_op(), ]))
|
||||
else:
|
||||
# synchronous stats update
|
||||
update_stats_op = tf.cond(tf.greater_equal(
|
||||
self.stats_step, self._stats_accum_iter), lambda: updateRunningAvgStats(statsUpdates), updateAccumStats)
|
||||
self._update_stats_op = update_stats_op
|
||||
return update_stats_op
|
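The cond above selects between two running estimates of the same statistics. A rough NumPy sketch of the arithmetic, with made-up sizes and hyperparameters:

```python
import numpy as np

stat = np.zeros((4, 4))        # stands in for one KFAC_STATS variable
new = np.eye(4)                # freshly computed covariance for the current batch
stats_decay, stats_accum_iter = 0.95, 10

# cold phase (updateAccumStats): plain superbatch averaging over stats_accum_iter batches
stat += (1.0 / stats_accum_iter) * new

# warm phase (updateRunningAvgStats -> _apply_stats): exponential running average,
# i.e. the assign followed by assign_add above
stat = stat * stats_decay + (1.0 - stats_decay) * new
```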
||||
|
||||
def _apply_stats(self, statsUpdates, accumulate=False, accumulateCoeff=0.):
|
||||
updateOps = []
|
||||
# obtain the stats var list
|
||||
for stats_var in statsUpdates:
|
||||
stats_new = statsUpdates[stats_var]
|
||||
if accumulate:
|
||||
# simple superbatch averaging
|
||||
update_op = tf.assign_add(
|
||||
stats_var, accumulateCoeff * stats_new, use_locking=True)
|
||||
else:
|
||||
# exponential running averaging
|
||||
update_op = tf.assign(
|
||||
stats_var, stats_var * self._stats_decay, use_locking=True)
|
||||
update_op = tf.assign_add(
|
||||
update_op, (1. - self._stats_decay) * stats_new, use_locking=True)
|
||||
updateOps.append(update_op)
|
||||
|
||||
with tf.control_dependencies(updateOps):
|
||||
stats_step_op = tf.assign_add(self.stats_step, 1)
|
||||
|
||||
if KFAC_DEBUG:
|
||||
stats_step_op = (tf.Print(stats_step_op,
|
||||
[tf.convert_to_tensor('step:'),
|
||||
self.global_step,
|
||||
tf.convert_to_tensor('fac step:'),
|
||||
self.factor_step,
|
||||
tf.convert_to_tensor('sgd step:'),
|
||||
self.sgd_step,
|
||||
tf.convert_to_tensor('Accum:'),
|
||||
tf.convert_to_tensor(accumulate),
|
||||
tf.convert_to_tensor('Accum coeff:'),
|
||||
tf.convert_to_tensor(accumulateCoeff),
|
||||
tf.convert_to_tensor('stat step:'),
|
||||
self.stats_step, updateOps[0], updateOps[1]]))
|
||||
return [stats_step_op, ]
|
||||
|
||||
def getStatsEigen(self, stats=None):
|
||||
if len(self.stats_eigen) == 0:
|
||||
stats_eigen = {}
|
||||
if stats is None:
|
||||
stats = self.stats
|
||||
|
||||
tmpEigenCache = {}
|
||||
with tf.device('/cpu:0'):
|
||||
for var in stats:
|
||||
for key in ['fprop_concat_stats', 'bprop_concat_stats']:
|
||||
for stats_var in stats[var][key]:
|
||||
if stats_var not in tmpEigenCache:
|
||||
stats_dim = stats_var.get_shape()[1].value
|
||||
e = tf.Variable(tf.ones(
|
||||
[stats_dim]), name='KFAC_FAC/' + stats_var.name.split(':')[0] + '/e', trainable=False)
|
||||
Q = tf.Variable(tf.diag(tf.ones(
|
||||
[stats_dim])), name='KFAC_FAC/' + stats_var.name.split(':')[0] + '/Q', trainable=False)
|
||||
stats_eigen[stats_var] = {'e': e, 'Q': Q}
|
||||
tmpEigenCache[
|
||||
stats_var] = stats_eigen[stats_var]
|
||||
else:
|
||||
stats_eigen[stats_var] = tmpEigenCache[
|
||||
stats_var]
|
||||
self.stats_eigen = stats_eigen
|
||||
return self.stats_eigen
|
||||
|
||||
def computeStatsEigen(self):
|
||||
""" compute the eigen decomp using copied var stats to avoid concurrent read/write from other queue """
|
||||
# TO-DO: figure out why this op has delays (possibly moving
|
||||
# eigenvectors around?)
|
||||
with tf.device('/cpu:0'):
|
||||
def removeNone(tensor_list):
|
||||
local_list = []
|
||||
for item in tensor_list:
|
||||
if item is not None:
|
||||
local_list.append(item)
|
||||
return local_list
|
||||
|
||||
def copyStats(var_list):
|
||||
print("copying stats to buffer tensors before eigen decomp")
|
||||
redundant_stats = {}
|
||||
copied_list = []
|
||||
for item in var_list:
|
||||
if item is not None:
|
||||
if item not in redundant_stats:
|
||||
if self._use_float64:
|
||||
redundant_stats[item] = tf.cast(
|
||||
tf.identity(item), tf.float64)
|
||||
else:
|
||||
redundant_stats[item] = tf.identity(item)
|
||||
copied_list.append(redundant_stats[item])
|
||||
else:
|
||||
copied_list.append(None)
|
||||
return copied_list
|
||||
#stats = [copyStats(self.fStats), copyStats(self.bStats)]
|
||||
#stats = [self.fStats, self.bStats]
|
||||
|
||||
stats_eigen = self.stats_eigen
|
||||
computedEigen = {}
|
||||
eigen_reverse_lookup = {}
|
||||
updateOps = []
|
||||
# sync copied stats
|
||||
# with tf.control_dependencies(removeNone(stats[0]) +
|
||||
# removeNone(stats[1])):
|
||||
with tf.control_dependencies([]):
|
||||
for stats_var in stats_eigen:
|
||||
if stats_var not in computedEigen:
|
||||
eigens = tf.self_adjoint_eig(stats_var)
|
||||
e = eigens[0]
|
||||
Q = eigens[1]
|
||||
if self._use_float64:
|
||||
e = tf.cast(e, tf.float32)
|
||||
Q = tf.cast(Q, tf.float32)
|
||||
updateOps.append(e)
|
||||
updateOps.append(Q)
|
||||
computedEigen[stats_var] = {'e': e, 'Q': Q}
|
||||
eigen_reverse_lookup[e] = stats_eigen[stats_var]['e']
|
||||
eigen_reverse_lookup[Q] = stats_eigen[stats_var]['Q']
|
||||
|
||||
self.eigen_reverse_lookup = eigen_reverse_lookup
|
||||
self.eigen_update_list = updateOps
|
||||
|
||||
if KFAC_DEBUG:
|
||||
self.eigen_update_list = [item for item in updateOps]
|
||||
with tf.control_dependencies(updateOps):
|
||||
updateOps.append(tf.Print(tf.constant(
|
||||
0.), [tf.convert_to_tensor('computed factor eigen')]))
|
||||
|
||||
return updateOps
|
||||
|
||||
def applyStatsEigen(self, eigen_list):
|
||||
updateOps = []
|
||||
print(('updating %d eigenvalue/vectors' % len(eigen_list)))
|
||||
for i, (tensor, mark) in enumerate(zip(eigen_list, self.eigen_update_list)):
|
||||
stats_eigen_var = self.eigen_reverse_lookup[mark]
|
||||
updateOps.append(
|
||||
tf.assign(stats_eigen_var, tensor, use_locking=True))
|
||||
|
||||
with tf.control_dependencies(updateOps):
|
||||
factor_step_op = tf.assign_add(self.factor_step, 1)
|
||||
updateOps.append(factor_step_op)
|
||||
if KFAC_DEBUG:
|
||||
updateOps.append(tf.Print(tf.constant(
|
||||
0.), [tf.convert_to_tensor('updated kfac factors')]))
|
||||
return updateOps
|
||||
|
||||
def getKfacPrecondUpdates(self, gradlist, varlist):
|
||||
updatelist = []
|
||||
vg = 0.
|
||||
|
||||
assert len(self.stats) > 0
|
||||
assert len(self.stats_eigen) > 0
|
||||
assert len(self.factors) > 0
|
||||
counter = 0
|
||||
|
||||
grad_dict = {var: grad for grad, var in zip(gradlist, varlist)}
|
||||
|
||||
for grad, var in zip(gradlist, varlist):
|
||||
GRAD_RESHAPE = False
|
||||
GRAD_TRANSPOSE = False
|
||||
|
||||
fpropFactoredFishers = self.stats[var]['fprop_concat_stats']
|
||||
bpropFactoredFishers = self.stats[var]['bprop_concat_stats']
|
||||
|
||||
if (len(fpropFactoredFishers) + len(bpropFactoredFishers)) > 0:
|
||||
counter += 1
|
||||
GRAD_SHAPE = grad.get_shape()
|
||||
if len(grad.get_shape()) > 2:
|
||||
# reshape conv kernel parameters
|
||||
KW = int(grad.get_shape()[0])
|
||||
KH = int(grad.get_shape()[1])
|
||||
C = int(grad.get_shape()[2])
|
||||
D = int(grad.get_shape()[3])
|
||||
|
||||
if len(fpropFactoredFishers) > 1 and self._channel_fac:
|
||||
# reshape conv kernel parameters into tensor
|
||||
grad = tf.reshape(grad, [KW * KH, C, D])
|
||||
else:
|
||||
# reshape conv kernel parameters into 2D grad
|
||||
grad = tf.reshape(grad, [-1, D])
|
||||
GRAD_RESHAPE = True
|
||||
elif len(grad.get_shape()) == 1:
|
||||
# reshape bias or 1D parameters
|
||||
D = int(grad.get_shape()[0])
|
||||
|
||||
grad = tf.expand_dims(grad, 0)
|
||||
GRAD_RESHAPE = True
|
||||
else:
|
||||
# 2D parameters
|
||||
C = int(grad.get_shape()[0])
|
||||
D = int(grad.get_shape()[1])
|
||||
|
||||
if (self.stats[var]['assnBias'] is not None) and not self._blockdiag_bias:
|
||||
# using homogeneous coordinates only works for 2D grad.
|
||||
# TO-DO: figure out how to factorize bias grad
|
||||
# stack bias grad
|
||||
var_assnBias = self.stats[var]['assnBias']
|
||||
grad = tf.concat(
|
||||
[grad, tf.expand_dims(grad_dict[var_assnBias], 0)], 0)
|
||||
|
||||
# project gradient to eigen space and reshape the eigenvalues
|
||||
# for broadcasting
|
||||
eigVals = []
|
||||
|
||||
for idx, stats in enumerate(self.stats[var]['fprop_concat_stats']):
|
||||
Q = self.stats_eigen[stats]['Q']
|
||||
e = detectMinVal(self.stats_eigen[stats][
|
||||
'e'], var, name='act', debug=KFAC_DEBUG)
|
||||
|
||||
Q, e = factorReshape(Q, e, grad, facIndx=idx, ftype='act')
|
||||
eigVals.append(e)
|
||||
grad = gmatmul(Q, grad, transpose_a=True, reduce_dim=idx)
|
||||
|
||||
for idx, stats in enumerate(self.stats[var]['bprop_concat_stats']):
|
||||
Q = self.stats_eigen[stats]['Q']
|
||||
e = detectMinVal(self.stats_eigen[stats][
|
||||
'e'], var, name='grad', debug=KFAC_DEBUG)
|
||||
|
||||
Q, e = factorReshape(Q, e, grad, facIndx=idx, ftype='grad')
|
||||
eigVals.append(e)
|
||||
grad = gmatmul(grad, Q, transpose_b=False, reduce_dim=idx)
|
||||
##
|
||||
|
||||
#####
|
||||
# whiten using eigenvalues
|
||||
weightDecayCoeff = 0.
|
||||
if var in self._weight_decay_dict:
|
||||
weightDecayCoeff = self._weight_decay_dict[var]
|
||||
if KFAC_DEBUG:
|
||||
print(('weight decay coeff for %s is %f' % (var.name, weightDecayCoeff)))
|
||||
|
||||
if self._factored_damping:
|
||||
if KFAC_DEBUG:
|
||||
print(('use factored damping for %s' % (var.name)))
|
||||
coeffs = 1.
|
||||
num_factors = len(eigVals)
|
||||
# compute the ratio of the trace norms of the left and right
|
||||
# KFac matrices, and their generalization
|
||||
if len(eigVals) == 1:
|
||||
damping = self._epsilon + weightDecayCoeff
|
||||
else:
|
||||
damping = tf.pow(
|
||||
self._epsilon + weightDecayCoeff, 1. / num_factors)
|
||||
eigVals_tnorm_avg = [tf.reduce_mean(
|
||||
tf.abs(e)) for e in eigVals]
|
||||
for e, e_tnorm in zip(eigVals, eigVals_tnorm_avg):
|
||||
eig_tnorm_negList = [
|
||||
item for item in eigVals_tnorm_avg if item != e_tnorm]
|
||||
if len(eigVals) == 1:
|
||||
adjustment = 1.
|
||||
elif len(eigVals) == 2:
|
||||
adjustment = tf.sqrt(
|
||||
e_tnorm / eig_tnorm_negList[0])
|
||||
else:
|
||||
eig_tnorm_negList_prod = reduce(
|
||||
lambda x, y: x * y, eig_tnorm_negList)
|
||||
adjustment = tf.pow(
|
||||
tf.pow(e_tnorm, num_factors - 1.) / eig_tnorm_negList_prod, 1. / num_factors)
|
||||
coeffs *= (e + adjustment * damping)
|
||||
else:
|
||||
coeffs = 1.
|
||||
damping = (self._epsilon + weightDecayCoeff)
|
||||
for e in eigVals:
|
||||
coeffs *= e
|
||||
coeffs += damping
|
||||
|
||||
#grad = tf.Print(grad, [tf.convert_to_tensor('1'), tf.convert_to_tensor(var.name), grad.get_shape()])
|
||||
|
||||
grad /= coeffs
|
||||
|
||||
#grad = tf.Print(grad, [tf.convert_to_tensor('2'), tf.convert_to_tensor(var.name), grad.get_shape()])
|
||||
#####
|
||||
# project gradient back to euclidean space
|
||||
for idx, stats in enumerate(self.stats[var]['fprop_concat_stats']):
|
||||
Q = self.stats_eigen[stats]['Q']
|
||||
grad = gmatmul(Q, grad, transpose_a=False, reduce_dim=idx)
|
||||
|
||||
for idx, stats in enumerate(self.stats[var]['bprop_concat_stats']):
|
||||
Q = self.stats_eigen[stats]['Q']
|
||||
grad = gmatmul(grad, Q, transpose_b=True, reduce_dim=idx)
|
||||
##
|
||||
|
||||
#grad = tf.Print(grad, [tf.convert_to_tensor('3'), tf.convert_to_tensor(var.name), grad.get_shape()])
|
||||
if (self.stats[var]['assnBias'] is not None) and not self._blockdiag_bias:
|
||||
# using homogeneous coordinates only works for 2D grad.
|
||||
# TO-DO: figure out how to factorize bias grad
|
||||
# un-stack bias grad
|
||||
var_assnBias = self.stats[var]['assnBias']
|
||||
C_plus_one = int(grad.get_shape()[0])
|
||||
grad_assnBias = tf.reshape(tf.slice(grad,
|
||||
begin=[
|
||||
C_plus_one - 1, 0],
|
||||
size=[1, -1]), var_assnBias.get_shape())
|
||||
grad_assnWeights = tf.slice(grad,
|
||||
begin=[0, 0],
|
||||
size=[C_plus_one - 1, -1])
|
||||
grad_dict[var_assnBias] = grad_assnBias
|
||||
grad = grad_assnWeights
|
||||
|
||||
#grad = tf.Print(grad, [tf.convert_to_tensor('4'), tf.convert_to_tensor(var.name), grad.get_shape()])
|
||||
if GRAD_RESHAPE:
|
||||
grad = tf.reshape(grad, GRAD_SHAPE)
|
||||
|
||||
grad_dict[var] = grad
|
||||
|
||||
print(('projecting %d gradient matrices' % counter))
|
||||
|
||||
for g, var in zip(gradlist, varlist):
|
||||
grad = grad_dict[var]
|
||||
### clipping ###
|
||||
if KFAC_DEBUG:
|
||||
print(('apply clipping to %s' % (var.name)))
|
||||
tf.Print(grad, [tf.sqrt(tf.reduce_sum(tf.pow(grad, 2)))], "Euclidean norm of new grad")
|
||||
local_vg = tf.reduce_sum(grad * g * (self._lr * self._lr))
|
||||
vg += local_vg
|
||||
|
||||
# rescale everything
|
||||
if KFAC_DEBUG:
|
||||
print('apply vFv clipping')
|
||||
|
||||
scaling = tf.minimum(1., tf.sqrt(self._clip_kl / vg))
|
||||
if KFAC_DEBUG:
|
||||
scaling = tf.Print(scaling, [tf.convert_to_tensor(
|
||||
'clip: '), scaling, tf.convert_to_tensor(' vFv: '), vg])
|
||||
with tf.control_dependencies([tf.assign(self.vFv, vg)]):
|
||||
updatelist = [grad_dict[var] for var in varlist]
|
||||
for i, item in enumerate(updatelist):
|
||||
updatelist[i] = scaling * item
|
||||
|
||||
return updatelist
|
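The loop above applies the inverse of the Kronecker-factored Fisher in the eigenbasis of the factors: project the gradient into the eigenbasis, divide by the products of eigenvalues plus damping, and project back. A minimal dense-layer NumPy sketch of the non-factored-damping path, ignoring the homogeneous-coordinate bias handling and the vFv rescaling; all shapes are illustrative:

```python
import numpy as np

D_in, D_out, damping = 64, 10, 1e-2
A = np.random.randn(D_in, D_in);  A = A @ A.T / D_in     # activation covariance (fprop factor)
G = np.random.randn(D_out, D_out); G = G @ G.T / D_out   # gradient covariance (bprop factor)
grad = np.random.randn(D_in, D_out)                      # gradient of a D_in x D_out weight matrix

e_a, Q_a = np.linalg.eigh(A)
e_g, Q_g = np.linalg.eigh(G)

grad_eig = Q_a.T @ grad @ Q_g                            # project into the eigenbasis (the gmatmul calls above)
grad_eig /= np.outer(e_a, e_g) + damping                 # whiten using the eigenvalues, plus damping
precond_grad = Q_a @ grad_eig @ Q_g.T                    # project back to parameter space
```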
||||
|
||||
def compute_gradients(self, loss, var_list=None):
|
||||
varlist = var_list
|
||||
if varlist is None:
|
||||
varlist = tf.trainable_variables()
|
||||
g = tf.gradients(loss, varlist)
|
||||
|
||||
return [(a, b) for a, b in zip(g, varlist)]
|
||||
|
||||
def apply_gradients_kfac(self, grads):
|
||||
g, varlist = list(zip(*grads))
|
||||
|
||||
if len(self.stats_eigen) == 0:
|
||||
self.getStatsEigen()
|
||||
|
||||
qr = None
|
||||
# launch eigen-decomp on a queue thread
|
||||
if self._async:
|
||||
print('Use async eigen decomp')
|
||||
# get a list of factor loading tensors
|
||||
factorOps_dummy = self.computeStatsEigen()
|
||||
|
||||
# define a queue for the list of factor loading tensors
|
||||
queue = tf.FIFOQueue(1, [item.dtype for item in factorOps_dummy], shapes=[
|
||||
item.get_shape() for item in factorOps_dummy])
|
||||
enqueue_op = tf.cond(tf.logical_and(tf.equal(tf.mod(self.stats_step, self._kfac_update), tf.convert_to_tensor(
|
||||
0)), tf.greater_equal(self.stats_step, self._stats_accum_iter)), lambda: queue.enqueue(self.computeStatsEigen()), tf.no_op)
|
||||
|
||||
def dequeue_op():
|
||||
return queue.dequeue()
|
||||
|
||||
qr = tf.train.QueueRunner(queue, [enqueue_op])
|
||||
|
||||
updateOps = []
|
||||
global_step_op = tf.assign_add(self.global_step, 1)
|
||||
updateOps.append(global_step_op)
|
||||
|
||||
with tf.control_dependencies([global_step_op]):
|
||||
|
||||
# compute updates
|
||||
assert self._update_stats_op is not None
|
||||
updateOps.append(self._update_stats_op)
|
||||
dependency_list = []
|
||||
if not self._async:
|
||||
dependency_list.append(self._update_stats_op)
|
||||
|
||||
with tf.control_dependencies(dependency_list):
|
||||
def no_op_wrapper():
|
||||
return tf.group(*[tf.assign_add(self.cold_step, 1)])
|
||||
|
||||
if not self._async:
|
||||
# synchronous eigen-decomp updates
|
||||
updateFactorOps = tf.cond(tf.logical_and(tf.equal(tf.mod(self.stats_step, self._kfac_update),
|
||||
tf.convert_to_tensor(0)),
|
||||
tf.greater_equal(self.stats_step, self._stats_accum_iter)), lambda: tf.group(*self.applyStatsEigen(self.computeStatsEigen())), no_op_wrapper)
|
||||
else:
|
||||
# asynchronous eigen-decomp updates using queue
|
||||
updateFactorOps = tf.cond(tf.greater_equal(self.stats_step, self._stats_accum_iter),
|
||||
lambda: tf.cond(tf.equal(queue.size(), tf.convert_to_tensor(0)),
|
||||
tf.no_op,
|
||||
|
||||
lambda: tf.group(
|
||||
*self.applyStatsEigen(dequeue_op())),
|
||||
),
|
||||
no_op_wrapper)
|
||||
|
||||
updateOps.append(updateFactorOps)
|
||||
|
||||
with tf.control_dependencies([updateFactorOps]):
|
||||
def gradOp():
|
||||
return list(g)
|
||||
|
||||
def getKfacGradOp():
|
||||
return self.getKfacPrecondUpdates(g, varlist)
|
||||
u = tf.cond(tf.greater(self.factor_step,
|
||||
tf.convert_to_tensor(0)), getKfacGradOp, gradOp)
|
||||
|
||||
optim = tf.train.MomentumOptimizer(
|
||||
self._lr * (1. - self._momentum), self._momentum)
|
||||
#optim = tf.train.AdamOptimizer(self._lr, epsilon=0.01)
|
||||
|
||||
def optimOp():
|
||||
def updateOptimOp():
|
||||
if self._full_stats_init:
|
||||
return tf.cond(tf.greater(self.factor_step, tf.convert_to_tensor(0)), lambda: optim.apply_gradients(list(zip(u, varlist))), tf.no_op)
|
||||
else:
|
||||
return optim.apply_gradients(list(zip(u, varlist)))
|
||||
if self._full_stats_init:
|
||||
return tf.cond(tf.greater_equal(self.stats_step, self._stats_accum_iter), updateOptimOp, tf.no_op)
|
||||
else:
|
||||
return tf.cond(tf.greater_equal(self.sgd_step, self._cold_iter), updateOptimOp, tf.no_op)
|
||||
updateOps.append(optimOp())
|
||||
|
||||
return tf.group(*updateOps), qr
|
||||
|
||||
def apply_gradients(self, grads):
|
||||
coldOptim = tf.train.MomentumOptimizer(
|
||||
self._cold_lr, self._momentum)
|
||||
|
||||
def coldSGDstart():
|
||||
sgd_grads, sgd_var = zip(*grads)
|
||||
|
||||
if self.max_grad_norm is not None:
|
||||
sgd_grads, sgd_grad_norm = tf.clip_by_global_norm(sgd_grads,self.max_grad_norm)
|
||||
|
||||
sgd_grads = list(zip(sgd_grads,sgd_var))
|
||||
|
||||
sgd_step_op = tf.assign_add(self.sgd_step, 1)
|
||||
coldOptim_op = coldOptim.apply_gradients(sgd_grads)
|
||||
if KFAC_DEBUG:
|
||||
with tf.control_dependencies([sgd_step_op, coldOptim_op]):
|
||||
sgd_step_op = tf.Print(
|
||||
sgd_step_op, [self.sgd_step, tf.convert_to_tensor('doing cold sgd step')])
|
||||
return tf.group(*[sgd_step_op, coldOptim_op])
|
||||
|
||||
kfacOptim_op, qr = self.apply_gradients_kfac(grads)
|
||||
|
||||
def warmKFACstart():
|
||||
return kfacOptim_op
|
||||
|
||||
return tf.cond(tf.greater(self.sgd_step, self._cold_iter), warmKFACstart, coldSGDstart), qr
|
||||
|
||||
def minimize(self, loss, loss_sampled, var_list=None):
|
||||
grads = self.compute_gradients(loss, var_list=var_list)
|
||||
update_stats_op = self.compute_and_apply_stats(
|
||||
loss_sampled, var_list=var_list)
|
||||
return self.apply_gradients(grads)
|
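End of kfac.py. A rough, untested sketch of how the optimizer might be driven on a toy regression problem, mirroring the minimize/q_runner pattern that NeuralNetValueFunction uses later in this diff; the model, shapes and hyperparameters here are invented for illustration only:

```python
import numpy as np
import tensorflow as tf
from baselines.acktr import kfac

X = tf.placeholder(tf.float32, [None, 8])
y = tf.placeholder(tf.float32, [None])
h = tf.layers.dense(X, 32, activation=tf.nn.relu)
pred = tf.layers.dense(h, 1)[:, 0]

loss = tf.reduce_mean(tf.square(pred - y))
# "sampled" loss: targets drawn from the model's own predictive distribution,
# the same trick as sample_vpred_n in value_functions.py
loss_sampled = tf.reduce_mean(tf.square(pred - tf.stop_gradient(pred + tf.random_normal(tf.shape(pred)))))

optim = kfac.KfacOptimizer(learning_rate=1e-3, cold_lr=1e-4, momentum=0.9,
                           kfac_update=2, cold_iter=10, max_grad_norm=None)
update_op, q_runner = optim.minimize(loss, loss_sampled)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()
    if q_runner is not None:  # only present when async eigen-decomposition is enabled
        q_runner.create_threads(sess, coord=coord, start=True)
    xb = np.random.randn(64, 8).astype(np.float32)
    yb = np.random.randn(64).astype(np.float32)
    for _ in range(20):
        sess.run(update_op, {X: xb, y: yb})
```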
124
baselines/acktr/kfac_utils.py
Normal file
@@ -0,0 +1,124 @@
|
||||
import tensorflow as tf
|
||||
import numpy as np
|
||||
|
||||
|
||||
def gmatmul(a, b, transpose_a=False, transpose_b=False, reduce_dim=None):
|
||||
if reduce_dim is None:
|
||||
# general batch matmul
|
||||
if len(a.get_shape()) == 3 and len(b.get_shape()) == 3:
|
||||
return tf.batch_matmul(a, b, adj_x=transpose_a, adj_y=transpose_b)
|
||||
elif len(a.get_shape()) == 3 and len(b.get_shape()) == 2:
|
||||
if transpose_b:
|
||||
N = b.get_shape()[0].value
|
||||
else:
|
||||
N = b.get_shape()[1].value
|
||||
B = a.get_shape()[0].value
|
||||
if transpose_a:
|
||||
K = a.get_shape()[1].value
|
||||
a = tf.reshape(tf.transpose(a, [0, 2, 1]), [-1, K])
|
||||
else:
|
||||
K = a.get_shape()[-1].value
|
||||
a = tf.reshape(a, [-1, K])
|
||||
result = tf.matmul(a, b, transpose_b=transpose_b)
|
||||
result = tf.reshape(result, [B, -1, N])
|
||||
return result
|
||||
elif len(a.get_shape()) == 2 and len(b.get_shape()) == 3:
|
||||
if transpose_a:
|
||||
M = a.get_shape()[1].value
|
||||
else:
|
||||
M = a.get_shape()[0].value
|
||||
B = b.get_shape()[0].value
|
||||
if transpose_b:
|
||||
K = b.get_shape()[-1].value
|
||||
b = tf.transpose(tf.reshape(b, [-1, K]), [1, 0])
|
||||
else:
|
||||
K = b.get_shape()[1].value
|
||||
b = tf.transpose(tf.reshape(
|
||||
tf.transpose(b, [0, 2, 1]), [-1, K]), [1, 0])
|
||||
result = tf.matmul(a, b, transpose_a=transpose_a)
|
||||
result = tf.transpose(tf.reshape(result, [M, B, -1]), [1, 0, 2])
|
||||
return result
|
||||
else:
|
||||
return tf.matmul(a, b, transpose_a=transpose_a, transpose_b=transpose_b)
|
||||
else:
|
||||
# weird batch matmul
|
||||
if len(a.get_shape()) == 2 and len(b.get_shape()) > 2:
|
||||
# reshape reduce_dim to the left most dim in b
|
||||
b_shape = b.get_shape()
|
||||
if reduce_dim != 0:
|
||||
b_dims = list(range(len(b_shape)))
|
||||
b_dims.remove(reduce_dim)
|
||||
b_dims.insert(0, reduce_dim)
|
||||
b = tf.transpose(b, b_dims)
|
||||
b_t_shape = b.get_shape()
|
||||
b = tf.reshape(b, [int(b_shape[reduce_dim]), -1])
|
||||
result = tf.matmul(a, b, transpose_a=transpose_a,
|
||||
transpose_b=transpose_b)
|
||||
result = tf.reshape(result, b_t_shape)
|
||||
if reduce_dim != 0:
|
||||
b_dims = list(range(len(b_shape)))
|
||||
b_dims.remove(0)
|
||||
b_dims.insert(reduce_dim, 0)
|
||||
result = tf.transpose(result, b_dims)
|
||||
return result
|
||||
|
||||
elif len(a.get_shape()) > 2 and len(b.get_shape()) == 2:
|
||||
# reshape reduce_dim to the right most dim in a
|
||||
a_shape = a.get_shape()
|
||||
outter_dim = len(a_shape) - 1
|
||||
reduce_dim = len(a_shape) - reduce_dim - 1
|
||||
if reduce_dim != outter_dim:
|
||||
a_dims = list(range(len(a_shape)))
|
||||
a_dims.remove(reduce_dim)
|
||||
a_dims.insert(outter_dim, reduce_dim)
|
||||
a = tf.transpose(a, a_dims)
|
||||
a_t_shape = a.get_shape()
|
||||
a = tf.reshape(a, [-1, int(a_shape[reduce_dim])])
|
||||
result = tf.matmul(a, b, transpose_a=transpose_a,
|
||||
transpose_b=transpose_b)
|
||||
result = tf.reshape(result, a_t_shape)
|
||||
if reduce_dim != outter_dim:
|
||||
a_dims = list(range(len(a_shape)))
|
||||
a_dims.remove(outter_dim)
|
||||
a_dims.insert(reduce_dim, outter_dim)
|
||||
result = tf.transpose(result, a_dims)
|
||||
return result
|
||||
|
||||
elif len(a.get_shape()) == 2 and len(b.get_shape()) == 2:
|
||||
return tf.matmul(a, b, transpose_a=transpose_a, transpose_b=transpose_b)
|
||||
|
||||
assert False, 'something went wrong'
|
||||
|
||||
|
||||
def clipoutNeg(vec, threshold=1e-6):
|
||||
mask = tf.cast(vec > threshold, tf.float32)
|
||||
return mask * vec
|
||||
|
||||
|
||||
def detectMinVal(input_mat, var, threshold=1e-6, name='', debug=False):
|
||||
eigen_min = tf.reduce_min(input_mat)
|
||||
eigen_max = tf.reduce_max(input_mat)
|
||||
eigen_ratio = eigen_max / eigen_min
|
||||
input_mat_clipped = clipoutNeg(input_mat, threshold)
|
||||
|
||||
if debug:
|
||||
input_mat_clipped = tf.cond(tf.logical_or(tf.greater(eigen_ratio, 0.), tf.less(eigen_ratio, -500)), lambda: input_mat_clipped, lambda: tf.Print(
|
||||
input_mat_clipped, [tf.convert_to_tensor('screwed ratio ' + name + ' eigen values!!!'), tf.convert_to_tensor(var.name), eigen_min, eigen_max, eigen_ratio]))
|
||||
|
||||
return input_mat_clipped
|
||||
|
||||
|
||||
def factorReshape(Q, e, grad, facIndx=0, ftype='act'):
|
||||
grad_shape = grad.get_shape()
|
||||
if ftype == 'act':
|
||||
assert e.get_shape()[0] == grad_shape[facIndx]
|
||||
expanded_shape = [1, ] * len(grad_shape)
|
||||
expanded_shape[facIndx] = -1
|
||||
e = tf.reshape(e, expanded_shape)
|
||||
if ftype == 'grad':
|
||||
assert e.get_shape()[0] == grad_shape[len(grad_shape) - facIndx - 1]
|
||||
expanded_shape = [1, ] * len(grad_shape)
|
||||
expanded_shape[len(grad_shape) - facIndx - 1] = -1
|
||||
e = tf.reshape(e, expanded_shape)
|
||||
|
||||
return Q, e
|
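gmatmul's reduce_dim path contracts a 2D matrix against one chosen axis of a higher-rank tensor by transposing and reshaping around an ordinary matmul. A small NumPy check of what that corresponds to, with arbitrary shapes:

```python
import numpy as np

# gmatmul(a, b, reduce_dim=1) with a: [4, 4] and b: [2, 4, 3] contracts a against axis 1 of b
a = np.random.randn(4, 4)
b = np.random.randn(2, 4, 3)
out = np.einsum('ij,bjk->bik', a, b)   # what the transpose/reshape/matmul dance computes
print(out.shape)                        # (2, 4, 3)

# with transpose_a=True (the projection into the eigenbasis above) the contraction
# uses a's columns instead: np.einsum('ji,bjk->bik', a, b)
```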
80
baselines/acktr/policies.py
Normal file
@@ -0,0 +1,80 @@
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from baselines.acktr.utils import conv, fc, dense, conv_to_fc, sample, kl_div
|
||||
from baselines.common.distributions import make_pdtype
|
||||
import baselines.common.tf_util as U
|
||||
import gym
|
||||
|
||||
class CnnPolicy(object):
|
||||
|
||||
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
|
||||
nbatch = nenv*nsteps
|
||||
nh, nw, nc = ob_space.shape
|
||||
ob_shape = (nbatch, nh, nw, nc*nstack)
|
||||
nact = ac_space.n
|
||||
X = tf.placeholder(tf.uint8, ob_shape) #obs
|
||||
with tf.variable_scope("model", reuse=reuse):
|
||||
h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
|
||||
h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
|
||||
h3 = conv(h2, 'c3', nf=32, rf=3, stride=1, init_scale=np.sqrt(2))
|
||||
h3 = conv_to_fc(h3)
|
||||
h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
|
||||
pi = fc(h4, 'pi', nact, act=lambda x:x)
|
||||
vf = fc(h4, 'v', 1, act=lambda x:x)
|
||||
|
||||
v0 = vf[:, 0]
|
||||
a0 = sample(pi)
|
||||
self.initial_state = [] #not stateful
|
||||
|
||||
def step(ob, *_args, **_kwargs):
|
||||
a, v = sess.run([a0, v0], {X:ob})
|
||||
return a, v, [] #dummy state
|
||||
|
||||
def value(ob, *_args, **_kwargs):
|
||||
return sess.run(v0, {X:ob})
|
||||
|
||||
self.X = X
|
||||
self.pi = pi
|
||||
self.vf = vf
|
||||
self.step = step
|
||||
self.value = value
|
||||
|
||||
|
||||
class GaussianMlpPolicy(object):
|
||||
def __init__(self, ob_dim, ac_dim):
|
||||
# Here we'll construct a bunch of expressions, which will be used in two places:
|
||||
# (1) When sampling actions
|
||||
# (2) When computing loss functions, for the policy update
|
||||
# Variables specific to (1) have the word "sampled" in them,
|
||||
# whereas variables specific to (2) have the word "old" in them
|
||||
ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim*2], name="ob") # batch of observations
|
||||
oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac") # batch of previous actions
|
||||
oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim*2], name="oldac_dist") # batch of previous action distributions
|
||||
adv_n = tf.placeholder(tf.float32, shape=[None], name="adv") # advantage function estimate
|
||||
oldlogprob_n = tf.placeholder(tf.float32, shape=[None], name='oldlogprob') # log probability of previous actions
|
||||
wd_dict = {}
|
||||
h1 = tf.nn.tanh(dense(ob_no, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
|
||||
h2 = tf.nn.tanh(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
|
||||
mean_na = dense(h2, ac_dim, "mean", weight_init=U.normc_initializer(0.1), bias_init=0.0, weight_loss_dict=wd_dict) # Mean control output
|
||||
self.wd_dict = wd_dict
|
||||
self.logstd_1a = logstd_1a = tf.get_variable("logstd", [ac_dim], tf.float32, tf.zeros_initializer()) # Variance on outputs
|
||||
logstd_1a = tf.expand_dims(logstd_1a, 0)
|
||||
std_1a = tf.exp(logstd_1a)
|
||||
std_na = tf.tile(std_1a, [tf.shape(mean_na)[0], 1])
|
||||
ac_dist = tf.concat([tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(std_na, [-1, ac_dim])], 1)
|
||||
sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:,ac_dim:])) * ac_dist[:,ac_dim:] + ac_dist[:,:ac_dim] # This is the sampled action we'll perform.
|
||||
logprobsampled_n = - U.sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * U.sum(tf.square(ac_dist[:,:ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of sampled action
|
||||
logprob_n = - U.sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * U.sum(tf.square(ac_dist[:,:ac_dim] - oldac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy)
|
||||
kl = U.mean(kl_div(oldac_dist, ac_dist, ac_dim))
|
||||
#kl = .5 * U.mean(tf.square(logprob_n - oldlogprob_n)) # Approximation of KL divergence between old policy used to generate actions, and new policy used to compute logprob_n
|
||||
surr = - U.mean(adv_n * logprob_n) # Loss function that we'll differentiate to get the policy gradient
|
||||
surr_sampled = - U.mean(logprob_n) # Sampled loss of the policy
|
||||
self._act = U.function([ob_no], [sampled_ac_na, ac_dist, logprobsampled_n]) # Generate a new action and its logprob
|
||||
#self.compute_kl = U.function([ob_no, oldac_na, oldlogprob_n], kl) # Compute (approximate) KL divergence between old policy and new policy
|
||||
self.compute_kl = U.function([ob_no, oldac_dist], kl)
|
||||
self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled) # Input and output variables needed for computing loss
|
||||
U.initialize() # Initialize uninitialized TF variables
|
||||
|
||||
def act(self, ob):
|
||||
ac, ac_dist, logp = self._act(ob[None])
|
||||
return ac[0], ac_dist[0], logp[0]
|
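A short, hypothetical sketch of how CnnPolicy might be instantiated and stepped outside the A2C/ACKTR training loop; the observation/action spaces and batch sizes are illustrative only:

```python
import numpy as np
import tensorflow as tf
from gym import spaces
from baselines.acktr.policies import CnnPolicy

ob_space = spaces.Box(low=0, high=255, shape=(84, 84, 1))  # Atari-style single-channel frames
ac_space = spaces.Discrete(4)

sess = tf.Session()
policy = CnnPolicy(sess, ob_space, ac_space, nenv=2, nsteps=1, nstack=4)
sess.run(tf.global_variables_initializer())

obs = np.zeros((2, 84, 84, 4), dtype=np.uint8)  # nenv environments, nstack stacked frames
actions, values, _ = policy.step(obs)
print(actions.shape, values.shape)              # (2,) (2,)
```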
36
baselines/acktr/run_atari.py
Normal file
@@ -0,0 +1,36 @@
|
||||
#!/usr/bin/env python
|
||||
import os, logging, gym
|
||||
from baselines import logger
|
||||
from baselines.common import set_global_seeds
|
||||
from baselines import bench
|
||||
from baselines.acktr.acktr_disc import learn
|
||||
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
|
||||
from baselines.common.atari_wrappers import wrap_deepmind
|
||||
from baselines.acktr.policies import CnnPolicy
|
||||
|
||||
def train(env_id, num_timesteps, seed, num_cpu):
|
||||
num_timesteps //= 4
|
||||
|
||||
def make_env(rank):
|
||||
def _thunk():
|
||||
env = gym.make(env_id)
|
||||
env.seed(seed + rank)
|
||||
if logger.get_dir():
|
||||
env = bench.Monitor(env, os.path.join(logger.get_dir(), "{}.monitor.json".format(rank)))
|
||||
gym.logger.setLevel(logging.WARN)
|
||||
return wrap_deepmind(env)
|
||||
return _thunk
|
||||
|
||||
set_global_seeds(seed)
|
||||
env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
|
||||
|
||||
policy_fn = CnnPolicy
|
||||
learn(policy_fn, env, seed, total_timesteps=num_timesteps, nprocs=num_cpu)
|
||||
env.close()
|
||||
|
||||
def main():
|
||||
train('BreakoutNoFrameskip-v4', num_timesteps=int(40e6), seed=0, num_cpu=32)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
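A hypothetical way to drive the same train function for a different game with fewer workers (assuming the Atari dependencies are installed):

```python
from baselines.acktr.run_atari import train

train('PongNoFrameskip-v4', num_timesteps=int(10e6), seed=0, num_cpu=8)
```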
42
baselines/acktr/run_mujoco.py
Normal file
@@ -0,0 +1,42 @@
|
||||
#!/usr/bin/env python
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import tensorflow as tf
|
||||
import gym
|
||||
from baselines import logger
|
||||
from baselines.common import set_global_seeds
|
||||
from baselines import bench
|
||||
from baselines.acktr.acktr_cont import learn
|
||||
from baselines.acktr.policies import GaussianMlpPolicy
|
||||
from baselines.acktr.value_functions import NeuralNetValueFunction
|
||||
|
||||
def train(env_id, num_timesteps, seed):
|
||||
env=gym.make(env_id)
|
||||
if logger.get_dir():
|
||||
env = bench.Monitor(env, os.path.join(logger.get_dir(), "monitor.json"))
|
||||
set_global_seeds(seed)
|
||||
env.seed(seed)
|
||||
gym.logger.setLevel(logging.WARN)
|
||||
|
||||
with tf.Session(config=tf.ConfigProto()) as session:
|
||||
ob_dim = env.observation_space.shape[0]
|
||||
ac_dim = env.action_space.shape[0]
|
||||
with tf.variable_scope("vf"):
|
||||
vf = NeuralNetValueFunction(ob_dim, ac_dim)
|
||||
with tf.variable_scope("pi"):
|
||||
policy = GaussianMlpPolicy(ob_dim, ac_dim)
|
||||
|
||||
learn(env, policy=policy, vf=vf,
|
||||
gamma=0.99, lam=0.97, timesteps_per_batch=2500,
|
||||
desired_kl=0.002,
|
||||
num_timesteps=num_timesteps, animate=False)
|
||||
|
||||
env.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
parser = argparse.ArgumentParser(description='Run Mujoco benchmark.')
|
||||
parser.add_argument('--env_id', type=str, default="Reacher-v1")
|
||||
args = parser.parse_args()
|
||||
train(args.env_id, num_timesteps=1e6, seed=1)
|
46
baselines/acktr/running_stat.py
Normal file
@@ -0,0 +1,46 @@
|
||||
import numpy as np
|
||||
|
||||
# http://www.johndcook.com/blog/standard_deviation/
|
||||
class RunningStat(object):
|
||||
def __init__(self, shape):
|
||||
self._n = 0
|
||||
self._M = np.zeros(shape)
|
||||
self._S = np.zeros(shape)
|
||||
def push(self, x):
|
||||
x = np.asarray(x)
|
||||
assert x.shape == self._M.shape
|
||||
self._n += 1
|
||||
if self._n == 1:
|
||||
self._M[...] = x
|
||||
else:
|
||||
oldM = self._M.copy()
|
||||
self._M[...] = oldM + (x - oldM)/self._n
|
||||
self._S[...] = self._S + (x - oldM)*(x - self._M)
|
||||
@property
|
||||
def n(self):
|
||||
return self._n
|
||||
@property
|
||||
def mean(self):
|
||||
return self._M
|
||||
@property
|
||||
def var(self):
|
||||
return self._S/(self._n - 1) if self._n > 1 else np.square(self._M)
|
||||
@property
|
||||
def std(self):
|
||||
return np.sqrt(self.var)
|
||||
@property
|
||||
def shape(self):
|
||||
return self._M.shape
|
||||
|
||||
def test_running_stat():
|
||||
for shp in ((), (3,), (3,4)):
|
||||
li = []
|
||||
rs = RunningStat(shp)
|
||||
for _ in range(5):
|
||||
val = np.random.randn(*shp)
|
||||
rs.push(val)
|
||||
li.append(val)
|
||||
m = np.mean(li, axis=0)
|
||||
assert np.allclose(rs.mean, m)
|
||||
v = np.square(m) if (len(li) == 1) else np.var(li, ddof=1, axis=0)
|
||||
assert np.allclose(rs.var, v)
|
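RunningStat keeps a Welford-style streaming mean and variance. A quick sanity check one could run against NumPy, assuming the module path above:

```python
import numpy as np
from baselines.acktr.running_stat import RunningStat

rs = RunningStat(shape=(3,))
data = np.random.randn(100, 3)
for x in data:
    rs.push(x)

print(np.allclose(rs.mean, data.mean(axis=0)))         # True
print(np.allclose(rs.var, data.var(axis=0, ddof=1)))   # True
```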
200
baselines/acktr/utils.py
Normal file
@@ -0,0 +1,200 @@
|
||||
import os
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
import baselines.common.tf_util as U
|
||||
from collections import deque
|
||||
|
||||
def sample(logits):
|
||||
noise = tf.random_uniform(tf.shape(logits))
|
||||
return tf.argmax(logits - tf.log(-tf.log(noise)), 1)
|
||||
|
||||
def std(x):
|
||||
mean = tf.reduce_mean(x)
|
||||
var = tf.reduce_mean(tf.square(x-mean))
|
||||
return tf.sqrt(var)
|
||||
|
||||
def cat_entropy(logits):
|
||||
a0 = logits - tf.reduce_max(logits, 1, keep_dims=True)
|
||||
ea0 = tf.exp(a0)
|
||||
z0 = tf.reduce_sum(ea0, 1, keep_dims=True)
|
||||
p0 = ea0 / z0
|
||||
return tf.reduce_sum(p0 * (tf.log(z0) - a0), 1)
|
||||
|
||||
def cat_entropy_softmax(p0):
|
||||
return - tf.reduce_sum(p0 * tf.log(p0 + 1e-6), axis = 1)
|
||||
|
||||
def mse(pred, target):
|
||||
return tf.square(pred-target)/2.
|
||||
|
||||
def ortho_init(scale=1.0):
|
||||
def _ortho_init(shape, dtype, partition_info=None):
|
||||
#lasagne ortho init for tf
|
||||
shape = tuple(shape)
|
||||
if len(shape) == 2:
|
||||
flat_shape = shape
|
||||
elif len(shape) == 4: # assumes NHWC
|
||||
flat_shape = (np.prod(shape[:-1]), shape[-1])
|
||||
else:
|
||||
raise NotImplementedError
|
||||
a = np.random.normal(0.0, 1.0, flat_shape)
|
||||
u, _, v = np.linalg.svd(a, full_matrices=False)
|
||||
q = u if u.shape == flat_shape else v # pick the one with the correct shape
|
||||
q = q.reshape(shape)
|
||||
return (scale * q[:shape[0], :shape[1]]).astype(np.float32)
|
||||
return _ortho_init
|
||||
|
||||
def conv(x, scope, nf, rf, stride, pad='VALID', act=tf.nn.relu, init_scale=1.0):
|
||||
with tf.variable_scope(scope):
|
||||
nin = x.get_shape()[3].value
|
||||
w = tf.get_variable("w", [rf, rf, nin, nf], initializer=ortho_init(init_scale))
|
||||
b = tf.get_variable("b", [nf], initializer=tf.constant_initializer(0.0))
|
||||
z = tf.nn.conv2d(x, w, strides=[1, stride, stride, 1], padding=pad)+b
|
||||
h = act(z)
|
||||
return h
|
||||
|
||||
def fc(x, scope, nh, act=tf.nn.relu, init_scale=1.0):
|
||||
with tf.variable_scope(scope):
|
||||
nin = x.get_shape()[1].value
|
||||
w = tf.get_variable("w", [nin, nh], initializer=ortho_init(init_scale))
|
||||
b = tf.get_variable("b", [nh], initializer=tf.constant_initializer(0.0))
|
||||
z = tf.matmul(x, w)+b
|
||||
h = act(z)
|
||||
return h
|
||||
|
||||
def dense(x, size, name, weight_init=None, bias_init=0, weight_loss_dict=None, reuse=None):
|
||||
with tf.variable_scope(name, reuse=reuse):
|
||||
assert (len(U.scope_name().split('/')) == 2)
|
||||
|
||||
w = tf.get_variable("w", [x.get_shape()[1], size], initializer=weight_init)
|
||||
b = tf.get_variable("b", [size], initializer=tf.constant_initializer(bias_init))
|
||||
weight_decay_fc = 3e-4
|
||||
|
||||
if weight_loss_dict is not None:
|
||||
weight_decay = tf.multiply(tf.nn.l2_loss(w), weight_decay_fc, name='weight_decay_loss')
|
||||
if weight_loss_dict is not None:
|
||||
weight_loss_dict[w] = weight_decay_fc
|
||||
weight_loss_dict[b] = 0.0
|
||||
|
||||
tf.add_to_collection(U.scope_name().split('/')[0] + '_' + 'losses', weight_decay)
|
||||
|
||||
return tf.nn.bias_add(tf.matmul(x, w), b)
|
||||
|
||||
def conv_to_fc(x):
|
||||
nh = np.prod([v.value for v in x.get_shape()[1:]])
|
||||
x = tf.reshape(x, [-1, nh])
|
||||
return x
|
||||
|
||||
def kl_div(action_dist1, action_dist2, action_size):
|
||||
mean1, std1 = action_dist1[:, :action_size], action_dist1[:, action_size:]
|
||||
mean2, std2 = action_dist2[:, :action_size], action_dist2[:, action_size:]
|
||||
|
||||
numerator = tf.square(mean1 - mean2) + tf.square(std1) - tf.square(std2)
|
||||
denominator = 2 * tf.square(std2) + 1e-8
|
||||
return tf.reduce_sum(
|
||||
numerator/denominator + tf.log(std2) - tf.log(std1),reduction_indices=-1)
|
||||
|
||||
def discount_with_dones(rewards, dones, gamma):
|
||||
discounted = []
|
||||
r = 0
|
||||
for reward, done in zip(rewards[::-1], dones[::-1]):
|
||||
r = reward + gamma*r*(1.-done) # fixed off by one bug
|
||||
discounted.append(r)
|
||||
return discounted[::-1]
|
||||
|
||||
def find_trainable_variables(key):
|
||||
with tf.variable_scope(key):
|
||||
return tf.trainable_variables()
|
||||
|
||||
def make_path(f):
|
||||
return os.makedirs(f, exist_ok=True)
|
||||
|
||||
def constant(p):
|
||||
return 1
|
||||
|
||||
def linear(p):
|
||||
return 1-p
|
||||
|
||||
|
||||
def middle_drop(p):
|
||||
eps = 0.75
|
||||
if 1-p<eps:
|
||||
return eps*0.1
|
||||
return 1-p
|
||||
|
||||
def double_linear_con(p):
|
||||
p *= 2
|
||||
eps = 0.125
|
||||
if 1-p<eps:
|
||||
return eps
|
||||
return 1-p
|
||||
|
||||
|
||||
def double_middle_drop(p):
|
||||
eps1 = 0.75
|
||||
eps2 = 0.25
|
||||
if 1-p<eps1:
|
||||
if 1-p<eps2:
|
||||
return eps2*0.5
|
||||
return eps1*0.1
|
||||
return 1-p
|
||||
|
||||
|
||||
schedules = {
|
||||
'linear':linear,
|
||||
'constant':constant,
|
||||
'double_linear_con':double_linear_con,
|
||||
'middle_drop':middle_drop,
|
||||
'double_middle_drop':double_middle_drop
|
||||
}
|
||||
|
||||
class Scheduler(object):
|
||||
|
||||
def __init__(self, v, nvalues, schedule):
|
||||
self.n = 0.
|
||||
self.v = v
|
||||
self.nvalues = nvalues
|
||||
self.schedule = schedules[schedule]
|
||||
|
||||
def value(self):
|
||||
current_value = self.v*self.schedule(self.n/self.nvalues)
|
||||
self.n += 1.
|
||||
return current_value
|
||||
|
||||
def value_steps(self, steps):
|
||||
return self.v*self.schedule(steps/self.nvalues)
|
||||
|
||||
|
||||
class EpisodeStats:
|
||||
def __init__(self, nsteps, nenvs):
|
||||
self.episode_rewards = []
|
||||
for i in range(nenvs):
|
||||
self.episode_rewards.append([])
|
||||
self.lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths
|
||||
self.rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards
|
||||
self.nsteps = nsteps
|
||||
self.nenvs = nenvs
|
||||
|
||||
def feed(self, rewards, masks):
|
||||
rewards = np.reshape(rewards, [self.nenvs, self.nsteps])
|
||||
masks = np.reshape(masks, [self.nenvs, self.nsteps])
|
||||
for i in range(0, self.nenvs):
|
||||
for j in range(0, self.nsteps):
|
||||
self.episode_rewards[i].append(rewards[i][j])
|
||||
if masks[i][j]:
|
||||
l = len(self.episode_rewards[i])
|
||||
s = sum(self.episode_rewards[i])
|
||||
self.lenbuffer.append(l)
|
||||
self.rewbuffer.append(s)
|
||||
self.episode_rewards[i] = []
|
||||
|
||||
def mean_length(self):
|
||||
if self.lenbuffer:
|
||||
return np.mean(self.lenbuffer)
|
||||
else:
|
||||
return 0 # on the first params dump, no episodes are finished
|
||||
|
||||
def mean_reward(self):
|
||||
if self.rewbuffer:
|
||||
return np.mean(self.rewbuffer)
|
||||
else:
|
||||
return 0
|
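Two small usage sketches for the helpers above, with made-up numbers: Scheduler anneals the base value according to the chosen schedule each time value() is called, and discount_with_dones resets the discounted return at episode boundaries:

```python
from baselines.acktr.utils import Scheduler, discount_with_dones

lr = Scheduler(v=7e-4, nvalues=100, schedule='linear')
print([lr.value() for _ in range(3)])   # ~[0.0007, 0.000693, 0.000686], decaying toward zero

# the done flag at index 2 stops the discounting from leaking across episodes
rewards, dones = [1., 1., 1., 1.], [0, 0, 1, 0]
print(discount_with_dones(rewards, dones, gamma=0.99))
# [2.9701, 1.99, 1.0, 1.0]
```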
50
baselines/acktr/value_functions.py
Normal file
@@ -0,0 +1,50 @@
|
||||
from baselines import logger
|
||||
import numpy as np
|
||||
from baselines import common
|
||||
from baselines.common import tf_util as U
|
||||
import tensorflow as tf
|
||||
from baselines.acktr import kfac
|
||||
from baselines.acktr.utils import dense
|
||||
|
||||
class NeuralNetValueFunction(object):
|
||||
def __init__(self, ob_dim, ac_dim): #pylint: disable=W0613
|
||||
X = tf.placeholder(tf.float32, shape=[None, ob_dim*2+ac_dim*2+2]) # batch of observations
|
||||
vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg')
|
||||
wd_dict = {}
|
||||
h1 = tf.nn.elu(dense(X, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
|
||||
h2 = tf.nn.elu(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
|
||||
vpred_n = dense(h2, 1, "hfinal", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)[:,0]
|
||||
sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
|
||||
wd_loss = tf.get_collection("vf_losses", None)
|
||||
loss = U.mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
|
||||
loss_sampled = U.mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))
|
||||
self._predict = U.function([X], vpred_n)
|
||||
optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9, \
|
||||
clip_kl=0.3, epsilon=0.1, stats_decay=0.95, \
|
||||
async=1, kfac_update=2, cold_iter=50, \
|
||||
weight_decay_dict=wd_dict, max_grad_norm=None)
|
||||
vf_var_list = []
|
||||
for var in tf.trainable_variables():
|
||||
if "vf" in var.name:
|
||||
vf_var_list.append(var)
|
||||
|
||||
update_op, self.q_runner = optim.minimize(loss, loss_sampled, var_list=vf_var_list)
|
||||
self.do_update = U.function([X, vtarg_n], update_op) #pylint: disable=E1101
|
||||
U.initialize() # Initialize uninitialized TF variables
|
||||
def _preproc(self, path):
|
||||
l = pathlength(path)
|
||||
al = np.arange(l).reshape(-1,1)/10.0
|
||||
act = path["action_dist"].astype('float32')
|
||||
X = np.concatenate([path['observation'], act, al, np.ones((l, 1))], axis=1)
|
||||
return X
|
||||
def predict(self, path):
|
||||
return self._predict(self._preproc(path))
|
||||
def fit(self, paths, targvals):
|
||||
X = np.concatenate([self._preproc(p) for p in paths])
|
||||
y = np.concatenate(targvals)
|
||||
logger.record_tabular("EVBefore", common.explained_variance(self._predict(X), y))
|
||||
for _ in range(25): self.do_update(X, y)
|
||||
logger.record_tabular("EVAfter", common.explained_variance(self._predict(X), y))
|
||||
|
||||
def pathlength(path):
|
||||
return path["reward"].shape[0]
|
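The value-function input built by _preproc concatenates the observation, the action distribution, a scaled timestep and a constant column, which is where the ob_dim*2 + ac_dim*2 + 2 placeholder width comes from. A small NumPy sketch with a hypothetical path dict and Reacher-like sizes:

```python
import numpy as np

ob_dim, ac_dim, T = 11, 2, 5   # illustrative sizes and episode length
path = {
    'observation': np.random.randn(T, ob_dim * 2).astype('float32'),
    'action_dist': np.random.randn(T, ac_dim * 2).astype('float32'),
    'reward': np.random.randn(T).astype('float32'),
}

l = path['reward'].shape[0]
al = np.arange(l).reshape(-1, 1) / 10.0
X = np.concatenate([path['observation'], path['action_dist'], al, np.ones((l, 1))], axis=1)
print(X.shape)   # (5, 28) == (T, ob_dim*2 + ac_dim*2 + 2)
```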
@@ -22,6 +22,13 @@ def get_task(benchmark, env_id):
|
||||
"""Get a task by env_id. Return None if the benchmark doesn't have the env"""
|
||||
return next(filter(lambda task: task['env_id'] == env_id, benchmark['tasks']), None)
|
||||
|
||||
def find_task_for_env_id_in_any_benchmark(env_id):
|
||||
for bm in _BENCHMARKS:
|
||||
for task in bm["tasks"]:
|
||||
if task["env_id"]==env_id:
|
||||
return bm, task
|
||||
return None, None
|
||||
|
||||
_ATARI_SUFFIX = 'NoFrameskip-v4'
|
||||
|
||||
register_benchmark({
|
||||
@@ -49,30 +56,61 @@ register_benchmark({
|
||||
})
|
||||
|
||||
|
||||
# MuJoCo
|
||||
|
||||
_mujocosmall = [
|
||||
'InvertedDoublePendulum-v1', 'InvertedPendulum-v1',
|
||||
'HalfCheetah-v1', 'Hopper-v1', 'Walker2d-v1',
|
||||
'Reacher-v1', 'Swimmer-v1']
|
||||
|
||||
register_benchmark({
|
||||
'name' : 'Mujoco1M',
|
||||
'description' : 'Some small 2D MuJoCo tasks, run for 1M timesteps',
|
||||
'tasks' : [{'env_id' : _envid, 'trials' : 3, 'num_timesteps' : int(1e6)} for _envid in _mujocosmall]
|
||||
})
|
||||
|
||||
_roboschool_mujoco = [
|
||||
'RoboschoolInvertedDoublePendulum-v0', 'RoboschoolInvertedPendulum-v0', # cartpole
|
||||
'RoboschoolHalfCheetah-v0', 'RoboschoolHopper-v0', 'RoboschoolWalker2d-v0', # forward walkers
|
||||
'RoboschoolReacher-v0'
|
||||
register_benchmark({
|
||||
'name' : 'MujocoWalkers',
|
||||
'description' : 'MuJoCo forward walkers, run for 8M, humanoid 100M',
|
||||
'tasks' : [
|
||||
{'env_id' : "Hopper-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
|
||||
{'env_id' : "Walker2d-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
|
||||
{'env_id' : "Humanoid-v1", 'trials' : 4, 'num_timesteps' : 100*1000000 },
|
||||
]
|
||||
})
|
||||
# To reproduce:
|
||||
# python3 baselines/baselines/ppo2/ppo2_run_benchmark.py gce MujocoWalkers myrun_ppo2_whiteobs1_cpu8
|
||||
# (observation input filters necessary)
|
||||
|
||||
|
||||
# Roboschool
|
||||
|
||||
register_benchmark({
|
||||
'name' : 'RoboschoolMujoco2M',
|
||||
'description' : 'Same small 2D tasks, still improving up to 2M',
|
||||
'tasks' : [{'env_id' : _envid, 'trials' : 3, 'num_timesteps' : int(2e6)} for _envid in _roboschool_mujoco]
|
||||
'name' : 'Roboschool8M',
|
||||
'description' : 'Small 2D tasks, up to 30 minutes to complete on 8 cores',
|
||||
'tasks' : [
|
||||
{'env_id' : "RoboschoolReacher-v1", 'trials' : 4, 'num_timesteps' : 2*1000000 },
|
||||
{'env_id' : "RoboschoolAnt-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
|
||||
{'env_id' : "RoboschoolHalfCheetah-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
|
||||
{'env_id' : "RoboschoolHopper-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
|
||||
{'env_id' : "RoboschoolWalker2d-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 },
|
||||
]
|
||||
})
|
||||
register_benchmark({
|
||||
'name' : 'RoboschoolHarder',
|
||||
'description' : 'Test your might!!! Up to 12 hours on 32 cores',
|
||||
'tasks' : [
|
||||
{'env_id' : "RoboschoolHumanoid-v1", 'trials' : 4, 'num_timesteps' : 100*1000000 },
|
||||
{'env_id' : "RoboschoolHumanoidFlagrun-v1", 'trials' : 4, 'num_timesteps' : 200*1000000 },
|
||||
{'env_id' : "RoboschoolHumanoidFlagrunHarder-v1", 'trials' : 4, 'num_timesteps' : 400*1000000 },
|
||||
]
|
||||
})
|
||||
# To reproduce:
|
||||
# python3 baselines/baselines/ppo2/ppo2_run_benchmark.py gce Roboschool8M myrun_ppo2_cpu8
|
||||
# python3 baselines/baselines/ppo2/ppo2_run_benchmark.py gce RoboschoolHarder myrun_ppo2_cpu32_large_samples65536
|
||||
# (Large network, train on 65536 samples each iteration. Also, _large is really necessary only for Harder)
|
||||
|
||||
|
||||
# Other
|
||||
|
||||
_atari50 = [ # actually 49
|
||||
'Alien', 'Amidar', 'Assault', 'Asterix', 'Asteroids',
|
||||
'Atlantis', 'BankHeist', 'BattleZone', 'BeamRider', 'Bowling',
|
||||
@@ -91,3 +129,12 @@ register_benchmark({
|
||||
'description' :'7 Atari games from Mnih et al. (2013), with pixel observations, 40M frames',
|
||||
'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 3, 'num_timesteps' : int(40e6)} for _game in _atari50]
|
||||
})
|
||||
|
||||
def env_shortname(s):
|
||||
"Make typical names above shorter, while keeping recognizable"
|
||||
s = s.replace("NoFrameskip", "")
|
||||
if s[:10]=="Roboschool": s = s[10:]
|
||||
i = s.rfind("-v")
|
||||
if i!=-1: s = s[:i]
|
||||
|
||||
return s.lower()
|
||||
|
@@ -117,7 +117,7 @@ class LoadMonitorResultsError(Exception):
|
||||
def get_monitor_files(dir):
|
||||
return glob(path.join(dir, "*" + Monitor.EXT))
|
||||
|
||||
def load_results(dir):
|
||||
def load_results(dir, raw_episodes=False):
|
||||
fnames = get_monitor_files(dir)
|
||||
if not fnames:
|
||||
raise LoadMonitorResultsError("no monitor files of the form *%s found in %s" % (Monitor.EXT, dir))
|
||||
@@ -137,10 +137,13 @@ def load_results(dir):
|
||||
for header in headers[1:]:
|
||||
assert header['env_id'] == header0['env_id'], "mixing data from two envs"
|
||||
episodes = sorted(episodes, key=lambda e: e['abstime'])
|
||||
return {
|
||||
'env_info': {'env_id': header0['env_id'], 'gym_version': header0['gym_version']},
|
||||
'episode_end_times': [e['abstime'] for e in episodes],
|
||||
'episode_lengths': [e['l'] for e in episodes],
|
||||
'episode_rewards': [e['r'] for e in episodes],
|
||||
'initial_reset_time': min([min(header['t_start'] for header in headers)])
|
||||
}
|
||||
if raw_episodes:
|
||||
return episodes
|
||||
else:
|
||||
return {
|
||||
'env_info': {'env_id': header0['env_id'], 'gym_version': header0['gym_version']},
|
||||
'episode_end_times': [e['abstime'] for e in episodes],
|
||||
'episode_lengths': [e['l'] for e in episodes],
|
||||
'episode_rewards': [e['r'] for e in episodes],
|
||||
'initial_reset_time': min([min(header['t_start'] for header in headers)])
|
||||
}
|
||||
|
@@ -108,7 +108,7 @@ class BernoulliPdType(PdType):
|
||||
# def flatparam(self):
|
||||
# return self.logits
|
||||
# def mode(self):
|
||||
# return U.argmax(self.logits, axis=1)
|
||||
# return U.argmax(self.logits, axis=-1)
|
||||
# def logp(self, x):
|
||||
# return -tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, x)
|
||||
# def kl(self, other):
|
||||
@@ -118,7 +118,7 @@ class BernoulliPdType(PdType):
|
||||
# return tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps)
|
||||
# def sample(self):
|
||||
# u = tf.random_uniform(tf.shape(self.logits))
|
||||
# return U.argmax(self.logits - tf.log(-tf.log(u)), axis=1)
|
||||
# return U.argmax(self.logits - tf.log(-tf.log(u)), axis=-1)
|
||||
|
||||
class CategoricalPd(Pd):
|
||||
def __init__(self, logits):
|
||||
@@ -126,27 +126,33 @@ class CategoricalPd(Pd):
    def flatparam(self):
        return self.logits
    def mode(self):
        return U.argmax(self.logits, axis=1)
        return U.argmax(self.logits, axis=-1)
    def neglogp(self, x):
        return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x)
        # return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x)
        # Note: we can't use sparse_softmax_cross_entropy_with_logits because
        # the implementation does not allow second-order derivatives...
        one_hot_actions = tf.one_hot(x, self.logits.get_shape().as_list()[-1])
        return tf.nn.softmax_cross_entropy_with_logits(
            logits=self.logits,
            labels=one_hot_actions)
    def kl(self, other):
        a0 = self.logits - U.max(self.logits, axis=1, keepdims=True)
        a1 = other.logits - U.max(other.logits, axis=1, keepdims=True)
        a0 = self.logits - U.max(self.logits, axis=-1, keepdims=True)
        a1 = other.logits - U.max(other.logits, axis=-1, keepdims=True)
        ea0 = tf.exp(a0)
        ea1 = tf.exp(a1)
        z0 = U.sum(ea0, axis=1, keepdims=True)
        z1 = U.sum(ea1, axis=1, keepdims=True)
        z0 = U.sum(ea0, axis=-1, keepdims=True)
        z1 = U.sum(ea1, axis=-1, keepdims=True)
        p0 = ea0 / z0
        return U.sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=1)
        return U.sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=-1)
    def entropy(self):
        a0 = self.logits - U.max(self.logits, axis=1, keepdims=True)
        a0 = self.logits - U.max(self.logits, axis=-1, keepdims=True)
        ea0 = tf.exp(a0)
        z0 = U.sum(ea0, axis=1, keepdims=True)
        z0 = U.sum(ea0, axis=-1, keepdims=True)
        p0 = ea0 / z0
        return U.sum(p0 * (tf.log(z0) - a0), axis=1)
        return U.sum(p0 * (tf.log(z0) - a0), axis=-1)
    def sample(self):
        u = tf.random_uniform(tf.shape(self.logits))
        return tf.argmax(self.logits - tf.log(-tf.log(u)), axis=1)
        return tf.argmax(self.logits - tf.log(-tf.log(u)), axis=-1)
    @classmethod
    def fromflat(cls, flat):
        return cls(flat)
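For reference (not part of the diff): besides switching every reduction to `axis=-1`, this hunk routes `neglogp` through a dense one-hot cross-entropy because, per the inline comment, the sparse op does not support second-order derivatives (plausibly needed for ACKTR's natural-gradient computations). Writing `a = logits - max(logits)`, `Z = sum(exp(a))` and `p = exp(a) / Z`, the remaining methods implement the stable forms

```latex
H(p) = -\sum_i p_i \log p_i = \sum_i p_i\,(\log Z - a_i),
\qquad
\mathrm{KL}(p\,\|\,q) = \sum_i p_i\,\big(a_i - \log Z_p - b_i + \log Z_q\big)
```

with `b`, `Z_q` the analogous quantities for `other.logits`; `sample()` uses the Gumbel-max trick, `argmax_i(logit_i - log(-log(u_i)))` with `u_i ~ Uniform(0,1)`, which draws exactly from `softmax(logits)`.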
@@ -177,7 +183,7 @@ class MultiCategoricalPd(Pd):
class DiagGaussianPd(Pd):
    def __init__(self, flat):
        self.flat = flat
        mean, logstd = tf.split(axis=len(flat.get_shape()) - 1, num_or_size_splits=2, value=flat)
        mean, logstd = tf.split(axis=len(flat.shape)-1, num_or_size_splits=2, value=flat)
        self.mean = mean
        self.logstd = logstd
        self.std = tf.exp(logstd)
@@ -186,14 +192,14 @@ class DiagGaussianPd(Pd):
    def mode(self):
        return self.mean
    def neglogp(self, x):
        return 0.5 * U.sum(tf.square((x - self.mean) / self.std), axis=len(x.get_shape()) - 1) \
        return 0.5 * U.sum(tf.square((x - self.mean) / self.std), axis=-1) \
               + 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[-1]) \
               + U.sum(self.logstd, axis=len(x.get_shape()) - 1)
               + U.sum(self.logstd, axis=-1)
    def kl(self, other):
        assert isinstance(other, DiagGaussianPd)
        return U.sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / (2.0 * tf.square(other.std)) - 0.5, axis=-1)
    def entropy(self):
        return U.sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), -1)
        return U.sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), axis=-1)
    def sample(self):
        return self.mean + self.std * tf.random_normal(tf.shape(self.mean))
    @classmethod
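Likewise for the diagonal Gaussian (reference only, not in the diff): with per-dimension mean `mu_i`, standard deviation `sigma_i = exp(logstd_i)` and `d` dimensions, the methods above compute

```latex
-\log p(x) = \tfrac{1}{2}\sum_i \Big(\tfrac{x_i-\mu_i}{\sigma_i}\Big)^2 + \tfrac{d}{2}\log(2\pi) + \sum_i \log \sigma_i,
\qquad
\mathrm{KL}(p\,\|\,q) = \sum_i \Big[\log\tfrac{\sigma_{q,i}}{\sigma_{p,i}} + \tfrac{\sigma_{p,i}^2 + (\mu_{p,i}-\mu_{q,i})^2}{2\sigma_{q,i}^2} - \tfrac{1}{2}\Big],
\qquad
H(p) = \sum_i \Big[\log \sigma_i + \tfrac{1}{2}\log(2\pi e)\Big]
```

with the `axis=-1` change reducing each expression over the last (action) dimension.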
@@ -209,11 +215,11 @@ class BernoulliPd(Pd):
    def mode(self):
        return tf.round(self.ps)
    def neglogp(self, x):
        return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=1)
        return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=-1)
    def kl(self, other):
        return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=1) - U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=1)
        return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=-1) - U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1)
    def entropy(self):
        return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=1)
        return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1)
    def sample(self):
        u = tf.random_uniform(tf.shape(self.ps))
        return tf.to_float(math_ops.less(u, self.ps))
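And for the Bernoulli case (reference only): with component probabilities `p_i = sigmoid(logit_i)`, the `sigmoid_cross_entropy_with_logits` calls above evaluate

```latex
-\log P(x) = -\sum_i \big[x_i \log p_i + (1-x_i)\log(1-p_i)\big],
\qquad
H = -\sum_i \big[p_i \log p_i + (1-p_i)\log(1-p_i)\big],
\qquad
\mathrm{KL}(p\,\|\,q) = \sum_i \Big[p_i \log\tfrac{p_i}{q_i} + (1-p_i)\log\tfrac{1-p_i}{1-q_i}\Big]
```

with the KL formed as the cross-entropy of `self.ps` against `other.logits` minus the entropy term.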
@@ -286,4 +292,3 @@ def validate_probtype(probtype, pdparam):
    klval_ll = - entval - logliks.mean() #pylint: disable=E1101
    klval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101
    assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr # within 3 sigmas


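The check above leans on a standard identity, spelled out here for clarity (`klval`, `entval` and `logliks` are defined just above this hunk): for samples `x ~ q`,

```latex
\mathrm{KL}(q\,\|\,p) = \mathbb{E}_{x\sim q}\big[\log q(x) - \log p(x)\big] = -H(q) - \mathbb{E}_{x\sim q}\big[\log p(x)\big]
```

so `klval_ll` is a Monte-Carlo estimate of the analytic KL, and the assert requires agreement within three standard errors of the mean.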
19
baselines/common/vec_env/__init__.py
Normal file
@@ -0,0 +1,19 @@
class VecEnv(object):
    """
    Vectorized environment base class
    """
    def step(self, vac):
        """
        Apply sequence of actions to sequence of environments
        actions -> (observations, rewards, news)

        where 'news' is a boolean vector indicating whether each element is new.
        """
        raise NotImplementedError
    def reset(self):
        """
        Reset all environments
        """
        raise NotImplementedError
    def close(self):
        pass
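A minimal in-process implementation may make the contract concrete (a sketch for illustration only, not part of the commit; it returns the three-tuple described in the docstring, whereas `SubprocVecEnv` in the next file additionally returns per-env infos):

```python
import numpy as np

class SimpleVecEnv(VecEnv):
    """Runs the wrapped envs serially in the current process -- handy for debugging."""
    def __init__(self, env_fns):
        self.envs = [fn() for fn in env_fns]
        self.action_space = self.envs[0].action_space
        self.observation_space = self.envs[0].observation_space

    def step(self, vac):
        obs, rews, news = [], [], []
        for env, a in zip(self.envs, vac):
            ob, rew, done, _ = env.step(a)
            if done:
                ob = env.reset()  # auto-reset, mirroring the worker() loop below
            obs.append(ob); rews.append(rew); news.append(done)
        return np.stack(obs), np.array(rews), np.array(news)

    def reset(self):
        return np.stack([env.reset() for env in self.envs])
```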
74
baselines/common/vec_env/subproc_vec_env.py
Normal file
@@ -0,0 +1,74 @@
import numpy as np
from multiprocessing import Process, Pipe
from baselines.common.vec_env import VecEnv

def worker(remote, env_fn_wrapper):
    env = env_fn_wrapper.x()
    while True:
        cmd, data = remote.recv()
        if cmd == 'step':
            ob, reward, done, info = env.step(data)
            if done:
                ob = env.reset()
            remote.send((ob, reward, done, info))
        elif cmd == 'reset':
            ob = env.reset()
            remote.send(ob)
        elif cmd == 'close':
            remote.close()
            break
        elif cmd == 'get_spaces':
            remote.send((env.action_space, env.observation_space))
        else:
            raise NotImplementedError

class CloudpickleWrapper(object):
    """
    Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle)
    """
    def __init__(self, x):
        self.x = x
    def __getstate__(self):
        import cloudpickle
        return cloudpickle.dumps(self.x)
    def __setstate__(self, ob):
        import pickle
        self.x = pickle.loads(ob)

class SubprocVecEnv(VecEnv):
    def __init__(self, env_fns):
        """
        envs: list of gym environments to run in subprocesses
        """
        nenvs = len(env_fns)
        self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)])
        self.ps = [Process(target=worker, args=(work_remote, CloudpickleWrapper(env_fn)))
            for (work_remote, env_fn) in zip(self.work_remotes, env_fns)]
        for p in self.ps:
            p.start()

        self.remotes[0].send(('get_spaces', None))
        self.action_space, self.observation_space = self.remotes[0].recv()


    def step(self, actions):
        for remote, action in zip(self.remotes, actions):
            remote.send(('step', action))
        results = [remote.recv() for remote in self.remotes]
        obs, rews, dones, infos = zip(*results)
        return np.stack(obs), np.stack(rews), np.stack(dones), infos

    def reset(self):
        for remote in self.remotes:
            remote.send(('reset', None))
        return np.stack([remote.recv() for remote in self.remotes])

    def close(self):
        for remote in self.remotes:
            remote.send(('close', None))
        for p in self.ps:
            p.join()

    @property
    def num_envs(self):
        return len(self.remotes)
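A hedged usage sketch for `SubprocVecEnv` (not in the commit; the environment id and seeds are placeholders):

```python
import gym
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

def make_env(seed):
    def _thunk():
        env = gym.make("CartPole-v0")   # placeholder environment
        env.seed(seed)
        return env
    return _thunk                        # a closure, serialized via CloudpickleWrapper

venv = SubprocVecEnv([make_env(i) for i in range(4)])
obs = venv.reset()                                     # stacked: (4, obs_dim)
actions = [venv.action_space.sample() for _ in range(4)]
obs, rews, dones, infos = venv.step(actions)           # one transition per worker
venv.close()
```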
@@ -15,7 +15,7 @@ python -m baselines.deepq.experiments.enjoy_cartpole
```


Be sure to check out the source code of [both](experiments/train_cartpole.py) [files](experiments/enjoy_cartpole.py)!
Be sure to check out the source code of [both](baselines/deepq/experiments/train_cartpole.py) [files](baselines/deepq/experiments/enjoy_cartpole.py)!

## If you wish to apply DQN to solve a problem.

@@ -49,4 +49,4 @@ Once you pick a model, you can download it and visualize the learned policy. Be
python -m baselines.deepq.experiments.atari.download_model --blob model-atari-duel-pong-1 --model-dir /tmp/models
python -m baselines.deepq.experiments.atari.enjoy --model-dir /tmp/models/model-atari-duel-pong-1 --env Pong --dueling

```
```
@@ -43,7 +43,6 @@ def parse_args():
    parser.add_argument("--target-update-freq", type=int, default=40000, help="number of iterations between every target network update")
    parser.add_argument("--param-noise-update-freq", type=int, default=50, help="number of iterations between every re-scaling of the parameter noise")
    parser.add_argument("--param-noise-reset-freq", type=int, default=10000, help="maximum number of steps to take per episode before re-perturbing the exploration policy")
    parser.add_argument("--param-noise-threshold", type=float, default=0.05, help="the desired KL divergence between perturbed and non-perturbed policy. set to < 0 to use a KL divergence relative to the eps-greedy exploration")
    # Bells and whistles
    boolean_flag(parser, "double-q", default=True, help="whether or not to use double q learning")
    boolean_flag(parser, "dueling", default=False, help="whether or not to use dueling model")
@@ -202,14 +201,11 @@ if __name__ == '__main__':
                reset = True

                update_eps = 0.01 # ensures that we cannot get stuck completely
                if args.param_noise_threshold >= 0.:
                    update_param_noise_threshold = args.param_noise_threshold
                else:
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = -np.log(1. - exploration.value(num_iters) + exploration.value(num_iters) / float(env.action_space.n))
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = -np.log(1. - exploration.value(num_iters) + exploration.value(num_iters) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = (num_iters % args.param_noise_update_freq == 0)

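The hard-coded formula that replaces the `--param-noise-threshold` flag matches a simple identity, noted here for context (not part of the diff): the KL divergence between a greedy policy and its epsilon-greedy counterpart over `|A|` actions is

```latex
D_{\mathrm{KL}}\big(\pi_{\text{greedy}} \,\|\, \pi_{\epsilon\text{-greedy}}\big) = -\log\!\Big(1 - \epsilon + \frac{\epsilon}{|A|}\Big)
```

so with `eps = exploration.value(num_iters)` and `|A| = env.action_space.n`, the parameter-noise scale tracks the current exploration schedule (cf. Appendix C.1 of Plappert et al., 2017) instead of a user-supplied constant.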
@@ -95,7 +95,6 @@ def learn(env,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          param_noise=False,
          param_noise_threshold=0.05,
          callback=None):
    """Train a deepq model.

@@ -225,14 +224,11 @@ def learn(env,
            update_param_noise_threshold = 0.
        else:
            update_eps = 0.
            if param_noise_threshold >= 0.:
                update_param_noise_threshold = param_noise_threshold
            else:
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n))
            # Compute the threshold such that the KL divergence between perturbed and non-perturbed
            # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
            # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
            # for detailed explanation.
            update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n))
            kwargs['reset'] = reset
            kwargs['update_param_noise_threshold'] = update_param_noise_threshold
            kwargs['update_param_noise_scale'] = True

@@ -1,13 +1,3 @@
"""

See README.md for a description of the logging API.

OFF state corresponds to having Logger.CURRENT == Logger.DEFAULT
ON state is otherwise

"""

from collections import OrderedDict
import os
import sys
import shutil
@@ -17,7 +7,7 @@ import time
import datetime
import tempfile

LOG_OUTPUT_FORMATS = ['stdout', 'log', 'json', 'tensorboard']
LOG_OUTPUT_FORMATS = ['stdout', 'log', 'json']

DEBUG = 10
INFO = 20
@@ -49,9 +39,12 @@ class HumanOutputFormat(OutputFormat):

    def writekvs(self, kvs):
        # Create strings for printing
        key2str = OrderedDict()
        key2str = {}
        for (key, val) in kvs.items():
            valstr = '%-8.3g' % (val,) if hasattr(val, '__float__') else val
            if isinstance(val, float):
                valstr = '%-8.3g' % (val,)
            else:
                valstr = str(val)
            key2str[self._truncate(key)] = self._truncate(valstr)

        # Find max widths
@@ -61,7 +54,7 @@ class HumanOutputFormat(OutputFormat):
        # Write out the data
        dashes = '-' * (keywidth + valwidth + 7)
        lines = [dashes]
        for (key, val) in key2str.items():
        for (key, val) in sorted(key2str.items()):
            lines.append('| %s%s | %s%s |' % (
                key,
                ' ' * (keywidth - len(key)),
@@ -150,7 +143,6 @@ def make_output_format(format, ev_dir):
# API
# ================================================================


def logkv(key, val):
    """
    Log a value of some diagnostic
@@ -158,6 +150,12 @@ def logkv(key, val):
    """
    Logger.CURRENT.logkv(key, val)

def logkvs(d):
    """
    Log a dictionary of key-value pairs
    """
    for (k, v) in d.items():
        logkv(k, v)

def dumpkvs():
    """
@@ -168,10 +166,8 @@ def dumpkvs():
    """
    Logger.CURRENT.dumpkvs()


# for backwards compatibility
record_tabular = logkv
dump_tabular = dumpkvs
def getkvs():
    return Logger.CURRENT.name2val


def log(*args, level=INFO):
@@ -203,7 +199,6 @@ def set_level(level):
    """
    Logger.CURRENT.set_level(level)


def get_dir():
    """
    Get directory that log files are being written to.
@@ -211,18 +206,20 @@ def get_dir():
    """
    return Logger.CURRENT.get_dir()

record_tabular = logkv
dump_tabular = dumpkvs

# ================================================================
# Backend
# ================================================================


class Logger(object):
    DEFAULT = None # A logger with no output files. (See right below class definition)
                   # So that you can still log to the terminal without setting up any output files
    CURRENT = None # Current logger being used by the free functions above

    def __init__(self, dir, output_formats):
        self.name2val = OrderedDict() # values this iteration
        self.name2val = {} # values this iteration
        self.level = INFO
        self.dir = dir
        self.output_formats = output_formats
@@ -233,6 +230,7 @@ class Logger(object):
        self.name2val[key] = val

    def dumpkvs(self):
        if self.level == DISABLED: return
        for fmt in self.output_formats:
            fmt.writekvs(self.name2val)
        self.name2val.clear()
@@ -259,57 +257,30 @@ class Logger(object):
        for fmt in self.output_formats:
            fmt.writeseq(args)

Logger.DEFAULT = Logger.CURRENT = Logger(dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

def configure(dir=None, format_strs=None):
    assert Logger.CURRENT is Logger.DEFAULT,\
        "Only call logger.configure() when it's in the default state. Try calling logger.reset() first."
    prevlogger = Logger.CURRENT
    if dir is None:
        dir = os.getenv('OPENAI_LOGDIR')
    if dir is None:
        dir = osp.join(tempfile.gettempdir(),
            datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"))
    if format_strs is None:
        format_strs = LOG_OUTPUT_FORMATS
    output_formats = [make_output_format(f, dir) for f in format_strs]
    Logger.CURRENT = Logger(dir=dir, output_formats=output_formats)
    log('Logging to %s'%dir)

def reset():
    Logger.CURRENT = Logger.DEFAULT
    log('Reset logger')


# ================================================================

Logger.DEFAULT = Logger(output_formats=[HumanOutputFormat(sys.stdout)], dir=None)
Logger.CURRENT = Logger.DEFAULT


class session(object):
    """
    Context manager that sets up the loggers for an experiment.
    """

    CURRENT = None # Set to a LoggerContext object using enter/exit or context manager

    def __init__(self, dir=None, format_strs=None):
        if dir is None:
            dir = os.getenv('OPENAI_LOGDIR')
        if dir is None:
            dir = osp.join(tempfile.gettempdir(),
                datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"))
        self.dir = dir
        if format_strs is None:
            format_strs = LOG_OUTPUT_FORMATS
        output_formats = [make_output_format(f, dir) for f in format_strs]
        Logger.CURRENT = Logger(dir=dir, output_formats=output_formats)
        print('Logging to', dir)

    def __enter__(self):
        os.makedirs(self.evaluation_dir(), exist_ok=True)
        output_formats = [make_output_format(f, self.evaluation_dir())
                          for f in LOG_OUTPUT_FORMATS]
        Logger.CURRENT = Logger(dir=self.dir, output_formats=output_formats)
        os.environ['OPENAI_LOGDIR'] = self.evaluation_dir()

    def __exit__(self, *args):
        Logger.CURRENT.close()
        Logger.CURRENT = Logger.DEFAULT

    def evaluation_dir(self):
        return self.dir

def _setup():
    logdir = os.getenv('OPENAI_LOGDIR')
    if logdir:
        session(logdir).__enter__()

_setup()

# ================================================================


def _demo():
    info("hi")
    debug("shouldn't appear")
@@ -319,19 +290,19 @@ def _demo():
    if os.path.exists(dir):
        shutil.rmtree(dir)
    with session(dir=dir):
        record_tabular("a", 3)
        record_tabular("b", 2.5)
        dump_tabular()
        record_tabular("b", -2.5)
        record_tabular("a", 5.5)
        dump_tabular()
        logkv("a", 3)
        logkv("b", 2.5)
        dumpkvs()
        logkv("b", -2.5)
        logkv("a", 5.5)
        dumpkvs()
        info("^^^ should see a = 5.5")

        record_tabular("b", -2.5)
        dump_tabular()
        logkv("b", -2.5)
        dumpkvs()

        record_tabular("a", "longasslongasslongasslongasslongasslongassvalue")
        dump_tabular()
        logkv("a", "longasslongasslongasslongasslongasslongassvalue")
        dumpkvs()


if __name__ == "__main__":

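A hedged usage sketch mirroring the `_demo` flow above (the directory is a placeholder; `logger.configure(dir=...)` appears in the same diff as a non-context-manager alternative):

```python
# Sketch only -- follows the _demo() pattern shown above.
from baselines import logger

with logger.session(dir="/tmp/my-run"):      # placeholder directory
    for i in range(3):
        logger.logkv("iteration", i)
        logger.logkv("reward", 1.5 * i)
        logger.dumpkvs()                      # record_tabular/dump_tabular remain as aliases
```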
@@ -22,7 +22,6 @@ def train(env_id, num_timesteps, seed, num_cpu):
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    logger.session().__enter__()
    if rank != 0: logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)

@@ -9,7 +9,6 @@ import sys
def train(env_id, num_timesteps, seed):
    from baselines.pposgd import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    logger.session().__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space):

@@ -24,7 +24,6 @@ def train(env_id, num_timesteps, seed, num_cpu):
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    logger.session().__enter__()
    if rank != 0:
        logger.set_level(logger.DISABLED)


@@ -19,7 +19,6 @@ def train(env_id, num_timesteps, seed):
    if whoami == "parent":
        return
    import baselines.common.tf_util as U
    logger.session().__enter__()
    sess = U.single_threaded_session()
    sess.__enter__()
