From 9070ee7ef36b66ea1d9ab48457b86fba85d2e68c Mon Sep 17 00:00:00 2001 From: pzhokhov Date: Tue, 11 Sep 2018 11:01:51 -0700 Subject: [PATCH] tighten flake8, autopep8 to fix trailing whitespaces and blank lines with whitespaces (#87) --- baselines/a2c/a2c.py | 16 +++++----- baselines/a2c/runner.py | 4 +-- baselines/acer/acer.py | 32 +++++++++---------- baselines/bench/__init__.py | 2 +- baselines/bench/monitor.py | 2 +- baselines/common/cg.py | 2 +- baselines/common/cmd_util.py | 2 +- baselines/common/console_util.py | 2 +- baselines/common/input.py | 18 +++++------ baselines/common/math_util.py | 2 +- baselines/common/mpi_adam.py | 2 +- baselines/common/mpi_fork.py | 2 +- baselines/common/mpi_moments.py | 4 +-- baselines/common/policies.py | 18 +++++------ baselines/common/running_mean_std.py | 32 +++++++++---------- .../common/tests/envs/fixed_sequence_env.py | 2 +- baselines/common/tests/envs/mnist_env.py | 4 +-- baselines/common/tests/test_cartpole.py | 6 ++-- baselines/common/tests/test_doc_examples.py | 8 ++--- baselines/common/tests/test_fixed_sequence.py | 6 ++-- baselines/common/tests/test_identity.py | 4 +-- baselines/common/tests/test_mnist.py | 16 +++++----- baselines/common/tests/test_serialization.py | 18 +++++------ baselines/common/tests/util.py | 4 +-- baselines/common/tf_util.py | 2 +- baselines/common/vec_env/dummy_vec_env.py | 6 ++-- baselines/ddpg/memory.py | 2 +- baselines/ddpg/models.py | 4 +-- baselines/deepq/deepq.py | 8 ++--- .../deepq/experiments/enjoy_mountaincar.py | 2 +- baselines/deepq/experiments/train_pong.py | 2 +- baselines/deepq/models.py | 6 ++-- baselines/deepq/utils.py | 10 +++--- baselines/her/experiment/play.py | 2 +- baselines/her/experiment/plot.py | 4 +-- baselines/ppo1/run_humanoid.py | 24 +++++++------- baselines/ppo2/ppo2.py | 26 +++++++-------- baselines/results_plotter.py | 2 +- baselines/run.py | 4 +-- baselines/trpo_mpi/defaults.py | 4 +-- baselines/trpo_mpi/trpo_mpi.py | 32 +++++++++---------- setup.cfg | 2 +- setup.py | 2 +- 43 files changed, 176 insertions(+), 176 deletions(-) diff --git a/baselines/a2c/a2c.py b/baselines/a2c/a2c.py index 729a58b..d085040 100644 --- a/baselines/a2c/a2c.py +++ b/baselines/a2c/a2c.py @@ -97,21 +97,21 @@ def learn( load_path=None, **network_kwargs): - ''' + ''' Main entrypoint for A2C algorithm. Train a policy with given network architecture on a given environment using a2c algorithm. Parameters: ----------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) - specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns + specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See baselines.common/policies.py/lstm for more details on using recurrent nets in policies - + env: RL environment. Should implement interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py) - + seed: seed to make random number sequence in the alorightm reproducible. 
By default is None which means seed from system noise generator (not reproducible) @@ -128,7 +128,7 @@ def learn( lr: float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4) - lrschedule: schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and + lrschedule: schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and returns fraction of the learning rate (specified as lr) as output epsilon: float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5) @@ -140,17 +140,17 @@ def learn( log_interval: int, specifies how frequently the logs are printed out (default: 100) **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network - For instance, 'mlp' network architecture has arguments num_hidden and num_layers. + For instance, 'mlp' network architecture has arguments num_hidden and num_layers. ''' - + set_global_seeds(seed) nenvs = env.num_envs policy = build_policy(env, network, **network_kwargs) - + model = Model(policy=policy, env=env, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule) if load_path is not None: diff --git a/baselines/a2c/runner.py b/baselines/a2c/runner.py index 60b5e1d..f03e0d9 100644 --- a/baselines/a2c/runner.py +++ b/baselines/a2c/runner.py @@ -9,7 +9,7 @@ class Runner(AbstractEnvRunner): self.gamma = gamma self.batch_action_shape = [x if x is not None else -1 for x in model.train_model.action.shape.as_list()] self.ob_dtype = model.train_model.X.dtype.as_numpy_dtype - + def run(self): mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[] mb_states = self.states @@ -51,7 +51,7 @@ class Runner(AbstractEnvRunner): rewards = discount_with_dones(rewards, dones, self.gamma) mb_rewards[n] = rewards - + mb_actions = mb_actions.reshape(self.batch_action_shape) mb_rewards = mb_rewards.flatten() diff --git a/baselines/acer/acer.py b/baselines/acer/acer.py index 4a865f1..4e2e00f 100644 --- a/baselines/acer/acer.py +++ b/baselines/acer/acer.py @@ -70,7 +70,7 @@ class Model(object): MU = tf.placeholder(tf.float32, [nbatch, nact]) # mu's LR = tf.placeholder(tf.float32, []) eps = 1e-6 - + step_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs,) + ob_space.shape[:-1] + (ob_space.shape[-1] * nstack,)) train_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs*(nsteps+1),) + ob_space.shape[:-1] + (ob_space.shape[-1] * nstack,)) with tf.variable_scope('acer_model', reuse=tf.AUTO_REUSE): @@ -78,7 +78,7 @@ class Model(object): step_model = policy(observ_placeholder=step_ob_placeholder, sess=sess) train_model = policy(observ_placeholder=train_ob_placeholder, sess=sess) - + params = find_trainable_variables("acer_model") print("Params {}".format(len(params))) for var in params: @@ -97,10 +97,10 @@ class Model(object): polyak_model = policy(observ_placeholder=train_ob_placeholder, sess=sess) # Notation: (var) = batch variable, (var)s = seqeuence variable, (var)_i = variable index by action at step i - + # action probability distributions according to train_model, polyak_model and step_model # poilcy.pi is probability distribution parameters; to obtain distribution that 
sums to 1 need to take softmax - train_model_p = tf.nn.softmax(train_model.pi) + train_model_p = tf.nn.softmax(train_model.pi) polyak_model_p = tf.nn.softmax(polyak_model.pi) step_model_p = tf.nn.softmax(step_model.pi) v = tf.reduce_sum(train_model_p * train_model.q, axis = -1) # shape is [nenvs * (nsteps + 1)] @@ -119,7 +119,7 @@ class Model(object): qret = q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma) # Calculate losses - # Entropy + # Entropy # entropy = tf.reduce_mean(strip(train_model.pd.entropy(), nenvs, nsteps)) entropy = tf.reduce_mean(cat_entropy_softmax(f)) @@ -212,8 +212,8 @@ class Model(object): def _step(observation, **kwargs): return step_model._evaluate([step_model.action, step_model_p, step_model.state], observation, **kwargs) - - + + self.train = train self.save = functools.partial(save_variables, sess=sess, variables=params) @@ -283,18 +283,18 @@ def learn(network, env, seed=None, nsteps=20, nstack=4, total_timesteps=int(80e6 ---------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) - specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns + specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See baselines.common/policies.py/lstm for more details on using recurrent nets in policies - env: environment. Needs to be vectorized for parallel environment simulation. + env: environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. nsteps: int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) (default: 20) - nstack: int, size of the frame stack, i.e. number of the frames passed to the step model. Frames are stacked along channel dimension + nstack: int, size of the frame stack, i.e. number of the frames passed to the step model. Frames are stacked along channel dimension (last image dimension) (default: 4) total_timesteps: int, number of timesteps (i.e. number of actions taken in the environment) (default: 80M) @@ -303,11 +303,11 @@ def learn(network, env, seed=None, nsteps=20, nstack=4, total_timesteps=int(80e6 ent_coef: float, policy entropy coefficient in the optimization objective (default: 0.01) - max_grad_norm: float, gradient norm clipping coefficient. If set to None, no clipping. (default: 10), - + max_grad_norm: float, gradient norm clipping coefficient. If set to None, no clipping. (default: 10), + lr: float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4) - lrschedule: schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and + lrschedule: schedule of learning rate. 
Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and returns fraction of the learning rate (specified as lr) as output rprop_epsilon: float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5) @@ -325,17 +325,17 @@ def learn(network, env, seed=None, nsteps=20, nstack=4, total_timesteps=int(80e6 replay_start: int, the sampling from the replay buffer does not start until replay buffer has at least that many samples (default: 10k) c: float, importance weight clipping factor (default: 10) - + trust_region bool, whether or not algorithms estimates the gradient KL divergence between the old and updated policy and uses it to determine step size (default: True) delta: float, max KL divergence between the old policy and updated policy (default: 1) - alpha: float, momentum factor in the Polyak (exponential moving average) averaging of the model parameters (default: 0.99) + alpha: float, momentum factor in the Polyak (exponential moving average) averaging of the model parameters (default: 0.99) load_path: str, path to load the model from (default: None) **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network - For instance, 'mlp' network architecture has arguments num_hidden and num_layers. + For instance, 'mlp' network architecture has arguments num_hidden and num_layers. ''' diff --git a/baselines/bench/__init__.py b/baselines/bench/__init__.py index 4fd3874..4cbd5bb 100644 --- a/baselines/bench/__init__.py +++ b/baselines/bench/__init__.py @@ -1,2 +1,2 @@ from baselines.bench.benchmarks import * -from baselines.bench.monitor import * \ No newline at end of file +from baselines.bench.monitor import * diff --git a/baselines/bench/monitor.py b/baselines/bench/monitor.py index bb0c282..8024ea0 100644 --- a/baselines/bench/monitor.py +++ b/baselines/bench/monitor.py @@ -102,7 +102,7 @@ def get_monitor_files(dir): def load_results(dir): import pandas monitor_files = ( - glob(osp.join(dir, "*monitor.json")) + + glob(osp.join(dir, "*monitor.json")) + glob(osp.join(dir, "*monitor.csv"))) # get both csv and (old) json files if not monitor_files: raise LoadMonitorResultsError("no monitor files of the form *%s found in %s" % (Monitor.EXT, dir)) diff --git a/baselines/common/cg.py b/baselines/common/cg.py index a913186..52ca49d 100644 --- a/baselines/common/cg.py +++ b/baselines/common/cg.py @@ -31,4 +31,4 @@ def cg(f_Ax, b, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10): if callback is not None: callback(x) if verbose: print(fmtstr % (i+1, rdotr, np.linalg.norm(x))) # pylint: disable=W0631 - return x \ No newline at end of file + return x diff --git a/baselines/common/cmd_util.py b/baselines/common/cmd_util.py index 2f6c21a..cb4f054 100644 --- a/baselines/common/cmd_util.py +++ b/baselines/common/cmd_util.py @@ -29,7 +29,7 @@ def make_vec_env(env_id, env_type, num_env, seed, wrapper_kwargs=None, start_ind def _thunk(): env = make_atari(env_id) if env_type == 'atari' else gym.make(env_id) env.seed(seed + 10000*mpi_rank + rank if seed is not None else None) - env = Monitor(env, + env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(mpi_rank) + '.' 
+ str(rank)), allow_early_resets=True) diff --git a/baselines/common/console_util.py b/baselines/common/console_util.py index e1237f2..a7e94c0 100644 --- a/baselines/common/console_util.py +++ b/baselines/common/console_util.py @@ -2,7 +2,7 @@ from __future__ import print_function from contextlib import contextmanager import numpy as np import time -import shlex +import shlex import subprocess # ================================================================ diff --git a/baselines/common/input.py b/baselines/common/input.py index dff9480..7d51008 100644 --- a/baselines/common/input.py +++ b/baselines/common/input.py @@ -2,15 +2,15 @@ import tensorflow as tf from gym.spaces import Discrete, Box def observation_placeholder(ob_space, batch_size=None, name='Ob'): - ''' + ''' Create placeholder to feed observations into of the size appropriate to the observation space - + Parameters: ---------- ob_space: gym.Space observation space - - batch_size: int size of the batch to be fed into input. Can be left None in most cases. + + batch_size: int size of the batch to be fed into input. Can be left None in most cases. name: str name of the placeholder @@ -27,9 +27,9 @@ def observation_placeholder(ob_space, batch_size=None, name='Ob'): def observation_input(ob_space, batch_size=None, name='Ob'): - ''' - Create placeholder to feed observations into of the size appropriate to the observation space, and add input - encoder of the appropriate type. + ''' + Create placeholder to feed observations into of the size appropriate to the observation space, and add input + encoder of the appropriate type. ''' placeholder = observation_placeholder(ob_space, batch_size, name) @@ -41,9 +41,9 @@ def encode_observation(ob_space, placeholder): Parameters: ---------- - + ob_space: gym.Space observation space - + placeholder: tf.placeholder observation input placeholder ''' if isinstance(ob_space, Discrete): diff --git a/baselines/common/math_util.py b/baselines/common/math_util.py index 36b8927..461bdb7 100644 --- a/baselines/common/math_util.py +++ b/baselines/common/math_util.py @@ -82,4 +82,4 @@ def test_discount_with_boundaries(): 2 + gamma * 3, 3, 4 - ]) \ No newline at end of file + ]) diff --git a/baselines/common/mpi_adam.py b/baselines/common/mpi_adam.py index 4902caf..17491d7 100644 --- a/baselines/common/mpi_adam.py +++ b/baselines/common/mpi_adam.py @@ -76,4 +76,4 @@ def test_MpiAdam(): for i in range(10): l,g = lossandgrad() adam.update(g, stepsize) - print(i,l) \ No newline at end of file + print(i,l) diff --git a/baselines/common/mpi_fork.py b/baselines/common/mpi_fork.py index c5e609e..07b555e 100644 --- a/baselines/common/mpi_fork.py +++ b/baselines/common/mpi_fork.py @@ -4,7 +4,7 @@ def mpi_fork(n, bind_to_core=False): """Re-launches the current script with workers Returns "parent" for original parent, "child" for MPI children """ - if n<=1: + if n<=1: return "child" if os.getenv("IN_MPI") is None: env = os.environ.copy() diff --git a/baselines/common/mpi_moments.py b/baselines/common/mpi_moments.py index 7fcc6cd..7a97a43 100644 --- a/baselines/common/mpi_moments.py +++ b/baselines/common/mpi_moments.py @@ -33,8 +33,8 @@ def mpi_moments(x, axis=0, comm=None, keepdims=False): def test_runningmeanstd(): import subprocess - subprocess.check_call(['mpirun', '-np', '3', - 'python','-c', + subprocess.check_call(['mpirun', '-np', '3', + 'python','-c', 'from baselines.common.mpi_moments import _helper_runningmeanstd; _helper_runningmeanstd()']) def _helper_runningmeanstd(): diff --git 
a/baselines/common/policies.py b/baselines/common/policies.py index 46207b5..6071ad2 100644 --- a/baselines/common/policies.py +++ b/baselines/common/policies.py @@ -32,7 +32,7 @@ class PolicyWithValue(object): **tensors tensorflow tensors for additional attributes such as state or mask """ - + self.X = observations self.state = tf.constant([]) self.initial_state = None @@ -85,7 +85,7 @@ class PolicyWithValue(object): ------- (action, value estimate, next state, negative log likelihood of the action under current policy parameters) tuple """ - + a, v, state, neglogp = self._evaluate([self.action, self.vf, self.state, self.neglogp], observation, **extra_feed) if state.size == 0: state = None @@ -106,14 +106,14 @@ class PolicyWithValue(object): ------- value estimate """ - return self._evaluate(self.vf, ob, *args, **kwargs) + return self._evaluate(self.vf, ob, *args, **kwargs) def save(self, save_path): tf_util.save_state(save_path, sess=self.sess) def load(self, load_path): tf_util.load_state(load_path, sess=self.sess) - + def build_policy(env, policy_network, value_network=None, normalize_observations=False, estimate_q=False, **policy_kwargs): if isinstance(policy_network, str): network_type = policy_network @@ -123,7 +123,7 @@ def build_policy(env, policy_network, value_network=None, normalize_observation ob_space = env.observation_space X = observ_placeholder if observ_placeholder is not None else observation_placeholder(ob_space, batch_size=nbatch) - + extra_tensors = {} if normalize_observations and X.dtype == tf.float32: @@ -144,7 +144,7 @@ def build_policy(env, policy_network, value_network=None, normalize_observation policy_latent, recurrent_tensors = policy_network(encoded_x, nenv) extra_tensors.update(recurrent_tensors) - + _v_net = value_network if _v_net is None or _v_net == 'shared': @@ -154,10 +154,10 @@ def build_policy(env, policy_network, value_network=None, normalize_observation _v_net = policy_network else: assert callable(_v_net) - + with tf.variable_scope('vf', reuse=tf.AUTO_REUSE): vf_latent, _ = _v_net(encoded_x) - + policy = PolicyWithValue( env=env, observations=X, @@ -176,4 +176,4 @@ def _normalize_clip_observation(x, clip_range=[-5.0, 5.0]): rms = RunningMeanStd(shape=x.shape[1:]) norm_x = tf.clip_by_value((x - rms.mean) / rms.std, min(clip_range), max(clip_range)) return norm_x, rms - + diff --git a/baselines/common/running_mean_std.py b/baselines/common/running_mean_std.py index 504c7c9..443aa74 100644 --- a/baselines/common/running_mean_std.py +++ b/baselines/common/running_mean_std.py @@ -23,15 +23,15 @@ def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var, delta = batch_mean - mean tot_count = count + batch_count - new_mean = mean + delta * batch_count / tot_count + new_mean = mean + delta * batch_count / tot_count m_a = var * count m_b = batch_var * batch_count M2 = m_a + m_b + np.square(delta) * count * batch_count / (count + batch_count) new_var = M2 / (count + batch_count) new_count = batch_count + count - + return new_mean, new_var, new_count - + class TfRunningMeanStd(object): # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm @@ -46,10 +46,10 @@ class TfRunningMeanStd(object): self._new_var = tf.placeholder(shape=shape, dtype=tf.float64) self._new_count = tf.placeholder(shape=(), dtype=tf.float64) - + with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): self._mean = tf.get_variable('mean', initializer=np.zeros(shape, 'float64'), dtype=tf.float64) - self._var = tf.get_variable('std', 
initializer=np.ones(shape, 'float64'), dtype=tf.float64) + self._var = tf.get_variable('std', initializer=np.ones(shape, 'float64'), dtype=tf.float64) self._count = tf.get_variable('count', initializer=np.full((), epsilon, 'float64'), dtype=tf.float64) self.update_ops = tf.group([ @@ -61,10 +61,10 @@ class TfRunningMeanStd(object): sess.run(tf.variables_initializer([self._mean, self._var, self._count])) self.sess = sess self._set_mean_var_count() - + def _set_mean_var_count(self): - self.mean, self.var, self.count = self.sess.run([self._mean, self._var, self._count]) - + self.mean, self.var, self.count = self.sess.run([self._mean, self._var, self._count]) + def update(self, x): batch_mean = np.mean(x, axis=0) batch_var = np.var(x, axis=0) @@ -74,13 +74,13 @@ class TfRunningMeanStd(object): self.sess.run(self.update_ops, feed_dict={ self._new_mean: new_mean, - self._new_var: new_var, + self._new_var: new_var, self._new_count: new_count }) self._set_mean_var_count() - + def test_runningmeanstd(): for (x1, x2, x3) in [ @@ -145,7 +145,7 @@ def profile_tf_runningmeanstd(): print('rms update time ({} trials): {} s'.format(n_trials, tic2 - tic1)) print('tfrms update time ({} trials): {} s'.format(n_trials, tic3 - tic2)) - + tic1 = time.time() for _ in range(n_trials): @@ -161,21 +161,21 @@ def profile_tf_runningmeanstd(): print('rms get mean time ({} trials): {} s'.format(n_trials, tic2 - tic1)) print('tfrms get mean time ({} trials): {} s'.format(n_trials, tic3 - tic2)) - - + + ''' options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) #pylint: disable=E1101 run_metadata = tf.RunMetadata() profile_opts = dict(options=options, run_metadata=run_metadata) - + from tensorflow.python.client import timeline fetched_timeline = timeline.Timeline(run_metadata.step_stats) #pylint: disable=E1101 chrome_trace = fetched_timeline.generate_chrome_trace_format() outfile = '/tmp/timeline.json' - with open(outfile, 'wt') as f: + with open(outfile, 'wt') as f: f.write(chrome_trace) print(f'Successfully saved profile to {outfile}. 
Exiting.') exit(0) @@ -184,4 +184,4 @@ def profile_tf_runningmeanstd(): if __name__ == '__main__': - profile_tf_runningmeanstd() + profile_tf_runningmeanstd() diff --git a/baselines/common/tests/envs/fixed_sequence_env.py b/baselines/common/tests/envs/fixed_sequence_env.py index 9f1b03d..9b538e7 100644 --- a/baselines/common/tests/envs/fixed_sequence_env.py +++ b/baselines/common/tests/envs/fixed_sequence_env.py @@ -40,5 +40,5 @@ class FixedSequenceEnv(Env): def _get_reward(self, actions): return 1 if actions == self.sequence[self.time] else 0 - + diff --git a/baselines/common/tests/envs/mnist_env.py b/baselines/common/tests/envs/mnist_env.py index 563e215..4f73495 100644 --- a/baselines/common/tests/envs/mnist_env.py +++ b/baselines/common/tests/envs/mnist_env.py @@ -15,7 +15,7 @@ class MnistEnv(Env): no_images=None ): from tensorflow.examples.tutorials.mnist import input_data - # we could use temporary directory for this with a context manager and + # we could use temporary directory for this with a context manager and # TemporaryDirecotry, but then each test that uses mnist would re-download the data # this way the data is not cleaned up, but we only download it once per machine mnist_path = osp.join(tempfile.gettempdir(), 'MNIST_data') @@ -33,7 +33,7 @@ class MnistEnv(Env): self.train_mode() self.reset() - + def reset(self): self._choose_next_state() self.time = 0 diff --git a/baselines/common/tests/test_cartpole.py b/baselines/common/tests/test_cartpole.py index fe799a3..0660161 100644 --- a/baselines/common/tests/test_cartpole.py +++ b/baselines/common/tests/test_cartpole.py @@ -10,7 +10,7 @@ common_kwargs = dict( gamma=1.0, seed=0, ) - + learn_kwargs = { 'a2c' : dict(nsteps=32, value_network='copy', lr=0.05), 'acktr': dict(nsteps=32, value_network='copy'), @@ -31,8 +31,8 @@ def test_cartpole(alg): kwargs.update(learn_kwargs[alg]) learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) - def env_fn(): - + def env_fn(): + env = gym.make('CartPole-v0') env.seed(0) return env diff --git a/baselines/common/tests/test_doc_examples.py b/baselines/common/tests/test_doc_examples.py index b2d6e00..240175a 100644 --- a/baselines/common/tests/test_doc_examples.py +++ b/baselines/common/tests/test_doc_examples.py @@ -8,7 +8,7 @@ except BaseException: @pytest.mark.skipif( - not _mujoco_present, + not _mujoco_present, reason='error loading mujoco - either mujoco / mujoco key not present, or LD_LIBRARY_PATH is not pointing to mujoco library' ) def test_lstm_example(): @@ -37,12 +37,12 @@ def test_lstm_example(): action, _, state, _ = policy.step(ob, S=state, M=done) ob, reward, done, _ = venv.step(action) step_counter += 1 - if done: + if done: break - + assert step_counter > 5 - + diff --git a/baselines/common/tests/test_fixed_sequence.py b/baselines/common/tests/test_fixed_sequence.py index f15ce0f..4131a9d 100644 --- a/baselines/common/tests/test_fixed_sequence.py +++ b/baselines/common/tests/test_fixed_sequence.py @@ -8,7 +8,7 @@ common_kwargs = dict( seed=0, total_timesteps=50000, ) - + learn_kwargs = { 'a2c': {}, 'ppo2': dict(nsteps=10, ent_coef=0.0, nminibatches=1), @@ -36,7 +36,7 @@ def test_fixed_sequence(alg, rnn): episode_len = 5 env_fn = lambda: FixedSequenceEnv(10, episode_len=episode_len) learn = lambda e: get_learn_function(alg)( - env=e, + env=e, network=rnn, **kwargs ) @@ -47,5 +47,5 @@ def test_fixed_sequence(alg, rnn): if __name__ == '__main__': test_fixed_sequence('ppo2', 'lstm') - + diff --git a/baselines/common/tests/test_identity.py 
b/baselines/common/tests/test_identity.py index 71d5a3e..e880b11 100644 --- a/baselines/common/tests/test_identity.py +++ b/baselines/common/tests/test_identity.py @@ -9,7 +9,7 @@ common_kwargs = dict( gamma=0.9, seed=0, ) - + learn_kwargs = { 'a2c' : {}, 'acktr': {}, @@ -51,5 +51,5 @@ def test_continuous_identity(alg): simple_test(env_fn, learn_fn, -0.1) if __name__ == '__main__': - test_continuous_identity('a2c') + test_continuous_identity('a2c') diff --git a/baselines/common/tests/test_mnist.py b/baselines/common/tests/test_mnist.py index 5489c3a..536164f 100644 --- a/baselines/common/tests/test_mnist.py +++ b/baselines/common/tests/test_mnist.py @@ -6,7 +6,7 @@ from baselines.common.tests.util import simple_test from baselines.run import get_learn_function -# TODO investigate a2c and ppo2 failures - is it due to bad hyperparameters for this problem? +# TODO investigate a2c and ppo2 failures - is it due to bad hyperparameters for this problem? # GitHub issue https://github.com/openai/baselines/issues/189 common_kwargs = { 'seed': 0, @@ -25,21 +25,21 @@ learn_args = { 'trpo_mpi': dict(total_timesteps=80000, timesteps_per_batch=100, cg_iters=10, lam=1.0, max_kl=0.001) } - -#tests pass, but are too slow on travis. Same algorithms are covered + +#tests pass, but are too slow on travis. Same algorithms are covered # by other tests with less compute-hungry nn's and by benchmarks -@pytest.mark.skip +@pytest.mark.skip @pytest.mark.slow @pytest.mark.parametrize("alg", learn_args.keys()) def test_mnist(alg): ''' - Test if the algorithm can learn to classify MNIST digits. - Uses CNN policy. + Test if the algorithm can learn to classify MNIST digits. + Uses CNN policy. ''' - + learn_kwargs = learn_args[alg] learn_kwargs.update(common_kwargs) - + learn = get_learn_function(alg) learn_fn = lambda e: learn(env=e, **learn_kwargs) env_fn = lambda: MnistEnv(seed=0, episode_len=100) diff --git a/baselines/common/tests/test_serialization.py b/baselines/common/tests/test_serialization.py index ca3d222..4086f2b 100644 --- a/baselines/common/tests/test_serialization.py +++ b/baselines/common/tests/test_serialization.py @@ -14,15 +14,15 @@ from functools import partial learn_kwargs = { 'deepq': {}, - 'a2c': {}, + 'a2c': {}, 'acktr': {}, 'ppo2': {'nminibatches': 1, 'nsteps': 10}, 'trpo_mpi': {}, } network_kwargs = { - 'mlp': {}, - 'cnn': {'pad': 'SAME'}, + 'mlp': {}, + 'cnn': {'pad': 'SAME'}, 'lstm': {}, 'cnn_lnlstm': {'pad': 'SAME'} } @@ -32,15 +32,15 @@ network_kwargs = { @pytest.mark.parametrize("network_fn", network_kwargs.keys()) def test_serialization(learn_fn, network_fn): ''' - Test if the trained model can be serialized + Test if the trained model can be serialized ''' - + if network_fn.endswith('lstm') and learn_fn in ['acktr', 'trpo_mpi', 'deepq']: # TODO make acktr work with recurrent policies # and test # github issue: https://github.com/openai/baselines/issues/194 - return + return env = DummyVecEnv([lambda: MnistEnv(10, episode_len=100)]) ob = env.reset().copy() @@ -74,14 +74,14 @@ def test_serialization(learn_fn, network_fn): np.testing.assert_allclose(mean1, mean2, atol=0.5) np.testing.assert_allclose(std1, std2, atol=0.5) - + def _serialize_variables(): sess = get_session() - variables = tf.trainable_variables() + variables = tf.trainable_variables() values = sess.run(variables) return {var.name: value for var, value in zip(variables, values)} - + def _get_action_stats(model, ob): ntrials = 1000 diff --git a/baselines/common/tests/util.py b/baselines/common/tests/util.py index 30b8954..86a418e 
100644 --- a/baselines/common/tests/util.py +++ b/baselines/common/tests/util.py @@ -30,7 +30,7 @@ def simple_test(env_fn, learn_fn, min_reward_fraction, n_trials=N_TRIALS): a, v, state, _ = model.step(obs, S=state, M=[False]) else: a, v, _, _ = model.step(obs) - + obs, rew, done, _ = env.step(a) sum_rew += float(rew) @@ -46,7 +46,7 @@ def reward_per_episode_test(env_fn, learn_fn, min_avg_reward, n_trials=N_EPISODE with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default(): model = learn_fn(env) - N_TRIALS = 100 + N_TRIALS = 100 observations, actions, rewards = rollout(env, model, N_TRIALS) rewards = [sum(r) for r in rewards] diff --git a/baselines/common/tf_util.py b/baselines/common/tf_util.py index a40b109..b293975 100644 --- a/baselines/common/tf_util.py +++ b/baselines/common/tf_util.py @@ -347,7 +347,7 @@ def load_variables(load_path, variables=None, sess=None): variables = variables or tf.trainable_variables() loaded_params = joblib.load(os.path.expanduser(load_path)) - restores = [] + restores = [] if isinstance(loaded_params, list): assert len(loaded_params) == len(variables), 'number of variables loaded mismatches len(variables)' for d, v in zip(loaded_params, variables): diff --git a/baselines/common/vec_env/dummy_vec_env.py b/baselines/common/vec_env/dummy_vec_env.py index 9c3858e..265308c 100644 --- a/baselines/common/vec_env/dummy_vec_env.py +++ b/baselines/common/vec_env/dummy_vec_env.py @@ -9,8 +9,8 @@ class DummyVecEnv(VecEnv): env = self.envs[0] VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space) obs_space = env.observation_space - - self.keys, shapes, dtypes = obs_space_info(obs_space) + + self.keys, shapes, dtypes = obs_space_info(obs_space) self.buf_obs = { k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys } self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool) self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32) @@ -62,7 +62,7 @@ class DummyVecEnv(VecEnv): def get_images(self): return [env.render(mode='rgb_array') for env in self.envs] - + def render(self, mode='human'): if self.num_envs == 1: self.envs[0].render(mode=mode) diff --git a/baselines/ddpg/memory.py b/baselines/ddpg/memory.py index 90f0f9a..781fa71 100644 --- a/baselines/ddpg/memory.py +++ b/baselines/ddpg/memory.py @@ -71,7 +71,7 @@ class Memory(object): def append(self, obs0, action, reward, obs1, terminal1, training=True): if not training: return - + self.observations0.append(obs0) self.actions.append(action) self.rewards.append(reward) diff --git a/baselines/ddpg/models.py b/baselines/ddpg/models.py index dc5803a..3cd9543 100644 --- a/baselines/ddpg/models.py +++ b/baselines/ddpg/models.py @@ -35,12 +35,12 @@ class Actor(Model): if self.layer_norm: x = tc.layers.layer_norm(x, center=True, scale=True) x = tf.nn.relu(x) - + x = tf.layers.dense(x, 64) if self.layer_norm: x = tc.layers.layer_norm(x, center=True, scale=True) x = tf.nn.relu(x) - + x = tf.layers.dense(x, self.nb_actions, kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3)) x = tf.nn.tanh(x) return x diff --git a/baselines/deepq/deepq.py b/baselines/deepq/deepq.py index 5a4b2e7..47fe19a 100644 --- a/baselines/deepq/deepq.py +++ b/baselines/deepq/deepq.py @@ -176,7 +176,7 @@ def learn(env, load_path: str path to load the model from. (default: None) **network_kwargs - additional keyword arguments to pass to the network builder. + additional keyword arguments to pass to the network builder. 
Returns ------- @@ -215,7 +215,7 @@ def learn(env, } act = ActWrapper(act, act_params) - + # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) @@ -246,7 +246,7 @@ def learn(env, model_file = os.path.join(td, "model") model_saved = False - + if tf.train.latest_checkpoint(td) is not None: load_variables(model_file) logger.log('Loaded model from {}'.format(model_file)) @@ -254,7 +254,7 @@ def learn(env, elif load_path is not None: load_variables(load_path) logger.log('Loaded model from {}'.format(load_path)) - + for t in range(total_timesteps): if callback is not None: diff --git a/baselines/deepq/experiments/enjoy_mountaincar.py b/baselines/deepq/experiments/enjoy_mountaincar.py index 8b1089e..2998bb6 100644 --- a/baselines/deepq/experiments/enjoy_mountaincar.py +++ b/baselines/deepq/experiments/enjoy_mountaincar.py @@ -7,7 +7,7 @@ from baselines.common import models def main(): env = gym.make("MountainCar-v0") act = deepq.learn( - env, + env, network=models.mlp(num_layers=1, num_hidden=64), total_timesteps=0, load_path='mountaincar_model.pkl' diff --git a/baselines/deepq/experiments/train_pong.py b/baselines/deepq/experiments/train_pong.py index a8febb9..1483f12 100644 --- a/baselines/deepq/experiments/train_pong.py +++ b/baselines/deepq/experiments/train_pong.py @@ -29,7 +29,7 @@ def main(): model.save('pong_model.pkl') env.close() - + if __name__ == '__main__': diff --git a/baselines/deepq/models.py b/baselines/deepq/models.py index c41b707..e35b1f8 100644 --- a/baselines/deepq/models.py +++ b/baselines/deepq/models.py @@ -94,8 +94,8 @@ def cnn_to_mlp(convs, hiddens, dueling=False, layer_norm=False): def build_q_func(network, hiddens=[256], dueling=True, layer_norm=False, **network_kwargs): if isinstance(network, str): from baselines.common.models import get_network_builder - network = get_network_builder(network)(**network_kwargs) - + network = get_network_builder(network)(**network_kwargs) + def q_func_builder(input_placeholder, num_actions, scope, reuse=False): with tf.variable_scope(scope, reuse=reuse): latent, _ = network(input_placeholder) @@ -125,5 +125,5 @@ def build_q_func(network, hiddens=[256], dueling=True, layer_norm=False, **netwo else: q_out = action_scores return q_out - + return q_func_builder diff --git a/baselines/deepq/utils.py b/baselines/deepq/utils.py index 2914f43..4dae7a6 100644 --- a/baselines/deepq/utils.py +++ b/baselines/deepq/utils.py @@ -66,13 +66,13 @@ class Uint8Input(PlaceholderTfInput): class ObservationInput(PlaceholderTfInput): def __init__(self, observation_space, name=None): """Creates an input placeholder tailored to a specific observation space - + Parameters ---------- - observation_space: + observation_space: observation space of the environment. 
Should be one of the gym.spaces types - name: str + name: str tensorflow name of the underlying placeholder """ inpt, self.processed_inpt = observation_input(observation_space, name=name) @@ -80,5 +80,5 @@ class ObservationInput(PlaceholderTfInput): def get(self): return self.processed_inpt - - + + diff --git a/baselines/her/experiment/play.py b/baselines/her/experiment/play.py index 5b2f85d..a6f94e9 100644 --- a/baselines/her/experiment/play.py +++ b/baselines/her/experiment/play.py @@ -41,7 +41,7 @@ def main(policy_file, seed, n_test_rollouts, render): for name in ['T', 'gamma', 'noise_eps', 'random_eps']: eval_params[name] = params[name] - + evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params) evaluator.seed(seed) diff --git a/baselines/her/experiment/plot.py b/baselines/her/experiment/plot.py index 560903f..a14872d 100644 --- a/baselines/her/experiment/plot.py +++ b/baselines/her/experiment/plot.py @@ -37,12 +37,12 @@ def load_results(file): def pad(xs, value=np.nan): maxlen = np.max([len(x) for x in xs]) - + padded_xs = [] for x in xs: if x.shape[0] >= maxlen: padded_xs.append(x) - + padding = np.ones((maxlen - x.shape[0],) + x.shape[1:]) * value x_padded = np.concatenate([x, padding], axis=0) assert x_padded.shape[1:] == x.shape[1:] diff --git a/baselines/ppo1/run_humanoid.py b/baselines/ppo1/run_humanoid.py index d7d8f5a..17b42b5 100644 --- a/baselines/ppo1/run_humanoid.py +++ b/baselines/ppo1/run_humanoid.py @@ -23,17 +23,17 @@ def train(num_timesteps, seed, model_path=None): max_timesteps=num_timesteps, timesteps_per_actorbatch=2048, clip_param=0.2, entcoeff=0.0, - optim_epochs=10, - optim_stepsize=3e-4, - optim_batchsize=64, - gamma=0.99, + optim_epochs=10, + optim_stepsize=3e-4, + optim_batchsize=64, + gamma=0.99, lam=0.95, schedule='linear', ) env.close() if model_path: U.save_state(model_path) - + return pi class RewScale(gym.RewardWrapper): @@ -48,28 +48,28 @@ def main(): parser = mujoco_arg_parser() parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'humanoid_policy')) parser.set_defaults(num_timesteps=int(2e7)) - + args = parser.parse_args() - + if not args.play: # train the model train(num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path) - else: + else: # construct the model object, load pre-trained model and render pi = train(num_timesteps=1, seed=args.seed) U.load_state(args.model_path) env = make_mujoco_env('Humanoid-v2', seed=0) - ob = env.reset() + ob = env.reset() while True: action = pi.act(stochastic=False, ob=ob)[0] ob, _, done, _ = env.step(action) env.render() if done: ob = env.reset() - - - + + + if __name__ == '__main__': main() diff --git a/baselines/ppo2/ppo2.py b/baselines/ppo2/ppo2.py index d118a72..0ceee8e 100644 --- a/baselines/ppo2/ppo2.py +++ b/baselines/ppo2/ppo2.py @@ -155,20 +155,20 @@ def learn(*, network, env, total_timesteps, seed=None, nsteps=2048, ent_coef=0.0 save_interval=0, load_path=None, **network_kwargs): ''' Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347) - + Parameters: ---------- network: policy network architecture. 
Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) - specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns + specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See common/models.py/lstm for more details on using recurrent nets in policies - env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation. + env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. - + nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) @@ -176,38 +176,38 @@ def learn(*, network, env, total_timesteps, seed=None, nsteps=2048, ent_coef=0.0 ent_coef: float policy entropy coefficient in the optimization objective - lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the + lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training. vf_coef: float value function loss coefficient in the optimization objective max_grad_norm: float or None gradient norm clipping coefficient - + gamma: float discounting factor lam: float advantage estimation discounting factor (lambda in the paper) log_interval: int number of timesteps between logging events - nminibatches: int number of training minibatches per update. For recurrent policies, - should be smaller or equal than number of environments run in parallel. + nminibatches: int number of training minibatches per update. For recurrent policies, + should be smaller or equal than number of environments run in parallel. noptepochs: int number of training epochs per update - cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training - and 0 is the end of the training + cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training + and 0 is the end of the training save_interval: int number of timesteps between saving events load_path: str path to load the model from **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network - For instance, 'mlp' network architecture has arguments num_hidden and num_layers. + For instance, 'mlp' network architecture has arguments num_hidden and num_layers. 
+ - ''' - + set_global_seeds(seed) if isinstance(lr, float): lr = constfn(lr) diff --git a/baselines/results_plotter.py b/baselines/results_plotter.py index 0514204..123f850 100644 --- a/baselines/results_plotter.py +++ b/baselines/results_plotter.py @@ -84,4 +84,4 @@ def main(): plt.show() if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/baselines/run.py b/baselines/run.py index cf65099..3ac3d81 100644 --- a/baselines/run.py +++ b/baselines/run.py @@ -120,7 +120,7 @@ def build_env(args): env = bench.Monitor(env, logger.get_dir()) env = retro_wrappers.wrap_deepmind_retro(env) - else: + else: get_session(tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)) @@ -128,7 +128,7 @@ def build_env(args): env = make_vec_env(env_id, env_type, args.num_env or 1, seed, reward_scale=args.reward_scale) if env_type == 'mujoco': - env = VecNormalize(env) + env = VecNormalize(env) return env diff --git a/baselines/trpo_mpi/defaults.py b/baselines/trpo_mpi/defaults.py index 0b58d18..2ac3f7a 100644 --- a/baselines/trpo_mpi/defaults.py +++ b/baselines/trpo_mpi/defaults.py @@ -4,7 +4,7 @@ from baselines.common.models import mlp, cnn_small def atari(): return dict( network = cnn_small(), - timesteps_per_batch=512, + timesteps_per_batch=512, max_kl=0.001, cg_iters=10, cg_damping=1e-3, @@ -26,5 +26,5 @@ def mujoco(): lam=0.98, vf_iters=5, vf_stepsize=1e-3, - normalize_observations=True, + normalize_observations=True, ) diff --git a/baselines/trpo_mpi/trpo_mpi.py b/baselines/trpo_mpi/trpo_mpi.py index d84b0fc..2e49ab6 100644 --- a/baselines/trpo_mpi/trpo_mpi.py +++ b/baselines/trpo_mpi/trpo_mpi.py @@ -83,13 +83,13 @@ def add_vtarg_and_adv(seg, gamma, lam): seg["tdlamret"] = seg["adv"] + seg["vpred"] def learn(*, - network, + network, env, - total_timesteps, + total_timesteps, timesteps_per_batch=1024, # what to train on - max_kl=0.001, - cg_iters=10, - gamma=0.99, + max_kl=0.001, + cg_iters=10, + gamma=0.99, lam=1.0, # advantage estimation seed=None, entcoeff=0.0, @@ -103,7 +103,7 @@ def learn(*, ): ''' learn a policy function with TRPO algorithm - + Parameters: ---------- @@ -121,7 +121,7 @@ def learn(*, cg_iters number of iterations of conjugate gradient algorithm - cg_damping conjugate gradient damping + cg_damping conjugate gradient damping vf_stepsize learning rate for adam optimizer used to optimie value function loss @@ -130,11 +130,11 @@ def learn(*, total_timesteps max number of timesteps max_episodes max number of episodes - + max_iters maximum number of policy optimization iterations callback function to be called with (locals(), globals()) each policy optimization step - + load_path str, path to load the model from (default: None, i.e. no model is loaded) **network_kwargs keyword arguments to the policy / network builder. 
See baselines.common/policies.py/build_policy and arguments to a particular type of network @@ -145,18 +145,18 @@ def learn(*, learnt model ''' - - + + nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() cpus_per_worker = 1 U.get_session(config=tf.ConfigProto( - allow_soft_placement=True, + allow_soft_placement=True, inter_op_parallelism_threads=cpus_per_worker, intra_op_parallelism_threads=cpus_per_worker )) - + policy = build_policy(env, network, value_network='copy', **network_kwargs) set_global_seeds(seed) @@ -245,7 +245,7 @@ def learn(*, U.initialize() if load_path is not None: pi.load(load_path) - + th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) @@ -384,8 +384,8 @@ def get_trainable_variables(scope): return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope) def get_vf_trainable_variables(scope): - return [v for v in get_trainable_variables(scope) if 'vf' in v.name[len(scope):].split('/')] + return [v for v in get_trainable_variables(scope) if 'vf' in v.name[len(scope):].split('/')] def get_pi_trainable_variables(scope): - return [v for v in get_trainable_variables(scope) if 'pi' in v.name[len(scope):].split('/')] + return [v for v in get_trainable_variables(scope) if 'pi' in v.name[len(scope):].split('/')] diff --git a/setup.cfg b/setup.cfg index 2ca999f..0b5d28a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [flake8] -select = F,E999 +select = F,E999,W291,W293 exclude = .git, __pycache__, diff --git a/setup.py b/setup.py index d4a00c1..a9648fa 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ if sys.version_info.major != 3: extras = { 'test': [ - 'filelock', + 'filelock', 'pytest' ] }