tighten flake8, autopep8 to fix trailing whitespaces and blank lines with whitespaces (#87)
@@ -97,21 +97,21 @@ def learn(
    load_path=None,
    **network_kwargs):

    '''
    Main entrypoint for A2C algorithm. Train a policy with given network architecture on a given environment using a2c algorithm.

    Parameters:
    -----------

    network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
        specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
        tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
        neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
        See baselines.common/policies.py/lstm for more details on using recurrent nets in policies

    env: RL environment. Should implement interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py)

    seed: seed to make random number sequence in the algorithm reproducible. By default is None which means seed from system noise generator (not reproducible)

@@ -128,7 +128,7 @@ def learn(
    lr: float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4)

    lrschedule: schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and
        returns fraction of the learning rate (specified as lr) as output

    epsilon: float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5)

@@ -140,17 +140,17 @@ def learn(
    log_interval: int, specifies how frequently the logs are printed out (default: 100)

    **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
        For instance, 'mlp' network architecture has arguments num_hidden and num_layers.

    '''

    set_global_seeds(seed)

    nenvs = env.num_envs
    policy = build_policy(env, network, **network_kwargs)

    model = Model(policy=policy, env=env, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
        max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule)
    if load_path is not None:
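As an illustrative aside (not part of the diff itself), a minimal sketch of calling this documented entrypoint on a vectorized CartPole environment follows; the environment id, timestep budget, and hyperparameter values are placeholders chosen only for illustration.

    # Hedged sketch: train A2C on a single CartPole copy wrapped in DummyVecEnv.
    import gym
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
    from baselines.a2c import a2c

    venv = DummyVecEnv([lambda: gym.make('CartPole-v0')])   # vectorized env, as the docstring requires
    model = a2c.learn(
        network='mlp',            # one of the documented string architectures
        env=venv,
        seed=0,
        nsteps=5,
        total_timesteps=20000,    # illustrative, far below the 80M default
        lr=7e-4,
        lrschedule='linear',      # documented schedule name
        log_interval=100,
    )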
@@ -9,7 +9,7 @@ class Runner(AbstractEnvRunner):
        self.gamma = gamma
        self.batch_action_shape = [x if x is not None else -1 for x in model.train_model.action.shape.as_list()]
        self.ob_dtype = model.train_model.X.dtype.as_numpy_dtype

    def run(self):
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
        mb_states = self.states

@@ -51,7 +51,7 @@ class Runner(AbstractEnvRunner):
                rewards = discount_with_dones(rewards, dones, self.gamma)

            mb_rewards[n] = rewards

        mb_actions = mb_actions.reshape(self.batch_action_shape)

        mb_rewards = mb_rewards.flatten()
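For context on the return computation used in this runner, the sketch below mirrors what a discount-with-dones helper computes (a NumPy-free re-statement for illustration, not the library source): rewards are discounted backwards in time and the running return is reset whenever an episode terminates.

    # Hedged sketch of discounting rewards with done flags.
    def discount_with_dones(rewards, dones, gamma):
        discounted, r = [], 0.0
        for reward, done in zip(rewards[::-1], dones[::-1]):
            r = reward + gamma * r * (1.0 - done)   # done=True cuts the bootstrap
            discounted.append(r)
        return discounted[::-1]

    print(discount_with_dones([1, 1, 1], [False, True, False], gamma=0.9))
    # -> [1.9, 1.0, 1.0]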
@@ -70,7 +70,7 @@ class Model(object):
        MU = tf.placeholder(tf.float32, [nbatch, nact]) # mu's
        LR = tf.placeholder(tf.float32, [])
        eps = 1e-6

        step_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs,) + ob_space.shape[:-1] + (ob_space.shape[-1] * nstack,))
        train_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs*(nsteps+1),) + ob_space.shape[:-1] + (ob_space.shape[-1] * nstack,))
        with tf.variable_scope('acer_model', reuse=tf.AUTO_REUSE):

@@ -78,7 +78,7 @@ class Model(object):
            step_model = policy(observ_placeholder=step_ob_placeholder, sess=sess)
            train_model = policy(observ_placeholder=train_ob_placeholder, sess=sess)

        params = find_trainable_variables("acer_model")
        print("Params {}".format(len(params)))
        for var in params:

@@ -97,10 +97,10 @@ class Model(object):
                polyak_model = policy(observ_placeholder=train_ob_placeholder, sess=sess)

        # Notation: (var) = batch variable, (var)s = sequence variable, (var)_i = variable indexed by action at step i

        # action probability distributions according to train_model, polyak_model and step_model
        # policy.pi is probability distribution parameters; to obtain distribution that sums to 1 need to take softmax
        train_model_p = tf.nn.softmax(train_model.pi)
        polyak_model_p = tf.nn.softmax(polyak_model.pi)
        step_model_p = tf.nn.softmax(step_model.pi)
        v = tf.reduce_sum(train_model_p * train_model.q, axis = -1) # shape is [nenvs * (nsteps + 1)]

@@ -119,7 +119,7 @@ class Model(object):
        qret = q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma)

        # Calculate losses
        # Entropy
        # entropy = tf.reduce_mean(strip(train_model.pd.entropy(), nenvs, nsteps))
        entropy = tf.reduce_mean(cat_entropy_softmax(f))

@@ -212,8 +212,8 @@ class Model(object):
        def _step(observation, **kwargs):
            return step_model._evaluate([step_model.action, step_model_p, step_model.state], observation, **kwargs)

        self.train = train
        self.save = functools.partial(save_variables, sess=sess, variables=params)
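As an illustrative aside, the probability and entropy terms above amount to a softmax over the policy logits followed by the categorical entropy -sum(p * log p); the NumPy sketch below is a stand-in for the TensorFlow ops (the logit values are arbitrary, and the 1e-6 constant mirrors the eps used in this hunk).

    # Hedged sketch: softmax of logits and per-row categorical entropy.
    import numpy as np

    def softmax(logits):
        z = logits - logits.max(axis=-1, keepdims=True)   # subtract max for numerical stability
        e = np.exp(z)
        return e / e.sum(axis=-1, keepdims=True)

    logits = np.array([[2.0, 1.0, 0.1]])
    p = softmax(logits)
    entropy = -(p * np.log(p + 1e-6)).sum(axis=-1)
    print(p.round(3), entropy.round(3))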
@@ -283,18 +283,18 @@ def learn(network, env, seed=None, nsteps=20, nstack=4, total_timesteps=int(80e6
    ----------

    network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
        specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
        tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
        neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
        See baselines.common/policies.py/lstm for more details on using recurrent nets in policies

    env: environment. Needs to be vectorized for parallel environment simulation.
        The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.

    nsteps: int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
        nenv is number of environment copies simulated in parallel) (default: 20)

    nstack: int, size of the frame stack, i.e. number of the frames passed to the step model. Frames are stacked along channel dimension
        (last image dimension) (default: 4)

    total_timesteps: int, number of timesteps (i.e. number of actions taken in the environment) (default: 80M)

@@ -303,11 +303,11 @@ def learn(network, env, seed=None, nsteps=20, nstack=4, total_timesteps=int(80e6
    ent_coef: float, policy entropy coefficient in the optimization objective (default: 0.01)

    max_grad_norm: float, gradient norm clipping coefficient. If set to None, no clipping. (default: 10)

    lr: float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4)

    lrschedule: schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and
        returns fraction of the learning rate (specified as lr) as output

    rprop_epsilon: float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5)

@@ -325,17 +325,17 @@ def learn(network, env, seed=None, nsteps=20, nstack=4, total_timesteps=int(80e6
    replay_start: int, the sampling from the replay buffer does not start until replay buffer has at least that many samples (default: 10k)

    c: float, importance weight clipping factor (default: 10)

    trust_region: bool, whether or not the algorithm estimates the gradient KL divergence between the old and updated policy and uses it to determine step size (default: True)

    delta: float, max KL divergence between the old policy and updated policy (default: 1)

    alpha: float, momentum factor in the Polyak (exponential moving average) averaging of the model parameters (default: 0.99)

    load_path: str, path to load the model from (default: None)

    **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
        For instance, 'mlp' network architecture has arguments num_hidden and num_layers.

    '''
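For the alpha parameter documented above, the sketch below spells out the Polyak (exponential moving average) update it controls; this is a NumPy illustration of the general technique, not the library's TensorFlow implementation.

    # Hedged sketch: exponential moving average of model parameters.
    import numpy as np

    def polyak_update(avg_params, new_params, alpha=0.99):
        # avg <- alpha * avg + (1 - alpha) * new, applied parameter-by-parameter
        return [alpha * a + (1.0 - alpha) * p for a, p in zip(avg_params, new_params)]

    avg = [np.zeros(3)]
    for step in range(5):
        current = [np.ones(3) * (step + 1)]
        avg = polyak_update(avg, current, alpha=0.9)
    print(avg[0])   # the average slowly tracks the current parameters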
@@ -1,2 +1,2 @@
from baselines.bench.benchmarks import *
from baselines.bench.monitor import *

@@ -102,7 +102,7 @@ def get_monitor_files(dir):
def load_results(dir):
    import pandas
    monitor_files = (
        glob(osp.join(dir, "*monitor.json")) +
        glob(osp.join(dir, "*monitor.csv"))) # get both csv and (old) json files
    if not monitor_files:
        raise LoadMonitorResultsError("no monitor files of the form *%s found in %s" % (Monitor.EXT, dir))
@@ -31,4 +31,4 @@ def cg(f_Ax, b, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10):
        if callback is not None:
            callback(x)
    if verbose: print(fmtstr % (i+1, rdotr, np.linalg.norm(x))) # pylint: disable=W0631
    return x
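As an aside, the signature in this hunk header takes a matrix-vector product callback rather than an explicit matrix; a minimal usage sketch follows, assuming the helper lives at baselines.common.cg and using a small symmetric positive-definite system chosen for illustration.

    # Hedged sketch: solve A x = b with the conjugate gradient helper.
    import numpy as np
    from baselines.common.cg import cg   # assumed module path

    A = np.array([[4.0, 1.0], [1.0, 3.0]])
    b = np.array([1.0, 2.0])
    x = cg(lambda v: A.dot(v), b, cg_iters=10, verbose=True)
    print(x, np.linalg.solve(A, b))   # the two solutions should agree closely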
@@ -29,7 +29,7 @@ def make_vec_env(env_id, env_type, num_env, seed, wrapper_kwargs=None, start_ind
    def _thunk():
        env = make_atari(env_id) if env_type == 'atari' else gym.make(env_id)
        env.seed(seed + 10000*mpi_rank + rank if seed is not None else None)
        env = Monitor(env,
                      logger.get_dir() and os.path.join(logger.get_dir(), str(mpi_rank) + '.' + str(rank)),
                      allow_early_resets=True)

@@ -2,7 +2,7 @@ from __future__ import print_function
from contextlib import contextmanager
import numpy as np
import time
import shlex
import subprocess

# ================================================================
@@ -2,15 +2,15 @@ import tensorflow as tf
from gym.spaces import Discrete, Box

def observation_placeholder(ob_space, batch_size=None, name='Ob'):
    '''
    Create placeholder to feed observations into of the size appropriate to the observation space

    Parameters:
    ----------

    ob_space: gym.Space observation space

    batch_size: int size of the batch to be fed into input. Can be left None in most cases.

    name: str name of the placeholder

@@ -27,9 +27,9 @@ def observation_placeholder(ob_space, batch_size=None, name='Ob'):

def observation_input(ob_space, batch_size=None, name='Ob'):
    '''
    Create placeholder to feed observations into of the size appropriate to the observation space, and add input
    encoder of the appropriate type.
    '''

    placeholder = observation_placeholder(ob_space, batch_size, name)

@@ -41,9 +41,9 @@ def encode_observation(ob_space, placeholder):

    Parameters:
    ----------

    ob_space: gym.Space observation space

    placeholder: tf.placeholder observation input placeholder
    '''
    if isinstance(ob_space, Discrete):
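A brief usage sketch of the two helpers documented above follows (TF1-style graph code, matching the rest of this file); the module path baselines.common.input and the CartPole environment are assumptions made for illustration.

    # Hedged sketch: build observation placeholders for a gym Box space.
    import gym
    import tensorflow as tf
    from baselines.common.input import observation_placeholder, observation_input  # assumed path

    env = gym.make('CartPole-v0')
    ob_ph = observation_placeholder(env.observation_space)        # placeholder of shape (None, 4)
    inpt, processed = observation_input(env.observation_space)    # placeholder plus encoded float tensor
    print(ob_ph.shape, processed.dtype)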
@@ -82,4 +82,4 @@ def test_discount_with_boundaries():
        2 + gamma * 3,
        3,
        4
    ])

@@ -76,4 +76,4 @@ def test_MpiAdam():
    for i in range(10):
        l,g = lossandgrad()
        adam.update(g, stepsize)
        print(i,l)
@@ -4,7 +4,7 @@ def mpi_fork(n, bind_to_core=False):
    """Re-launches the current script with workers
    Returns "parent" for original parent, "child" for MPI children
    """
    if n<=1:
        return "child"
    if os.getenv("IN_MPI") is None:
        env = os.environ.copy()

@@ -33,8 +33,8 @@ def mpi_moments(x, axis=0, comm=None, keepdims=False):

def test_runningmeanstd():
    import subprocess
    subprocess.check_call(['mpirun', '-np', '3',
        'python','-c',
        'from baselines.common.mpi_moments import _helper_runningmeanstd; _helper_runningmeanstd()'])

def _helper_runningmeanstd():
@@ -32,7 +32,7 @@ class PolicyWithValue(object):
        **tensors tensorflow tensors for additional attributes such as state or mask
        """

        self.X = observations
        self.state = tf.constant([])
        self.initial_state = None

@@ -85,7 +85,7 @@ class PolicyWithValue(object):
        -------
        (action, value estimate, next state, negative log likelihood of the action under current policy parameters) tuple
        """

        a, v, state, neglogp = self._evaluate([self.action, self.vf, self.state, self.neglogp], observation, **extra_feed)
        if state.size == 0:
            state = None

@@ -106,14 +106,14 @@ class PolicyWithValue(object):
        -------
        value estimate
        """
        return self._evaluate(self.vf, ob, *args, **kwargs)

    def save(self, save_path):
        tf_util.save_state(save_path, sess=self.sess)

    def load(self, load_path):
        tf_util.load_state(load_path, sess=self.sess)

def build_policy(env, policy_network, value_network=None, normalize_observations=False, estimate_q=False, **policy_kwargs):
    if isinstance(policy_network, str):
        network_type = policy_network

@@ -123,7 +123,7 @@ def build_policy(env, policy_network, value_network=None, normalize_observation
        ob_space = env.observation_space

        X = observ_placeholder if observ_placeholder is not None else observation_placeholder(ob_space, batch_size=nbatch)

        extra_tensors = {}

        if normalize_observations and X.dtype == tf.float32:

@@ -144,7 +144,7 @@ def build_policy(env, policy_network, value_network=None, normalize_observation
                    policy_latent, recurrent_tensors = policy_network(encoded_x, nenv)
                    extra_tensors.update(recurrent_tensors)

        _v_net = value_network

        if _v_net is None or _v_net == 'shared':

@@ -154,10 +154,10 @@ def build_policy(env, policy_network, value_network=None, normalize_observation
                _v_net = policy_network
            else:
                assert callable(_v_net)

            with tf.variable_scope('vf', reuse=tf.AUTO_REUSE):
                vf_latent, _ = _v_net(encoded_x)

        policy = PolicyWithValue(
            env=env,
            observations=X,

@@ -176,4 +176,4 @@ def _normalize_clip_observation(x, clip_range=[-5.0, 5.0]):
    rms = RunningMeanStd(shape=x.shape[1:])
    norm_x = tf.clip_by_value((x - rms.mean) / rms.std, min(clip_range), max(clip_range))
    return norm_x, rms
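As an illustrative aside, the sketch below builds a policy with build_policy and queries it via step(), which returns the (action, value, next state, neglogp) tuple documented above; the signature assumed for the returned policy_fn (nbatch, nsteps, sess) and the initialization call are assumptions for this sketch.

    # Hedged sketch: construct and query a PolicyWithValue for a vectorized env.
    import gym
    import tensorflow as tf
    from baselines.common.policies import build_policy
    from baselines.common.tf_util import get_session
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    venv = DummyVecEnv([lambda: gym.make('CartPole-v0')])
    sess = get_session()

    policy_fn = build_policy(venv, 'mlp', num_hidden=64, num_layers=2)
    pi = policy_fn(nbatch=1, nsteps=1, sess=sess)       # assumed policy_fn signature
    sess.run(tf.global_variables_initializer())

    obs = venv.reset()
    a, v, state, neglogp = pi.step(obs)                 # tuple per the docstring above
    print(a, v, neglogp)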
@@ -23,15 +23,15 @@ def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var,
    delta = batch_mean - mean
    tot_count = count + batch_count

    new_mean = mean + delta * batch_count / tot_count
    m_a = var * count
    m_b = batch_var * batch_count
    M2 = m_a + m_b + np.square(delta) * count * batch_count / (count + batch_count)
    new_var = M2 / (count + batch_count)
    new_count = batch_count + count

    return new_mean, new_var, new_count

class TfRunningMeanStd(object):
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm

@@ -46,10 +46,10 @@ class TfRunningMeanStd(object):
        self._new_var = tf.placeholder(shape=shape, dtype=tf.float64)
        self._new_count = tf.placeholder(shape=(), dtype=tf.float64)

        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
            self._mean = tf.get_variable('mean', initializer=np.zeros(shape, 'float64'), dtype=tf.float64)
            self._var = tf.get_variable('std', initializer=np.ones(shape, 'float64'), dtype=tf.float64)
            self._count = tf.get_variable('count', initializer=np.full((), epsilon, 'float64'), dtype=tf.float64)

        self.update_ops = tf.group([

@@ -61,10 +61,10 @@ class TfRunningMeanStd(object):
        sess.run(tf.variables_initializer([self._mean, self._var, self._count]))
        self.sess = sess
        self._set_mean_var_count()

    def _set_mean_var_count(self):
        self.mean, self.var, self.count = self.sess.run([self._mean, self._var, self._count])

    def update(self, x):
        batch_mean = np.mean(x, axis=0)
        batch_var = np.var(x, axis=0)

@@ -74,13 +74,13 @@ class TfRunningMeanStd(object):
        self.sess.run(self.update_ops, feed_dict={
            self._new_mean: new_mean,
            self._new_var: new_var,
            self._new_count: new_count
        })

        self._set_mean_var_count()

def test_runningmeanstd():
    for (x1, x2, x3) in [

@@ -145,7 +145,7 @@ def profile_tf_runningmeanstd():
    print('rms update time ({} trials): {} s'.format(n_trials, tic2 - tic1))
    print('tfrms update time ({} trials): {} s'.format(n_trials, tic3 - tic2))

    tic1 = time.time()
    for _ in range(n_trials):

@@ -161,21 +161,21 @@ def profile_tf_runningmeanstd():
    print('rms get mean time ({} trials): {} s'.format(n_trials, tic2 - tic1))
    print('tfrms get mean time ({} trials): {} s'.format(n_trials, tic3 - tic2))

    '''
    options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) #pylint: disable=E1101
    run_metadata = tf.RunMetadata()
    profile_opts = dict(options=options, run_metadata=run_metadata)

    from tensorflow.python.client import timeline
    fetched_timeline = timeline.Timeline(run_metadata.step_stats) #pylint: disable=E1101
    chrome_trace = fetched_timeline.generate_chrome_trace_format()
    outfile = '/tmp/timeline.json'
    with open(outfile, 'wt') as f:
        f.write(chrome_trace)
    print(f'Successfully saved profile to {outfile}. Exiting.')
    exit(0)

@@ -184,4 +184,4 @@ def profile_tf_runningmeanstd():

if __name__ == '__main__':
    profile_tf_runningmeanstd()
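The merge formula at the top of this file is the parallel-variance update referenced by the Wikipedia link in the hunk; the sketch below reproduces it from the lines shown here and checks it against direct NumPy statistics on the concatenated data (random data chosen only for illustration).

    # Hedged sketch: verify the running mean/var merge against numpy.
    import numpy as np

    def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var, batch_count):
        delta = batch_mean - mean
        tot_count = count + batch_count
        new_mean = mean + delta * batch_count / tot_count
        m_a = var * count
        m_b = batch_var * batch_count
        M2 = m_a + m_b + np.square(delta) * count * batch_count / tot_count
        new_var = M2 / tot_count
        return new_mean, new_var, tot_count

    x1, x2 = np.random.randn(100, 3), np.random.randn(50, 3) + 1.0
    mean, var, count = update_mean_var_count_from_moments(
        x1.mean(axis=0), x1.var(axis=0), len(x1),
        x2.mean(axis=0), x2.var(axis=0), len(x2))
    x = np.concatenate([x1, x2])
    assert np.allclose(mean, x.mean(axis=0)) and np.allclose(var, x.var(axis=0))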
@@ -40,5 +40,5 @@ class FixedSequenceEnv(Env):

    def _get_reward(self, actions):
        return 1 if actions == self.sequence[self.time] else 0

@@ -15,7 +15,7 @@ class MnistEnv(Env):
            no_images=None
    ):
        from tensorflow.examples.tutorials.mnist import input_data
        # we could use temporary directory for this with a context manager and
        # TemporaryDirectory, but then each test that uses mnist would re-download the data
        # this way the data is not cleaned up, but we only download it once per machine
        mnist_path = osp.join(tempfile.gettempdir(), 'MNIST_data')

@@ -33,7 +33,7 @@ class MnistEnv(Env):
        self.train_mode()
        self.reset()

    def reset(self):
        self._choose_next_state()
        self.time = 0
@@ -10,7 +10,7 @@ common_kwargs = dict(
    gamma=1.0,
    seed=0,
)

learn_kwargs = {
    'a2c' : dict(nsteps=32, value_network='copy', lr=0.05),
    'acktr': dict(nsteps=32, value_network='copy'),

@@ -31,8 +31,8 @@ def test_cartpole(alg):
    kwargs.update(learn_kwargs[alg])

    learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs)
    def env_fn():

        env = gym.make('CartPole-v0')
        env.seed(0)
        return env
@@ -8,7 +8,7 @@ except BaseException:

@pytest.mark.skipif(
    not _mujoco_present,
    reason='error loading mujoco - either mujoco / mujoco key not present, or LD_LIBRARY_PATH is not pointing to mujoco library'
)
def test_lstm_example():

@@ -37,12 +37,12 @@ def test_lstm_example():
        action, _, state, _ = policy.step(ob, S=state, M=done)
        ob, reward, done, _ = venv.step(action)
        step_counter += 1
        if done:
            break

    assert step_counter > 5
@@ -8,7 +8,7 @@ common_kwargs = dict(
    seed=0,
    total_timesteps=50000,
)

learn_kwargs = {
    'a2c': {},
    'ppo2': dict(nsteps=10, ent_coef=0.0, nminibatches=1),

@@ -36,7 +36,7 @@ def test_fixed_sequence(alg, rnn):
    episode_len = 5
    env_fn = lambda: FixedSequenceEnv(10, episode_len=episode_len)
    learn = lambda e: get_learn_function(alg)(
        env=e,
        network=rnn,
        **kwargs
    )

@@ -47,5 +47,5 @@ def test_fixed_sequence(alg, rnn):
if __name__ == '__main__':
    test_fixed_sequence('ppo2', 'lstm')
@@ -9,7 +9,7 @@ common_kwargs = dict(
    gamma=0.9,
    seed=0,
)

learn_kwargs = {
    'a2c' : {},
    'acktr': {},

@@ -51,5 +51,5 @@ def test_continuous_identity(alg):
    simple_test(env_fn, learn_fn, -0.1)

if __name__ == '__main__':
    test_continuous_identity('a2c')
@@ -6,7 +6,7 @@ from baselines.common.tests.util import simple_test
from baselines.run import get_learn_function

# TODO investigate a2c and ppo2 failures - is it due to bad hyperparameters for this problem?
# GitHub issue https://github.com/openai/baselines/issues/189
common_kwargs = {
    'seed': 0,

@@ -25,21 +25,21 @@ learn_args = {
    'trpo_mpi': dict(total_timesteps=80000, timesteps_per_batch=100, cg_iters=10, lam=1.0, max_kl=0.001)
}

#tests pass, but are too slow on travis. Same algorithms are covered
# by other tests with less compute-hungry nn's and by benchmarks
@pytest.mark.skip
@pytest.mark.slow
@pytest.mark.parametrize("alg", learn_args.keys())
def test_mnist(alg):
    '''
    Test if the algorithm can learn to classify MNIST digits.
    Uses CNN policy.
    '''

    learn_kwargs = learn_args[alg]
    learn_kwargs.update(common_kwargs)

    learn = get_learn_function(alg)
    learn_fn = lambda e: learn(env=e, **learn_kwargs)
    env_fn = lambda: MnistEnv(seed=0, episode_len=100)
@@ -14,15 +14,15 @@ from functools import partial

learn_kwargs = {
    'deepq': {},
    'a2c': {},
    'acktr': {},
    'ppo2': {'nminibatches': 1, 'nsteps': 10},
    'trpo_mpi': {},
}

network_kwargs = {
    'mlp': {},
    'cnn': {'pad': 'SAME'},
    'lstm': {},
    'cnn_lnlstm': {'pad': 'SAME'}
}

@@ -32,15 +32,15 @@ network_kwargs = {
@pytest.mark.parametrize("network_fn", network_kwargs.keys())
def test_serialization(learn_fn, network_fn):
    '''
    Test if the trained model can be serialized
    '''

    if network_fn.endswith('lstm') and learn_fn in ['acktr', 'trpo_mpi', 'deepq']:
        # TODO make acktr work with recurrent policies
        # and test
        # github issue: https://github.com/openai/baselines/issues/194
        return

    env = DummyVecEnv([lambda: MnistEnv(10, episode_len=100)])
    ob = env.reset().copy()

@@ -74,14 +74,14 @@ def test_serialization(learn_fn, network_fn):
    np.testing.assert_allclose(mean1, mean2, atol=0.5)
    np.testing.assert_allclose(std1, std2, atol=0.5)

def _serialize_variables():
    sess = get_session()
    variables = tf.trainable_variables()
    values = sess.run(variables)
    return {var.name: value for var, value in zip(variables, values)}

def _get_action_stats(model, ob):
    ntrials = 1000
@@ -30,7 +30,7 @@ def simple_test(env_fn, learn_fn, min_reward_fraction, n_trials=N_TRIALS):
            a, v, state, _ = model.step(obs, S=state, M=[False])
        else:
            a, v, _, _ = model.step(obs)

        obs, rew, done, _ = env.step(a)
        sum_rew += float(rew)

@@ -46,7 +46,7 @@ def reward_per_episode_test(env_fn, learn_fn, min_avg_reward, n_trials=N_EPISODE
    with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default():
        model = learn_fn(env)

        N_TRIALS = 100

        observations, actions, rewards = rollout(env, model, N_TRIALS)
        rewards = [sum(r) for r in rewards]
@@ -347,7 +347,7 @@ def load_variables(load_path, variables=None, sess=None):
    variables = variables or tf.trainable_variables()

    loaded_params = joblib.load(os.path.expanduser(load_path))
    restores = []
    if isinstance(loaded_params, list):
        assert len(loaded_params) == len(variables), 'number of variables loaded mismatches len(variables)'
        for d, v in zip(loaded_params, variables):
@@ -9,8 +9,8 @@ class DummyVecEnv(VecEnv):
        env = self.envs[0]
        VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space)
        obs_space = env.observation_space

        self.keys, shapes, dtypes = obs_space_info(obs_space)
        self.buf_obs = { k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys }
        self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool)
        self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32)

@@ -62,7 +62,7 @@ class DummyVecEnv(VecEnv):
    def get_images(self):
        return [env.render(mode='rgb_array') for env in self.envs]

    def render(self, mode='human'):
        if self.num_envs == 1:
            self.envs[0].render(mode=mode)
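For context, the docstrings elsewhere in this commit recommend wrapping single gym environments in DummyVecEnv; a minimal usage sketch follows, with CartPole chosen only for illustration.

    # Hedged sketch: vectorize one environment and step it.
    import gym
    import numpy as np
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    venv = DummyVecEnv([lambda: gym.make('CartPole-v0')])
    obs = venv.reset()                                  # shape (num_envs, ...) = (1, 4)
    actions = np.array([venv.action_space.sample()])    # one action per env
    obs, rews, dones, infos = venv.step(actions)
    print(obs.shape, rews, dones)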
@@ -71,7 +71,7 @@ class Memory(object):
    def append(self, obs0, action, reward, obs1, terminal1, training=True):
        if not training:
            return

        self.observations0.append(obs0)
        self.actions.append(action)
        self.rewards.append(reward)

@@ -35,12 +35,12 @@ class Actor(Model):
            if self.layer_norm:
                x = tc.layers.layer_norm(x, center=True, scale=True)
            x = tf.nn.relu(x)

            x = tf.layers.dense(x, 64)
            if self.layer_norm:
                x = tc.layers.layer_norm(x, center=True, scale=True)
            x = tf.nn.relu(x)

            x = tf.layers.dense(x, self.nb_actions, kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3))
            x = tf.nn.tanh(x)
        return x
@@ -176,7 +176,7 @@ def learn(env,
    load_path: str
        path to load the model from. (default: None)
    **network_kwargs
        additional keyword arguments to pass to the network builder.

    Returns
    -------

@@ -215,7 +215,7 @@ def learn(env,
    }

    act = ActWrapper(act, act_params)

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)

@@ -246,7 +246,7 @@ def learn(env,
        model_file = os.path.join(td, "model")
        model_saved = False

        if tf.train.latest_checkpoint(td) is not None:
            load_variables(model_file)
            logger.log('Loaded model from {}'.format(model_file))

@@ -254,7 +254,7 @@ def learn(env,
        elif load_path is not None:
            load_variables(load_path)
            logger.log('Loaded model from {}'.format(load_path))

        for t in range(total_timesteps):
            if callback is not None:
@@ -7,7 +7,7 @@ from baselines.common import models
def main():
    env = gym.make("MountainCar-v0")
    act = deepq.learn(
        env,
        network=models.mlp(num_layers=1, num_hidden=64),
        total_timesteps=0,
        load_path='mountaincar_model.pkl'

@@ -29,7 +29,7 @@ def main():
    model.save('pong_model.pkl')
    env.close()

if __name__ == '__main__':
@@ -94,8 +94,8 @@ def cnn_to_mlp(convs, hiddens, dueling=False, layer_norm=False):
def build_q_func(network, hiddens=[256], dueling=True, layer_norm=False, **network_kwargs):
    if isinstance(network, str):
        from baselines.common.models import get_network_builder
        network = get_network_builder(network)(**network_kwargs)

    def q_func_builder(input_placeholder, num_actions, scope, reuse=False):
        with tf.variable_scope(scope, reuse=reuse):
            latent, _ = network(input_placeholder)

@@ -125,5 +125,5 @@ def build_q_func(network, hiddens=[256], dueling=True, layer_norm=False, **netwo
        else:
            q_out = action_scores
        return q_out

    return q_func_builder
@@ -66,13 +66,13 @@ class Uint8Input(PlaceholderTfInput):
class ObservationInput(PlaceholderTfInput):
    def __init__(self, observation_space, name=None):
        """Creates an input placeholder tailored to a specific observation space

        Parameters
        ----------

        observation_space:
            observation space of the environment. Should be one of the gym.spaces types
        name: str
            tensorflow name of the underlying placeholder
        """
        inpt, self.processed_inpt = observation_input(observation_space, name=name)

@@ -80,5 +80,5 @@ class ObservationInput(PlaceholderTfInput):

    def get(self):
        return self.processed_inpt
@@ -41,7 +41,7 @@ def main(policy_file, seed, n_test_rollouts, render):
    for name in ['T', 'gamma', 'noise_eps', 'random_eps']:
        eval_params[name] = params[name]

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(seed)

@@ -37,12 +37,12 @@ def load_results(file):

def pad(xs, value=np.nan):
    maxlen = np.max([len(x) for x in xs])

    padded_xs = []
    for x in xs:
        if x.shape[0] >= maxlen:
            padded_xs.append(x)

        padding = np.ones((maxlen - x.shape[0],) + x.shape[1:]) * value
        x_padded = np.concatenate([x, padding], axis=0)
        assert x_padded.shape[1:] == x.shape[1:]
@@ -23,17 +23,17 @@ def train(num_timesteps, seed, model_path=None):
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=2048,
        clip_param=0.2, entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
    )
    env.close()
    if model_path:
        U.save_state(model_path)

    return pi

class RewScale(gym.RewardWrapper):

@@ -48,28 +48,28 @@ def main():
    parser = mujoco_arg_parser()
    parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'humanoid_policy'))
    parser.set_defaults(num_timesteps=int(2e7))

    args = parser.parse_args()

    if not args.play:
        # train the model
        train(num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path)
    else:
        # construct the model object, load pre-trained model and render
        pi = train(num_timesteps=1, seed=args.seed)
        U.load_state(args.model_path)
        env = make_mujoco_env('Humanoid-v2', seed=0)

        ob = env.reset()
        while True:
            action = pi.act(stochastic=False, ob=ob)[0]
            ob, _, done, _ = env.step(action)
            env.render()
            if done:
                ob = env.reset()

if __name__ == '__main__':
    main()
@@ -155,20 +155,20 @@ def learn(*, network, env, total_timesteps, seed=None, nsteps=2048, ent_coef=0.0
          save_interval=0, load_path=None, **network_kwargs):
    '''
    Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)

    Parameters:
    ----------

    network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
        specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
        tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
        neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
        See common/models.py/lstm for more details on using recurrent nets in policies

    env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation.
        The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.

    nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
        nenv is number of environment copies simulated in parallel)

@@ -176,38 +176,38 @@ def learn(*, network, env, total_timesteps, seed=None, nsteps=2048, ent_coef=0.0
    ent_coef: float policy entropy coefficient in the optimization objective

    lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
        training and 0 is the end of the training.

    vf_coef: float value function loss coefficient in the optimization objective

    max_grad_norm: float or None gradient norm clipping coefficient

    gamma: float discounting factor

    lam: float advantage estimation discounting factor (lambda in the paper)

    log_interval: int number of timesteps between logging events

    nminibatches: int number of training minibatches per update. For recurrent policies,
        should be smaller or equal than number of environments run in parallel.

    noptepochs: int number of training epochs per update

    cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
        and 0 is the end of the training

    save_interval: int number of timesteps between saving events

    load_path: str path to load the model from

    **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
        For instance, 'mlp' network architecture has arguments num_hidden and num_layers.

    '''

    set_global_seeds(seed)

    if isinstance(lr, float): lr = constfn(lr)
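As an illustrative aside, a minimal call to this keyword-only entrypoint might look like the sketch below; the environment, timestep budget, and hyperparameter values are placeholders for illustration only.

    # Hedged sketch: run PPO on a single vectorized CartPole environment.
    import gym
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
    from baselines.ppo2 import ppo2

    venv = DummyVecEnv([lambda: gym.make('CartPole-v0')])
    model = ppo2.learn(
        network='mlp',
        env=venv,
        total_timesteps=20000,   # illustrative budget
        nsteps=128,
        nminibatches=4,          # 128 steps * 1 env divides evenly into 4 minibatches
        lr=3e-4,
        cliprange=0.2,
        log_interval=1,
    )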
@@ -84,4 +84,4 @@ def main():
    plt.show()

if __name__ == '__main__':
    main()
@@ -120,7 +120,7 @@ def build_env(args):
            env = bench.Monitor(env, logger.get_dir())
            env = retro_wrappers.wrap_deepmind_retro(env)

    else:
        get_session(tf.ConfigProto(allow_soft_placement=True,
                                   intra_op_parallelism_threads=1,
                                   inter_op_parallelism_threads=1))

@@ -128,7 +128,7 @@ def build_env(args):
        env = make_vec_env(env_id, env_type, args.num_env or 1, seed, reward_scale=args.reward_scale)

        if env_type == 'mujoco':
            env = VecNormalize(env)

    return env
@@ -4,7 +4,7 @@ from baselines.common.models import mlp, cnn_small
def atari():
    return dict(
        network = cnn_small(),
        timesteps_per_batch=512,
        max_kl=0.001,
        cg_iters=10,
        cg_damping=1e-3,

@@ -26,5 +26,5 @@ def mujoco():
        lam=0.98,
        vf_iters=5,
        vf_stepsize=1e-3,
        normalize_observations=True,
    )
@@ -83,13 +83,13 @@ def add_vtarg_and_adv(seg, gamma, lam):
    seg["tdlamret"] = seg["adv"] + seg["vpred"]

def learn(*,
        network,
        env,
        total_timesteps,
        timesteps_per_batch=1024, # what to train on
        max_kl=0.001,
        cg_iters=10,
        gamma=0.99,
        lam=1.0, # advantage estimation
        seed=None,
        entcoeff=0.0,

@@ -103,7 +103,7 @@ def learn(*,
        ):
    '''
    learn a policy function with TRPO algorithm

    Parameters:
    ----------

@@ -121,7 +121,7 @@ def learn(*,
    cg_iters number of iterations of conjugate gradient algorithm

    cg_damping conjugate gradient damping

    vf_stepsize learning rate for adam optimizer used to optimize value function loss

@@ -130,11 +130,11 @@ def learn(*,
    total_timesteps max number of timesteps

    max_episodes max number of episodes

    max_iters maximum number of policy optimization iterations

    callback function to be called with (locals(), globals()) each policy optimization step

    load_path str, path to load the model from (default: None, i.e. no model is loaded)

    **network_kwargs keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network

@@ -145,18 +145,18 @@ def learn(*,
    learnt model

    '''

    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()

    cpus_per_worker = 1
    U.get_session(config=tf.ConfigProto(
        allow_soft_placement=True,
        inter_op_parallelism_threads=cpus_per_worker,
        intra_op_parallelism_threads=cpus_per_worker
    ))

    policy = build_policy(env, network, value_network='copy', **network_kwargs)
    set_global_seeds(seed)

@@ -245,7 +245,7 @@ def learn(*,
    U.initialize()
    if load_path is not None:
        pi.load(load_path)

    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)

@@ -384,8 +384,8 @@ def get_trainable_variables(scope):
    return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)

def get_vf_trainable_variables(scope):
    return [v for v in get_trainable_variables(scope) if 'vf' in v.name[len(scope):].split('/')]

def get_pi_trainable_variables(scope):
    return [v for v in get_trainable_variables(scope) if 'pi' in v.name[len(scope):].split('/')]
@@ -1,5 +1,5 @@
[flake8]
-select = F,E999
+select = F,E999,W291,W293
exclude =
    .git,
    __pycache__,