Merge branch 'master' of github.com:openai/baselines into internal

Jacob Hilton
2019-04-03 16:21:48 -07:00
17 changed files with 53 additions and 26 deletions

View File

@@ -89,7 +89,7 @@ python -m baselines.run --alg=ppo2 --env=Humanoid-v2 --network=mlp --num_timeste
will set the entropy coefficient to 0.1, construct a fully connected network with 3 layers of 32 hidden units each, and create a separate network for value function estimation (so that its parameters are not shared with the policy network, but the structure is the same).
See docstrings in [common/models.py](baselines/common/models.py) for descriptions of the network parameters for each type of model, and
docstring for [baselines/ppo2/ppo2.py/learn()](baselines/ppo2/ppo2.py#L152) for the description of the ppo2 hyperparamters.
docstring for [baselines/ppo2/ppo2.py/learn()](baselines/ppo2/ppo2.py#L152) for the description of the ppo2 hyperparameters.
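For reference, the same configuration can be reached from Python by calling `ppo2.learn` directly. A minimal sketch, assuming the extra arguments are forwarded to the mlp builder as described above; the env id and timestep budget are illustrative, not from the README:

```python
import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.ppo2 import ppo2

# Sketch only: env id and timestep budget are illustrative.
env = DummyVecEnv([lambda: gym.make('CartPole-v0')])
model = ppo2.learn(
    network='mlp',
    env=env,
    total_timesteps=int(1e5),   # illustrative budget
    ent_coef=0.1,               # entropy coefficient
    num_hidden=32,              # 32 hidden units per layer (forwarded to the mlp builder)
    num_layers=3,               # 3 fully connected layers
    value_network='copy',       # separate value network with the same structure
)
```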
### Example 2. DQN on Atari
DQN with Atari is at this point a classic benchmark. To run the baselines implementation of DQN on Atari Pong:

View File

@@ -11,6 +11,8 @@ from baselines.common.policies import build_policy
from baselines.a2c.utils import Scheduler, find_trainable_variables
from baselines.a2c.runner import Runner
from baselines.ppo2.ppo2 import safemean
from collections import deque
from tensorflow import losses
@@ -195,6 +197,7 @@ def learn(
# Instantiate the runner object
runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
epinfobuf = deque(maxlen=100)
# Calculate the batch_size
nbatch = nenvs*nsteps
@@ -204,7 +207,8 @@ def learn(
for update in range(1, total_timesteps//nbatch+1):
# Get mini batch of experiences
obs, states, rewards, masks, actions, values = runner.run()
obs, states, rewards, masks, actions, values, epinfos = runner.run()
epinfobuf.extend(epinfos)
policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
nseconds = time.time()-tstart
@@ -221,6 +225,8 @@ def learn(
logger.record_tabular("policy_entropy", float(policy_entropy))
logger.record_tabular("value_loss", float(value_loss))
logger.record_tabular("explained_variance", float(ev))
logger.record_tabular("eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
logger.record_tabular("eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
logger.dump_tabular()
return model
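For context, `safemean` (imported above from ppo2) exists so that logging does not choke on an empty episode-info buffer before any episode has finished; the idea is roughly this (a sketch of the intent, not necessarily the exact source):

```python
import numpy as np

def safemean(xs):
    # Return NaN instead of a numpy warning while the epinfobuf is still empty.
    return np.nan if len(xs) == 0 else np.mean(xs)
```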

View File

@@ -22,6 +22,7 @@ class Runner(AbstractEnvRunner):
# We initialize the lists that will contain the mb of experiences
mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
mb_states = self.states
epinfos = []
for n in range(self.nsteps):
# Given observations, take action and value (V(s))
# We already have self.obs because the Runner superclass runs self.obs[:] = env.reset() on init
@@ -34,7 +35,10 @@ class Runner(AbstractEnvRunner):
mb_dones.append(self.dones)
# Take actions in env and look at the results
obs, rewards, dones, _ = self.env.step(actions)
obs, rewards, dones, infos = self.env.step(actions)
for info in infos:
maybeepinfo = info.get('episode')
if maybeepinfo: epinfos.append(maybeepinfo)
self.states = states
self.dones = dones
self.obs = obs
@@ -69,4 +73,4 @@ class Runner(AbstractEnvRunner):
mb_rewards = mb_rewards.flatten()
mb_values = mb_values.flatten()
mb_masks = mb_masks.flatten()
return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values, epinfos
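The `info.get('episode')` lookup depends on the env being wrapped in `baselines.bench.Monitor`, which attaches a summary dict with keys `'r'` (episode return), `'l'` (length), and `'t'` (elapsed time) on the step an episode finishes. A minimal single-env sketch of where that dict comes from (the env id is illustrative):

```python
import gym
from baselines.bench import Monitor

# Sketch: Monitor adds info['episode'] on the terminal step of each episode.
env = Monitor(gym.make('CartPole-v0'), filename=None)
obs, done = env.reset(), False
while not done:
    obs, rew, done, info = env.step(env.action_space.sample())
epinfo = info['episode']
print(epinfo['r'], epinfo['l'])   # episode return and length; 't' holds wall-clock time
```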

View File

@@ -11,6 +11,8 @@ from baselines.common.tf_util import get_session, save_variables, load_variables
from baselines.a2c.runner import Runner
from baselines.a2c.utils import Scheduler, find_trainable_variables
from baselines.acktr import kfac
from baselines.ppo2.ppo2 import safemean
from collections import deque
class Model(object):
@@ -118,6 +120,7 @@ def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interva
model.load(load_path)
runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
epinfobuf = deque(maxlen=100)
nbatch = nenvs*nsteps
tstart = time.time()
coord = tf.train.Coordinator()
@@ -127,7 +130,8 @@ def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interva
enqueue_threads = []
for update in range(1, total_timesteps//nbatch+1):
obs, states, rewards, masks, actions, values = runner.run()
obs, states, rewards, masks, actions, values, epinfos = runner.run()
epinfobuf.extend(epinfos)
policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
model.old_obs = obs
nseconds = time.time()-tstart
@@ -141,6 +145,8 @@ def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interva
logger.record_tabular("policy_loss", float(policy_loss))
logger.record_tabular("value_loss", float(value_loss))
logger.record_tabular("explained_variance", float(ev))
logger.record_tabular("eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
logger.record_tabular("eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
logger.dump_tabular()
if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():

View File

@@ -87,6 +87,8 @@ def make_env(env_id, env_type, mpi_rank=0, subrank=0, seed=None, reward_scale=1.
if env_type == 'atari':
env = wrap_deepmind(env, **wrapper_kwargs)
elif env_type == 'retro':
if 'frame_stack' not in wrapper_kwargs:
wrapper_kwargs['frame_stack'] = 1
env = retro_wrappers.wrap_deepmind_retro(env, **wrapper_kwargs)
if isinstance(env.action_space, gym.spaces.Box):

View File

@@ -123,7 +123,7 @@ def mpi_weighted_mean(comm, local_name2valcount):
val = float(val)
except ValueError:
if comm.rank == 0:
warnings.warn(f'WARNING: tried to compute mean on non-float {name}={val}')
warnings.warn('WARNING: tried to compute mean on non-float {}={}'.format(name, val))
else:
name2sum[name] += val * count
name2count[name] += count

View File

@@ -248,7 +248,7 @@ def plot_results(
figsize=None,
legend_outside=False,
resample=0,
smooth_step=1.0,
smooth_step=1.0
):
'''
Plot multiple Results objects

View File

@@ -177,7 +177,7 @@ def profile_tf_runningmeanstd():
outfile = '/tmp/timeline.json'
with open(outfile, 'wt') as f:
f.write(chrome_trace)
print(f'Successfully saved profile to {outfile}. Exiting.')
print('Successfully saved profile to {}. Exiting.'.format(outfile))
exit(0)
'''

View File

@@ -16,7 +16,7 @@ def test_mpi_weighted_mean():
d = mpi_util.mpi_weighted_mean(comm, name2valcount)
correctval = {'a' : (10 * 2 + 19) / 3.0, 'b' : 20, 'c' : 42}
if comm.rank == 0:
assert d == correctval, f'{d} != {correctval}'
assert d == correctval, '{} != {}'.format(d, correctval)
for name, (val, count) in name2valcount.items():
for _ in range(count):
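For clarity on `correctval`: `mpi_weighted_mean` weights each rank's value by its count, so `'a'` combines a value of 10 with count 2 and a value of 19 with count 1 into (10*2 + 19*1)/(2+1) = 13.0, while `'b'` and `'c'` each have a single contributor and pass through unchanged. A standalone check of that arithmetic; the counts for `'b'` and `'c'` below are illustrative, since a single contributor's weighted mean equals its value regardless of count:

```python
# Pure-Python check of the weighted-mean arithmetic; no MPI involved.
contributions = {'a': [(10, 2), (19, 1)], 'b': [(20, 3)], 'c': [(42, 3)]}
weighted = {name: sum(v * c for v, c in vcs) / sum(c for _, c in vcs)
            for name, vcs in contributions.items()}
assert weighted == {'a': 13.0, 'b': 20.0, 'c': 42.0}
```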

View File

@@ -305,12 +305,17 @@ def display_var_info(vars):
logger.info("Total model parameters: %0.2f million" % (count_params*1e-6))
def get_available_gpus():
# recipe from here:
# https://stackoverflow.com/questions/38559755/how-to-get-current-available-gpus-in-tensorflow?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa
def get_available_gpus(session_config=None):
# based on recipe from https://stackoverflow.com/a/38580201
# Unless we allocate a session here, subsequent attempts to create one
# will ignore our custom config (in particular, allow_growth=True will have
# no effect).
if session_config is None:
session_config = get_session()._config
from tensorflow.python.client import device_lib
local_device_protos = device_lib.list_local_devices()
local_device_protos = device_lib.list_local_devices(session_config)
return [x.name for x in local_device_protos if x.device_type == 'GPU']
# ================================================================
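A usage sketch for the new parameter, assuming the TF1-style `ConfigProto` used elsewhere in the repo: pass the same config you intend the session to use, so that device listing does not first spin up a default-configured session and claim all GPU memory.

```python
import tensorflow as tf
from baselines.common.tf_util import get_available_gpus

# Sketch: list GPUs under an explicit config so allow_growth is respected.
config = tf.ConfigProto(allow_soft_placement=True)
config.gpu_options.allow_growth = True             # do not pre-allocate all GPU memory
print(get_available_gpus(session_config=config))   # e.g. ['/device:GPU:0']
```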

View File

@@ -23,7 +23,7 @@ def model(inpt, num_actions, scope, reuse=False):
if __name__ == '__main__':
with U.make_session(8):
with U.make_session(num_cpu=8):
# Create the environment
env = gym.make("CartPole-v0")
# Create all the functions necessary to train the model

View File

@@ -20,7 +20,7 @@ class TfInput(object):
"""
raise NotImplementedError
def make_feed_dict(data):
def make_feed_dict(self, data):
"""Given data input it to the placeholder(s)."""
raise NotImplementedError

View File

@@ -12,13 +12,13 @@ Download the expert data into `./data`, [download link](https://drive.google.com
### Step 2: Run GAIL
Run with single thread:
Run with single rank:
```bash
python -m baselines.gail.run_mujoco
```
Run with multiple threads:
Run with multiple ranks:
```bash
mpirun -np 16 python -m baselines.gail.run_mujoco

View File

@@ -66,7 +66,7 @@ class TransitionClassifier(object):
with tf.variable_scope("obfilter"):
self.obs_rms = RunningMeanStd(shape=self.observation_shape)
obs = (obs_ph - self.obs_rms.mean / self.obs_rms.std)
obs = (obs_ph - self.obs_rms.mean) / self.obs_rms.std
_input = tf.concat([obs, acs_ph], axis=1) # concatenate the two inputs -> form a transition
p_h1 = tf.contrib.layers.fully_connected(_input, self.hidden_size, activation_fn=tf.nn.tanh)
p_h2 = tf.contrib.layers.fully_connected(p_h1, self.hidden_size, activation_fn=tf.nn.tanh)

View File

@@ -50,8 +50,12 @@ class Mujoco_Dset(object):
# obs, acs: shape (N, L, ) + S where N = # episodes, L = episode length
# and S is the environment observation/action space.
# Flatten to (N * L, prod(S))
self.obs = np.reshape(obs, [-1, np.prod(obs.shape[2:])])
self.acs = np.reshape(acs, [-1, np.prod(acs.shape[2:])])
if len(obs.shape) > 2:
self.obs = np.reshape(obs, [-1, np.prod(obs.shape[2:])])
self.acs = np.reshape(acs, [-1, np.prod(acs.shape[2:])])
else:
self.obs = np.vstack(obs)
self.acs = np.vstack(acs)
self.rets = traj_data['ep_rets'][:traj_limitation]
self.avg_ret = sum(self.rets)/len(self.rets)
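The new branch covers expert data whose episodes have different lengths: such data loads as a 1-D object array of per-episode arrays, so `obs.shape[2:]` does not exist and the reshape is replaced by stacking the episodes. A small sketch of the two flattening paths (shapes are hypothetical):

```python
import numpy as np

# Fixed-length case: 3 episodes of length 5 with 4-dim observations -> (15, 4).
dense = np.zeros((3, 5, 4))
flat_dense = np.reshape(dense, [-1, np.prod(dense.shape[2:])])
assert flat_dense.shape == (15, 4)

# Variable-length case: episodes of lengths 5, 7 and 6; the loaded array would be
# 1-D (object dtype), so the `len(obs.shape) > 2` test fails and vstack is used.
episodes = [np.zeros((5, 4)), np.zeros((7, 4)), np.zeros((6, 4))]
flat_ragged = np.vstack(episodes)
assert flat_ragged.shape == (18, 4)
```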

View File

@@ -119,13 +119,13 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2
eval_epinfobuf = deque(maxlen=100)
# Start total timer
tfirststart = time.time()
tfirststart = time.perf_counter()
nupdates = total_timesteps//nbatch
for update in range(1, nupdates+1):
assert nbatch % nminibatches == 0
# Start timer
tstart = time.time()
tstart = time.perf_counter()
frac = 1.0 - (update - 1.0) / nupdates
# Calculate the learning rate
lrnow = lr(frac)
@@ -173,7 +173,7 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2
# Feedforward --> get losses --> update
lossvals = np.mean(mblossvals, axis=0)
# End timer
tnow = time.time()
tnow = time.perf_counter()
# Calculate the fps (frames per second)
fps = int(nbatch / (tnow - tstart))
if update % log_interval == 0 or update == 1:
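`time.perf_counter()` is a monotonic, high-resolution clock, so the fps estimate can no longer be skewed (or go negative) if the system clock is adjusted mid-run. The timing pattern, as a minimal sketch with a stand-in for the rollout:

```python
import time

nbatch = 2048                       # illustrative batch size
tstart = time.perf_counter()        # monotonic: immune to wall-clock adjustments
time.sleep(0.25)                    # stand-in for collecting nbatch steps and training
fps = int(nbatch / (time.perf_counter() - tstart))
print(fps)                          # roughly 8000 for this stand-in
```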

View File

@@ -6,7 +6,7 @@ from collections import defaultdict
import tensorflow as tf
import numpy as np
from baselines.common.vec_env import VecFrameStack, VecNormalize
from baselines.common.vec_env import VecFrameStack, VecNormalize, VecEnv
from baselines.common.vec_env.vec_video_recorder import VecVideoRecorder
from baselines.common.cmd_util import common_arg_parser, parse_unknown_args, make_vec_env, make_env
from baselines.common.tf_util import get_session
@@ -228,11 +228,11 @@ def main(args):
actions, _, _, _ = model.step(obs)
obs, rew, done, _ = env.step(actions)
episode_rew += rew[0]
episode_rew += rew[0] if isinstance(env, VecEnv) else rew
env.render()
done = done.any() if isinstance(done, np.ndarray) else done
if done:
print(f'episode_rew={episode_rew}')
print('episode_rew={}'.format(episode_rew))
episode_rew = 0
obs = env.reset()
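The guard reflects the two return conventions: a `VecEnv` returns batched arrays (one reward and one done flag per sub-environment), while a plain `gym.Env` returns scalars. Reassembled as a standalone sketch, assuming `model` and `env` were built earlier in `main` as shown above:

```python
import numpy as np
from baselines.common.vec_env import VecEnv

# Sketch of the play loop handling both vectorized and plain envs.
episode_rew = 0
obs = env.reset()
while True:
    actions, _, _, _ = model.step(obs)
    obs, rew, done, _ = env.step(actions)
    episode_rew += rew[0] if isinstance(env, VecEnv) else rew    # rew is an array for a VecEnv
    env.render()
    done = done.any() if isinstance(done, np.ndarray) else done  # done is batched as well
    if done:
        print('episode_rew={}'.format(episode_rew))
        episode_rew = 0
        obs = env.reset()
```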