Merge branch 'master' of github.com:openai/baselines into internal

Jacob Hilton
2019-04-03 16:21:48 -07:00
17 changed files with 53 additions and 26 deletions

View File

@@ -89,7 +89,7 @@ python -m baselines.run --alg=ppo2 --env=Humanoid-v2 --network=mlp --num_timeste
will set the entropy coefficient to 0.1, construct a fully connected network with 3 layers of 32 hidden units each, and create a separate network for value function estimation (so that its parameters are not shared with the policy network, but the structure is the same).
See docstrings in [common/models.py](baselines/common/models.py) for descriptions of the network parameters for each type of model, and
docstring for [baselines/ppo2/ppo2.py/learn()](baselines/ppo2/ppo2.py#L152) for the description of the ppo2 hyperparamters.
docstring for [baselines/ppo2/ppo2.py/learn()](baselines/ppo2/ppo2.py#L152) for the description of the ppo2 hyperparameters.
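For reference, the same configuration can be reached from Python by calling `ppo2.learn` directly. A minimal sketch, assuming the extra arguments are forwarded to the mlp builder as described above; the env id and timestep budget are illustrative, not from the README:

```python
import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.ppo2 import ppo2

# Sketch only: env id and timestep budget are illustrative.
env = DummyVecEnv([lambda: gym.make('CartPole-v0')])
model = ppo2.learn(
    network='mlp',
    env=env,
    total_timesteps=int(1e5),   # illustrative budget
    ent_coef=0.1,               # entropy coefficient
    num_hidden=32,              # 32 hidden units per layer (forwarded to the mlp builder)
    num_layers=3,               # 3 fully connected layers
    value_network='copy',       # separate value network with the same structure
)
```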
### Example 2. DQN on Atari
DQN with Atari is at this point a classic benchmark. To run the baselines implementation of DQN on Atari Pong:

View File

@@ -11,6 +11,8 @@ from baselines.common.policies import build_policy
from baselines.a2c.utils import Scheduler, find_trainable_variables
from baselines.a2c.runner import Runner
from baselines.ppo2.ppo2 import safemean
from collections import deque
from tensorflow import losses
@@ -195,6 +197,7 @@ def learn(
# Instantiate the runner object
runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
epinfobuf = deque(maxlen=100)
# Calculate the batch_size
nbatch = nenvs*nsteps
@@ -204,7 +207,8 @@ def learn(
for update in range(1, total_timesteps//nbatch+1):
# Get mini batch of experiences
obs, states, rewards, masks, actions, values = runner.run()
obs, states, rewards, masks, actions, values, epinfos = runner.run()
epinfobuf.extend(epinfos)
policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
nseconds = time.time()-tstart
@@ -221,6 +225,8 @@ def learn(
logger.record_tabular("policy_entropy", float(policy_entropy))
logger.record_tabular("value_loss", float(value_loss))
logger.record_tabular("explained_variance", float(ev))
logger.record_tabular("eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
logger.record_tabular("eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
logger.dump_tabular()
return model
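For context, `safemean` (imported above from ppo2) exists so that logging does not choke on an empty episode-info buffer before any episode has finished; the idea is roughly this (a sketch of the intent, not necessarily the exact source):

```python
import numpy as np

def safemean(xs):
    # Return NaN instead of a numpy warning while the epinfobuf is still empty.
    return np.nan if len(xs) == 0 else np.mean(xs)
```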

View File

@@ -22,6 +22,7 @@ class Runner(AbstractEnvRunner):
# We initialize the lists that will contain the mb of experiences
mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
mb_states = self.states
epinfos = []
for n in range(self.nsteps):
# Given observations, take action and value (V(s))
# We already have self.obs because the Runner superclass runs self.obs[:] = env.reset() on init
@@ -34,7 +35,10 @@ class Runner(AbstractEnvRunner):
mb_dones.append(self.dones)
# Take actions in env and look at the results
obs, rewards, dones, _ = self.env.step(actions)
obs, rewards, dones, infos = self.env.step(actions)
for info in infos:
maybeepinfo = info.get('episode')
if maybeepinfo: epinfos.append(maybeepinfo)
self.states = states
self.dones = dones
self.obs = obs
@@ -69,4 +73,4 @@ class Runner(AbstractEnvRunner):
mb_rewards = mb_rewards.flatten()
mb_values = mb_values.flatten()
mb_masks = mb_masks.flatten()
return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values, epinfos
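The `info.get('episode')` lookup depends on the env being wrapped in `baselines.bench.Monitor`, which attaches a summary dict with keys `'r'` (episode return), `'l'` (length), and `'t'` (elapsed time) on the step an episode finishes. A minimal single-env sketch of where that dict comes from (the env id is illustrative):

```python
import gym
from baselines.bench import Monitor

# Sketch: Monitor adds info['episode'] on the terminal step of each episode.
env = Monitor(gym.make('CartPole-v0'), filename=None)
obs, done = env.reset(), False
while not done:
    obs, rew, done, info = env.step(env.action_space.sample())
epinfo = info['episode']
print(epinfo['r'], epinfo['l'])   # episode return and length; 't' holds wall-clock time
```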

View File

@@ -11,6 +11,8 @@ from baselines.common.tf_util import get_session, save_variables, load_variables
from baselines.a2c.runner import Runner
from baselines.a2c.utils import Scheduler, find_trainable_variables
from baselines.acktr import kfac
from baselines.ppo2.ppo2 import safemean
from collections import deque
class Model(object):
@@ -118,6 +120,7 @@ def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interva
model.load(load_path)
runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
epinfobuf = deque(maxlen=100)
nbatch = nenvs*nsteps
tstart = time.time()
coord = tf.train.Coordinator()
@@ -127,7 +130,8 @@ def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interva
enqueue_threads = []
for update in range(1, total_timesteps//nbatch+1):
obs, states, rewards, masks, actions, values = runner.run()
obs, states, rewards, masks, actions, values, epinfos = runner.run()
epinfobuf.extend(epinfos)
policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
model.old_obs = obs
nseconds = time.time()-tstart
@@ -141,6 +145,8 @@ def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interva
logger.record_tabular("policy_loss", float(policy_loss))
logger.record_tabular("value_loss", float(value_loss))
logger.record_tabular("explained_variance", float(ev))
logger.record_tabular("eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
logger.record_tabular("eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
logger.dump_tabular()
if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():

View File

@@ -87,6 +87,8 @@ def make_env(env_id, env_type, mpi_rank=0, subrank=0, seed=None, reward_scale=1.
if env_type == 'atari':
env = wrap_deepmind(env, **wrapper_kwargs)
elif env_type == 'retro':
if 'frame_stack' not in wrapper_kwargs:
wrapper_kwargs['frame_stack'] = 1
env = retro_wrappers.wrap_deepmind_retro(env, **wrapper_kwargs)
if isinstance(env.action_space, gym.spaces.Box):

View File

@@ -123,7 +123,7 @@ def mpi_weighted_mean(comm, local_name2valcount):
val = float(val)
except ValueError:
if comm.rank == 0:
warnings.warn(f'WARNING: tried to compute mean on non-float {name}={val}')
warnings.warn('WARNING: tried to compute mean on non-float {}={}'.format(name, val))
else:
name2sum[name] += val * count
name2count[name] += count

View File

@@ -248,7 +248,7 @@ def plot_results(
figsize=None,
legend_outside=False,
resample=0,
smooth_step=1.0,
smooth_step=1.0
):
'''
Plot multiple Results objects

View File

@@ -177,7 +177,7 @@ def profile_tf_runningmeanstd():
outfile = '/tmp/timeline.json'
with open(outfile, 'wt') as f:
f.write(chrome_trace)
print(f'Successfully saved profile to {outfile}. Exiting.')
print('Successfully saved profile to {}. Exiting.'.format(outfile))
exit(0)
'''

View File

@@ -16,7 +16,7 @@ def test_mpi_weighted_mean():
d = mpi_util.mpi_weighted_mean(comm, name2valcount)
correctval = {'a' : (10 * 2 + 19) / 3.0, 'b' : 20, 'c' : 42}
if comm.rank == 0:
assert d == correctval, f'{d} != {correctval}'
assert d == correctval, '{} != {}'.format(d, correctval)
for name, (val, count) in name2valcount.items():
for _ in range(count):
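For clarity on `correctval`: `mpi_weighted_mean` weights each rank's value by its count, so `'a'` combines a value of 10 with count 2 and a value of 19 with count 1 into (10*2 + 19*1)/(2+1) = 13.0, while `'b'` and `'c'` each have a single contributor and pass through unchanged. A standalone check of that arithmetic; the counts for `'b'` and `'c'` below are illustrative, since a single contributor's weighted mean equals its value regardless of count:

```python
# Pure-Python check of the weighted-mean arithmetic; no MPI involved.
contributions = {'a': [(10, 2), (19, 1)], 'b': [(20, 3)], 'c': [(42, 3)]}
weighted = {name: sum(v * c for v, c in vcs) / sum(c for _, c in vcs)
            for name, vcs in contributions.items()}
assert weighted == {'a': 13.0, 'b': 20.0, 'c': 42.0}
```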

View File

@@ -305,12 +305,17 @@ def display_var_info(vars):
logger.info("Total model parameters: %0.2f million" % (count_params*1e-6))
def get_available_gpus():
# recipe from here:
# https://stackoverflow.com/questions/38559755/how-to-get-current-available-gpus-in-tensorflow?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa
def get_available_gpus(session_config=None):
# based on recipe from https://stackoverflow.com/a/38580201
# Unless we allocate a session here, subsequent attempts to create one
# will ignore our custom config (in particular, allow_growth=True will have
# no effect).
if session_config is None:
session_config = get_session()._config
from tensorflow.python.client import device_lib
local_device_protos = device_lib.list_local_devices()
local_device_protos = device_lib.list_local_devices(session_config)
return [x.name for x in local_device_protos if x.device_type == 'GPU']
# ================================================================
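A usage sketch for the new parameter, assuming the TF1-style `ConfigProto` used elsewhere in the repo: pass the same config you intend the session to use, so that device listing does not first spin up a default-configured session and claim all GPU memory.

```python
import tensorflow as tf
from baselines.common.tf_util import get_available_gpus

# Sketch: list GPUs under an explicit config so allow_growth is respected.
config = tf.ConfigProto(allow_soft_placement=True)
config.gpu_options.allow_growth = True             # do not pre-allocate all GPU memory
print(get_available_gpus(session_config=config))   # e.g. ['/device:GPU:0']
```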

View File

@@ -23,7 +23,7 @@ def model(inpt, num_actions, scope, reuse=False):
if __name__ == '__main__':
with U.make_session(8):
with U.make_session(num_cpu=8):
# Create the environment
env = gym.make("CartPole-v0")
# Create all the functions necessary to train the model

View File

@@ -20,7 +20,7 @@ class TfInput(object):
"""
raise NotImplementedError
def make_feed_dict(data):
def make_feed_dict(self, data):
"""Given data input it to the placeholder(s)."""
raise NotImplementedError

View File

@@ -12,13 +12,13 @@ Download the expert data into `./data`, [download link](https://drive.google.com
### Step 2: Run GAIL
Run with single thread:
Run with single rank:
```bash
python -m baselines.gail.run_mujoco
```
Run with multiple threads:
Run with multiple ranks:
```bash
mpirun -np 16 python -m baselines.gail.run_mujoco

View File

@@ -66,7 +66,7 @@ class TransitionClassifier(object):
with tf.variable_scope("obfilter"):
self.obs_rms = RunningMeanStd(shape=self.observation_shape)
obs = (obs_ph - self.obs_rms.mean / self.obs_rms.std)
obs = (obs_ph - self.obs_rms.mean) / self.obs_rms.std
_input = tf.concat([obs, acs_ph], axis=1) # concatenate the two inputs -> form a transition
p_h1 = tf.contrib.layers.fully_connected(_input, self.hidden_size, activation_fn=tf.nn.tanh)
p_h2 = tf.contrib.layers.fully_connected(p_h1, self.hidden_size, activation_fn=tf.nn.tanh)

View File

@@ -50,8 +50,12 @@ class Mujoco_Dset(object):
# obs, acs: shape (N, L, ) + S where N = # episodes, L = episode length
# and S is the environment observation/action space.
# Flatten to (N * L, prod(S))
self.obs = np.reshape(obs, [-1, np.prod(obs.shape[2:])])
self.acs = np.reshape(acs, [-1, np.prod(acs.shape[2:])])
if len(obs.shape) > 2:
self.obs = np.reshape(obs, [-1, np.prod(obs.shape[2:])])
self.acs = np.reshape(acs, [-1, np.prod(acs.shape[2:])])
else:
self.obs = np.vstack(obs)
self.acs = np.vstack(acs)
self.rets = traj_data['ep_rets'][:traj_limitation]
self.avg_ret = sum(self.rets)/len(self.rets)
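The new branch covers expert data whose episodes have different lengths: such data loads as a 1-D object array of per-episode arrays, so `obs.shape[2:]` does not exist and the reshape is replaced by stacking the episodes. A small sketch of the two flattening paths (shapes are hypothetical):

```python
import numpy as np

# Fixed-length case: 3 episodes of length 5 with 4-dim observations -> (15, 4).
dense = np.zeros((3, 5, 4))
flat_dense = np.reshape(dense, [-1, np.prod(dense.shape[2:])])
assert flat_dense.shape == (15, 4)

# Variable-length case: episodes of lengths 5, 7 and 6; the loaded array would be
# 1-D (object dtype), so the `len(obs.shape) > 2` test fails and vstack is used.
episodes = [np.zeros((5, 4)), np.zeros((7, 4)), np.zeros((6, 4))]
flat_ragged = np.vstack(episodes)
assert flat_ragged.shape == (18, 4)
```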

View File

@@ -119,13 +119,13 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2
eval_epinfobuf = deque(maxlen=100)
# Start total timer
tfirststart = time.time()
tfirststart = time.perf_counter()
nupdates = total_timesteps//nbatch
for update in range(1, nupdates+1):
assert nbatch % nminibatches == 0
# Start timer
tstart = time.time()
tstart = time.perf_counter()
frac = 1.0 - (update - 1.0) / nupdates
# Calculate the learning rate
lrnow = lr(frac)
@@ -173,7 +173,7 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2
# Feedforward --> get losses --> update
lossvals = np.mean(mblossvals, axis=0)
# End timer
tnow = time.time()
tnow = time.perf_counter()
# Calculate the fps (frames per second)
fps = int(nbatch / (tnow - tstart))
if update % log_interval == 0 or update == 1:
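`time.perf_counter()` is a monotonic, high-resolution clock, so the fps estimate can no longer be skewed (or go negative) if the system clock is adjusted mid-run. The timing pattern, as a minimal sketch with a stand-in for the rollout:

```python
import time

nbatch = 2048                       # illustrative batch size
tstart = time.perf_counter()        # monotonic: immune to wall-clock adjustments
time.sleep(0.25)                    # stand-in for collecting nbatch steps and training
fps = int(nbatch / (time.perf_counter() - tstart))
print(fps)                          # roughly 8000 for this stand-in
```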

View File

@@ -6,7 +6,7 @@ from collections import defaultdict
import tensorflow as tf
import numpy as np
from baselines.common.vec_env import VecFrameStack, VecNormalize
from baselines.common.vec_env import VecFrameStack, VecNormalize, VecEnv
from baselines.common.vec_env.vec_video_recorder import VecVideoRecorder
from baselines.common.cmd_util import common_arg_parser, parse_unknown_args, make_vec_env, make_env
from baselines.common.tf_util import get_session
@@ -228,11 +228,11 @@ def main(args):
actions, _, _, _ = model.step(obs)
obs, rew, done, _ = env.step(actions)
episode_rew += rew[0]
episode_rew += rew[0] if isinstance(env, VecEnv) else rew
env.render()
done = done.any() if isinstance(done, np.ndarray) else done
if done:
print(f'episode_rew={episode_rew}')
print('episode_rew={}'.format(episode_rew))
episode_rew = 0
obs = env.reset()
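The guard reflects the two return conventions: a `VecEnv` returns batched arrays (one reward and one done flag per sub-environment), while a plain `gym.Env` returns scalars. Reassembled as a standalone sketch, assuming `model` and `env` were built earlier in `main` as shown above:

```python
import numpy as np
from baselines.common.vec_env import VecEnv

# Sketch of the play loop handling both vectorized and plain envs.
episode_rew = 0
obs = env.reset()
while True:
    actions, _, _, _ = model.step(obs)
    obs, rew, done, _ = env.step(actions)
    episode_rew += rew[0] if isinstance(env, VecEnv) else rew    # rew is an array for a VecEnv
    env.render()
    done = done.any() if isinstance(done, np.ndarray) else done  # done is batched as well
    if done:
        print('episode_rew={}'.format(episode_rew))
        episode_rew = 0
        obs = env.reset()
```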