Compare commits


1 Commit

Author: Peter Zhokhov
SHA1: efb071949e
Message: import rl-algs from 2e3a166 commit
Date: 2018-05-16 12:00:23 -07:00
14 changed files with 117 additions and 73 deletions

View File

@@ -1,4 +1,4 @@
-<img src="data/logo.jpg" width=25% align="right" /> [![Build status](https://travis-ci.org/openai/baselines.svg?branch=master)](https://travis-ci.org/openai/baselines)
+<img src="data/logo.jpg" width=25% align="right" />
 # Baselines
@@ -52,6 +52,8 @@ Install baselines package
 ```bash
 pip install -e .
 ```
+### MuJoCo
+Some of the baselines examples use the [MuJoCo](http://www.mujoco.org) (multi-joint dynamics with contact) physics simulator, which is proprietary and requires binaries and a license (a temporary 30-day license can be obtained from [www.mujoco.org](http://www.mujoco.org)). Instructions on setting up MuJoCo can be found [here](https://github.com/openai/mujoco-py).
 ## Testing the installation
 All unit tests in baselines can be run using pytest runner:
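
A quick way to check a MuJoCo setup before running the MuJoCo examples is a one-environment smoke test. This is a hedged sketch, not part of the commit: it assumes gym and mujoco-py are installed with a valid MuJoCo license, and uses Reacher-v2, the default environment of mujoco_arg_parser later in this diff.

```python
# Hedged sketch: verify the MuJoCo toolchain (gym + mujoco-py + license).
# Reacher-v2 matches the default env in mujoco_arg_parser below.
import gym

env = gym.make('Reacher-v2')   # fails here if mujoco-py or the MuJoCo binaries/license are missing
obs = env.reset()
print(obs.shape)               # observation vector shape for Reacher-v2
env.close()
```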

View File

@@ -9,6 +9,8 @@ _atariexpl7 = ['Freeway', 'Gravitar', 'MontezumaRevenge', 'Pitfall', 'PrivateEye
 _BENCHMARKS = []
 remove_version_re = re.compile(r'-v\d+$')
 def register_benchmark(benchmark):
     for b in _BENCHMARKS:
         if b['name'] == benchmark['name']:
@@ -138,3 +140,11 @@ register_benchmark({
     'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atari50]
 })
+# HER DDPG
+register_benchmark({
+    'name': 'HerDdpg',
+    'description': 'Smoke-test only benchmark of HER',
+    'tasks': [{'trials': 1, 'env_id': 'FetchReach-v1'}]
+})
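
For context (not part of the diff): a benchmark is registered by passing a dict like the HerDdpg entry above to register_benchmark, which appends it to _BENCHMARKS after the duplicate-name check shown in the first hunk. A hedged sketch with an illustrative name:

```python
# Hedged sketch of registering a custom benchmark; the name and description
# are illustrative, the task mirrors the HerDdpg entry added above.
from baselines.bench import benchmarks

benchmarks.register_benchmark({
    'name': 'MyFetchSmoke',
    'description': 'Single-trial smoke test on FetchReach-v1',
    'tasks': [{'trials': 1, 'env_id': 'FetchReach-v1'}]
})
# Re-registering the same name would trip the duplicate-name check in register_benchmark.
```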

View File

@@ -74,6 +74,7 @@ def mujoco_arg_parser():
     parser.add_argument('--env', help='environment ID', type=str, default='Reacher-v2')
     parser.add_argument('--seed', help='RNG seed', type=int, default=0)
     parser.add_argument('--num-timesteps', type=int, default=int(1e6))
+    parser.add_argument('--play', default=False, action='store_true')
     return parser
 def robotics_arg_parser():

View File

@@ -1,18 +0,0 @@
-import numpy as np
-from abc import ABC, abstractmethod
-class AbstractEnvRunner(ABC):
-    def __init__(self, *, env, model, nsteps):
-        self.env = env
-        self.model = model
-        nenv = env.num_envs
-        self.batch_ob_shape = (nenv*nsteps,) + env.observation_space.shape
-        self.obs = np.zeros((nenv,) + env.observation_space.shape, dtype=model.train_model.X.dtype.name)
-        self.obs[:] = env.reset()
-        self.nsteps = nsteps
-        self.states = model.initial_state
-        self.dones = [False for _ in range(nenv)]
-    @abstractmethod
-    def run(self):
-        raise NotImplementedError

View File

@@ -279,3 +279,27 @@ def display_var_info(vars):
         logger.info(" %s%s %i params %s" % (name, " "*(55-len(name)), v_params, str(v.shape)))
     logger.info("Total model parameters: %0.2f million" % (count_params*1e-6))
+def get_available_gpus():
+    # recipe from here:
+    # https://stackoverflow.com/questions/38559755/how-to-get-current-available-gpus-in-tensorflow?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa
+    from tensorflow.python.client import device_lib
+    local_device_protos = device_lib.list_local_devices()
+    return [x.name for x in local_device_protos if x.device_type == 'GPU']
+# ================================================================
+# Saving variables
+# ================================================================
+def load_state(fname):
+    saver = tf.train.Saver()
+    saver.restore(tf.get_default_session(), fname)
+def save_state(fname):
+    os.makedirs(os.path.dirname(fname), exist_ok=True)
+    saver = tf.train.Saver()
+    saver.save(tf.get_default_session(), fname)
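
The helpers added above operate on the default TensorFlow session: load_state/save_state wrap tf.train.Saver, and get_available_gpus queries device_lib. A minimal hedged sketch of how they fit together (TF 1.x style; the variable and checkpoint path are illustrative):

```python
# Hedged sketch exercising get_available_gpus / save_state / load_state from the hunk above.
import tensorflow as tf
from baselines.common.tf_util import get_available_gpus, save_state, load_state

print(get_available_gpus())                  # e.g. ['/device:GPU:0'], or [] on a CPU-only host

v = tf.get_variable("v", shape=(), initializer=tf.zeros_initializer())
with tf.Session():                            # becomes the default session inside the block
    tf.get_default_session().run(tf.global_variables_initializer())
    save_state("/tmp/tf_util_demo/model")     # creates /tmp/tf_util_demo if needed
    load_state("/tmp/tf_util_demo/model")     # restores the variables just saved
```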

View File

@@ -50,6 +50,9 @@ class DummyVecEnv(VecEnv):
     def close(self):
         return
+    def render(self):
+        return [e.render() for e in self.envs]
     def _save_obs(self, e, obs):
         for k in self.keys:
             if k is None:

View File

@@ -14,6 +14,9 @@ def main():
     parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6)
     parser.add_argument('--dueling', type=int, default=1)
     parser.add_argument('--num-timesteps', type=int, default=int(10e6))
+    parser.add_argument('--checkpoint-freq', type=int, default=10000)
+    parser.add_argument('--checkpoint-path', type=str, default=None)
     args = parser.parse_args()
     logger.configure()
     set_global_seeds(args.seed)
@@ -39,7 +42,9 @@ def main():
         target_network_update_freq=1000,
         gamma=0.99,
         prioritized_replay=bool(args.prioritized),
-        prioritized_replay_alpha=args.prioritized_replay_alpha
+        prioritized_replay_alpha=args.prioritized_replay_alpha,
+        checkpoint_freq=args.checkpoint_freq,
+        checkpoint_path=args.checkpoint_path,
     )
     env.close()

View File

@@ -6,13 +6,13 @@ import zipfile
 import cloudpickle
 import numpy as np
-import gym
 import baselines.common.tf_util as U
+from baselines.common.tf_util import load_state, save_state
 from baselines import logger
 from baselines.common.schedules import LinearSchedule
 from baselines import deepq
 from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer
-from baselines.deepq.utils import BatchInput, load_state, save_state
+from baselines.deepq.utils import BatchInput
 class ActWrapper(object):
@@ -88,6 +88,7 @@ def learn(env,
           batch_size=32,
           print_freq=100,
           checkpoint_freq=10000,
+          checkpoint_path=None,
           learning_starts=1000,
           gamma=1.0,
           target_network_update_freq=500,
@@ -171,6 +172,7 @@ def learn(env,
     # capture the shape outside the closure so that the env object is not serialized
     # by cloudpickle when serializing make_obs_ph
     observation_space_shape = env.observation_space.shape
     def make_obs_ph(name):
         return BatchInput(observation_space_shape, name=name)
@@ -216,9 +218,17 @@ def learn(env,
     saved_mean_reward = None
     obs = env.reset()
     reset = True
     with tempfile.TemporaryDirectory() as td:
-        model_saved = False
+        td = checkpoint_path or td
         model_file = os.path.join(td, "model")
+        model_saved = False
+        if tf.train.latest_checkpoint(td) is not None:
+            load_state(model_file)
+            logger.log('Loaded model from {}'.format(model_file))
+            model_saved = True
         for t in range(max_timesteps):
             if callback is not None:
                 if callback(locals(), globals()):
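
With the changes above, learn() can reuse a fixed checkpoint directory and resume from the latest checkpoint when one exists, instead of always starting in a fresh temporary directory. A hedged usage sketch (the environment, network size, and path are illustrative; it assumes the deepq.learn(env, q_func, ...) signature shown in this file):

```python
# Hedged sketch of resuming DQN training via the new checkpoint_path argument.
# CartPole-v0, the mlp size, and the path are illustrative choices.
import gym
from baselines import deepq

env = gym.make("CartPole-v0")
act = deepq.learn(
    env,
    q_func=deepq.models.mlp([64]),
    max_timesteps=100000,
    checkpoint_freq=10000,                 # how often (in timesteps) the model may be checkpointed
    checkpoint_path="/tmp/cartpole_dqn",   # reused across runs; the latest checkpoint is loaded if present
)
env.close()
```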

View File

@@ -1,24 +1,10 @@
-import os
 import tensorflow as tf
-# ================================================================
-# Saving variables
-# ================================================================
-def load_state(fname):
-    saver = tf.train.Saver()
-    saver.restore(tf.get_default_session(), fname)
-def save_state(fname):
-    os.makedirs(os.path.dirname(fname), exist_ok=True)
-    saver = tf.train.Saver()
-    saver.save(tf.get_default_session(), fname)
 # ================================================================
 # Placeholders
 # ================================================================
 class TfInput(object):
     def __init__(self, name="(unnamed)"):
         """Generalized Tensorflow placeholder. The main differences are:
@@ -50,6 +36,7 @@ class PlaceholderTfInput(TfInput):
     def make_feed_dict(self, data):
         return {self._placeholder: data}
 class BatchInput(PlaceholderTfInput):
     def __init__(self, shape, dtype=tf.float32, name=None):
         """Creates a placeholder for a batch of tensors of a given shape and dtype
@@ -65,6 +52,7 @@ class BatchInput(PlaceholderTfInput):
         """
         super().__init__(tf.placeholder(dtype, [None] + list(shape), name=name))
 class Uint8Input(PlaceholderTfInput):
     def __init__(self, shape, name=None):
         """Takes input in uint8 format which is cast to float32 and divided by 255
@@ -85,4 +73,4 @@ class Uint8Input(PlaceholderTfInput):
         self._output = tf.cast(super().get(), tf.float32) / 255.0
     def get(self):
         return self._output

View File

@@ -1,7 +1,4 @@
-from copy import deepcopy
 import numpy as np
-import json
-import os
 import gym
 from baselines import logger
@@ -10,7 +7,7 @@ from baselines.her.her import make_sample_her_transitions
 DEFAULT_ENV_PARAMS = {
-    'FetchReach-v0': {
+    'FetchReach-v1': {
         'n_cycles': 10,
     },
 }
@@ -51,6 +48,8 @@ DEFAULT_PARAMS = {
 CACHED_ENVS = {}
 def cached_make_env(make_env):
     """
     Only creates a new environment from the provided function if one has not yet already been
@@ -68,6 +67,7 @@ def prepare_params(kwargs):
     ddpg_params = dict()
     env_name = kwargs['env_name']
     def make_env():
         return gym.make(env_name)
     kwargs['make_env'] = make_env
@@ -75,7 +75,7 @@ def prepare_params(kwargs):
     assert hasattr(tmp_env, '_max_episode_steps')
     kwargs['T'] = tmp_env._max_episode_steps
     tmp_env.reset()
-    kwargs['max_u'] = np.array(kwargs['max_u']) if type(kwargs['max_u']) == list else kwargs['max_u']
+    kwargs['max_u'] = np.array(kwargs['max_u']) if isinstance(kwargs['max_u'], list) else kwargs['max_u']
     kwargs['gamma'] = 1. - 1. / kwargs['T']
     if 'lr' in kwargs:
         kwargs['pi_lr'] = kwargs['lr']
@@ -83,7 +83,7 @@ def prepare_params(kwargs):
         del kwargs['lr']
-    for name in ['buffer_size', 'hidden', 'layers', 'network_class',
+    for name in ['buffer_size', 'hidden', 'layers',
+                 'network_class',
                  'polyak',
                  'batch_size', 'Q_lr', 'pi_lr',
                  'norm_eps', 'norm_clip', 'max_u',
                  'action_l2', 'clip_obs', 'scope', 'relative_goals']:
@@ -103,6 +103,7 @@ def log_params(params, logger=logger):
 def configure_her(params):
     env = cached_make_env(params['make_env'])
     env.reset()
     def reward_fun(ag_2, g, info): # vectorized
         return env.compute_reward(achieved_goal=ag_2, desired_goal=g, info=info)

View File

@@ -13,6 +13,8 @@ import baselines.her.experiment.config as config
 from baselines.her.rollout import RolloutWorker
 from baselines.her.util import mpi_fork
+from subprocess import CalledProcessError
 def mpi_average(value):
     if value == []:
@@ -81,12 +83,17 @@ def train(policy, rollout_worker, evaluator,
 def launch(
-    env_name, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return,
+    env, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return,
     override_params={}, save_policies=True
 ):
     # Fork for multi-CPU MPI implementation.
     if num_cpu > 1:
-        whoami = mpi_fork(num_cpu)
+        try:
+            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
+        except CalledProcessError:
+            # fancy version of mpi call failed, try simple version
+            whoami = mpi_fork(num_cpu)
         if whoami == 'parent':
             sys.exit(0)
         import baselines.common.tf_util as U
@@ -109,10 +116,10 @@ def launch(
     # Prepare params.
     params = config.DEFAULT_PARAMS
-    params['env_name'] = env_name
+    params['env_name'] = env
     params['replay_strategy'] = replay_strategy
-    if env_name in config.DEFAULT_ENV_PARAMS:
-        params.update(config.DEFAULT_ENV_PARAMS[env_name]) # merge env-specific parameters in
+    if env in config.DEFAULT_ENV_PARAMS:
+        params.update(config.DEFAULT_ENV_PARAMS[env]) # merge env-specific parameters in
     params.update(**override_params) # makes it possible to override any parameter
     with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
         json.dump(params, f)
@@ -126,7 +133,7 @@ def launch(
             'You are running HER with just a single MPI worker. This will work, but the ' +
             'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
             'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this. Please also refer to ' +
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()
@@ -168,7 +175,7 @@ def launch(
 @click.command()
-@click.option('--env_name', type=str, default='FetchReach-v0', help='the name of the OpenAI Gym environment that you want to train on')
+@click.option('--env', type=str, default='FetchReach-v1', help='the name of the OpenAI Gym environment that you want to train on')
 @click.option('--logdir', type=str, default=None, help='the path to where logs and policy pickles should go. If not specified, creates a folder in /tmp/')
 @click.option('--n_epochs', type=int, default=50, help='the number of training epochs to run')
 @click.option('--num_cpu', type=int, default=1, help='the number of CPU cores to use (using MPI)')

View File

@@ -58,12 +58,12 @@ def nn(input, layers_sizes, reuse=None, flatten=False, name=""):
     """Creates a simple neural network
     """
     for i, size in enumerate(layers_sizes):
-        activation = tf.nn.relu if i < len(layers_sizes)-1 else None
+        activation = tf.nn.relu if i < len(layers_sizes) - 1 else None
         input = tf.layers.dense(inputs=input,
                                 units=size,
                                 kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                 reuse=reuse,
-                                name=name+'_'+str(i))
+                                name=name + '_' + str(i))
         if activation:
             input = activation(input)
     if flatten:
@@ -85,7 +85,7 @@ def install_mpi_excepthook():
     sys.excepthook = new_hook
-def mpi_fork(n):
+def mpi_fork(n, extra_mpi_args=[]):
    """Re-launches the current script with workers
    Returns "parent" for original parent, "child" for MPI children
    """
@@ -99,14 +99,10 @@ def mpi_fork(n):
             IN_MPI="1"
         )
         # "-bind-to core" is crucial for good performance
-        args = [
-            "mpirun",
-            "-np",
-            str(n),
-            "-bind-to",
-            "core",
-            sys.executable
-        ]
+        args = ["mpirun", "-np", str(n)] + \
+            extra_mpi_args + \
+            [sys.executable]
         args += sys.argv
         subprocess.check_call(args, env=env)
         return "parent"
@@ -140,5 +136,5 @@ def reshape_for_broadcasting(source, target):
     before broadcasting it with MPI.
     """
     dim = len(target.get_shape())
-    shape = ([1] * (dim-1)) + [-1]
+    shape = ([1] * (dim - 1)) + [-1]
     return tf.reshape(tf.cast(source, target.dtype), shape)

View File

@@ -236,6 +236,7 @@ def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr,
             print('Saving to', savepath)
             model.save(savepath)
     env.close()
+    return model
 def safemean(xs):
     return np.nan if len(xs) == 0 else np.mean(xs)

View File

@@ -1,8 +1,9 @@
 #!/usr/bin/env python3
-import argparse
+import numpy as np
 from baselines.common.cmd_util import mujoco_arg_parser
 from baselines import bench, logger
 def train(env_id, num_timesteps, seed):
     from baselines.common import set_global_seeds
     from baselines.common.vec_env.vec_normalize import VecNormalize
@@ -16,27 +17,40 @@ def train(env_id, num_timesteps, seed):
                             intra_op_parallelism_threads=ncpu,
                             inter_op_parallelism_threads=ncpu)
     tf.Session(config=config).__enter__()
     def make_env():
         env = gym.make(env_id)
-        env = bench.Monitor(env, logger.get_dir())
+        env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)
         return env
     env = DummyVecEnv([make_env])
     env = VecNormalize(env)
     set_global_seeds(seed)
     policy = MlpPolicy
-    ppo2.learn(policy=policy, env=env, nsteps=2048, nminibatches=32,
+    model = ppo2.learn(policy=policy, env=env, nsteps=2048, nminibatches=32,
         lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
         ent_coef=0.0,
         lr=3e-4,
         cliprange=0.2,
         total_timesteps=num_timesteps)
+    return model, env
 def main():
     args = mujoco_arg_parser().parse_args()
     logger.configure()
-    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
+    model, env = train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
+    if args.play:
+        logger.log("Running trained model")
+        obs = np.zeros((env.num_envs,) + env.observation_space.shape)
+        obs[:] = env.reset()
+        while True:
+            actions = model.step(obs)[0]
+            obs[:] = env.step(actions)[0]
+            env.render()
 if __name__ == '__main__':
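
The new --play flag flows from mujoco_arg_parser (in cmd_util above) into main(), which then rolls out the trained model with model.step / env.step / env.render as shown in the final hunk. A hedged sketch of just the parsing side, using the defaults visible in this diff:

```python
# Hedged sketch: the --play flag added to mujoco_arg_parser in this commit.
from baselines.common.cmd_util import mujoco_arg_parser

args = mujoco_arg_parser().parse_args(['--play'])
print(args.env, args.num_timesteps, args.play)   # Reacher-v2 1000000 True
# When args.play is set, run_mujoco.main() loops: model.step(obs) -> env.step(actions) -> env.render().
```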