Compare commits
master ... peterz_imp: 1 commit (efb071949e)

@@ -1,4 +1,4 @@
-<img src="data/logo.jpg" width=25% align="right" /> [](https://travis-ci.org/openai/baselines)
+<img src="data/logo.jpg" width=25% align="right" />
 
 # Baselines
 
@@ -52,6 +52,8 @@ Install baselines package
 ```bash
 pip install -e .
 ```
+### MuJoCo
+Some of the baselines examples use the [MuJoCo](http://www.mujoco.org) (multi-joint dynamics in contact) physics simulator, which is proprietary and requires binaries and a license (a temporary 30-day license can be obtained from [www.mujoco.org](http://www.mujoco.org)). Instructions on setting up MuJoCo can be found [here](https://github.com/openai/mujoco-py).
 
 ## Testing the installation
 All unit tests in baselines can be run using pytest runner:

@@ -9,6 +9,8 @@ _atariexpl7 = ['Freeway', 'Gravitar', 'MontezumaRevenge', 'Pitfall', 'PrivateEye
 _BENCHMARKS = []
 
+remove_version_re = re.compile(r'-v\d+$')
+
 
 def register_benchmark(benchmark):
     for b in _BENCHMARKS:
         if b['name'] == benchmark['name']:

@@ -138,3 +140,11 @@ register_benchmark({
     'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atari50]
 })
 
+# HER DDPG
+
+register_benchmark({
+    'name': 'HerDdpg',
+    'description': 'Smoke-test only benchmark of HER',
+    'tasks': [{'trials': 1, 'env_id': 'FetchReach-v1'}]
+})
+
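As context for the hunk above: `register_benchmark` (whose duplicate-name guard appears in the first benchmarks.py hunk) simply appends a dict to the module-level `_BENCHMARKS` list. A minimal self-contained sketch of that registry pattern; the `get_benchmark` lookup is a hypothetical helper added for illustration, not part of this diff:

```python
import re

_BENCHMARKS = []
remove_version_re = re.compile(r'-v\d+$')  # strips a trailing '-v1' etc. from env ids

def register_benchmark(benchmark):
    # duplicate-name guard, as in the hunk above
    for b in _BENCHMARKS:
        if b['name'] == benchmark['name']:
            raise ValueError('benchmark {} already registered'.format(b['name']))
    _BENCHMARKS.append(benchmark)

def get_benchmark(name):
    # hypothetical lookup helper (illustration only)
    for b in _BENCHMARKS:
        if b['name'] == name:
            return b
    raise ValueError('benchmark {} not registered'.format(name))

register_benchmark({
    'name': 'HerDdpg',
    'description': 'Smoke-test only benchmark of HER',
    'tasks': [{'trials': 1, 'env_id': 'FetchReach-v1'}]
})
assert get_benchmark('HerDdpg')['tasks'][0]['env_id'] == 'FetchReach-v1'
```
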
@@ -74,6 +74,7 @@ def mujoco_arg_parser():
     parser.add_argument('--env', help='environment ID', type=str, default='Reacher-v2')
     parser.add_argument('--seed', help='RNG seed', type=int, default=0)
     parser.add_argument('--num-timesteps', type=int, default=int(1e6))
+    parser.add_argument('--play', default=False, action='store_true')
     return parser
 
 def robotics_arg_parser():

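A quick standalone check of the new flag's semantics: `action='store_true'` means `--play` stays False unless the flag is passed. A sketch using plain argparse, nothing baselines-specific:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--env', help='environment ID', type=str, default='Reacher-v2')
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
parser.add_argument('--num-timesteps', type=int, default=int(1e6))
parser.add_argument('--play', default=False, action='store_true')

args = parser.parse_args(['--play'])  # simulate `... run_mujoco.py --play`
assert args.play is True and args.num_timesteps == 1000000
```
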
@@ -1,18 +0,0 @@
-import numpy as np
-from abc import ABC, abstractmethod
-
-class AbstractEnvRunner(ABC):
-    def __init__(self, *, env, model, nsteps):
-        self.env = env
-        self.model = model
-        nenv = env.num_envs
-        self.batch_ob_shape = (nenv*nsteps,) + env.observation_space.shape
-        self.obs = np.zeros((nenv,) + env.observation_space.shape, dtype=model.train_model.X.dtype.name)
-        self.obs[:] = env.reset()
-        self.nsteps = nsteps
-        self.states = model.initial_state
-        self.dones = [False for _ in range(nenv)]
-
-    @abstractmethod
-    def run(self):
-        raise NotImplementedError

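The deleted class was an ABC: subclasses inherit the env/model/nsteps bookkeeping and must override `run()`. A trimmed sketch of the pattern, with a hypothetical concrete subclass standing in for a real rollout collector:

```python
from abc import ABC, abstractmethod

class AbstractEnvRunner(ABC):
    def __init__(self, *, env, model, nsteps):
        # keyword-only args, as in the deleted file
        self.env, self.model, self.nsteps = env, model, nsteps

    @abstractmethod
    def run(self):
        raise NotImplementedError

class Runner(AbstractEnvRunner):
    # hypothetical concrete subclass: a real one would collect nsteps of experience
    def run(self):
        return [self.env, self.model, self.nsteps]

# Instantiating the ABC itself raises TypeError; the subclass works:
r = Runner(env='env', model='model', nsteps=5)
assert r.run() == ['env', 'model', 5]
```
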
@@ -279,3 +279,27 @@ def display_var_info(vars):
         logger.info("   %s%s %i params %s" % (name, " "*(55-len(name)), v_params, str(v.shape)))
 
     logger.info("Total model parameters: %0.2f million" % (count_params*1e-6))
+
+
+def get_available_gpus():
+    # recipe from here:
+    # https://stackoverflow.com/questions/38559755/how-to-get-current-available-gpus-in-tensorflow?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa
+    from tensorflow.python.client import device_lib
+    local_device_protos = device_lib.list_local_devices()
+    return [x.name for x in local_device_protos if x.device_type == 'GPU']
+
+# ================================================================
+# Saving variables
+# ================================================================
+
+def load_state(fname):
+    saver = tf.train.Saver()
+    saver.restore(tf.get_default_session(), fname)
+
+def save_state(fname):
+    os.makedirs(os.path.dirname(fname), exist_ok=True)
+    saver = tf.train.Saver()
+    saver.save(tf.get_default_session(), fname)

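These helpers wrap `tf.train.Saver` around the default session; `save_state` additionally creates the checkpoint directory first. A minimal usage sketch, assuming a TF 1.x runtime (the API baselines targets here) and a hypothetical checkpoint path:

```python
import os
import tensorflow as tf  # TF 1.x API

with tf.Session() as sess:
    v = tf.get_variable('v', initializer=42.0)
    sess.run(tf.global_variables_initializer())

    fname = '/tmp/baselines_demo/model'                  # hypothetical path
    os.makedirs(os.path.dirname(fname), exist_ok=True)   # the extra step save_state adds
    tf.train.Saver().save(sess, fname)                   # save_state body

    sess.run(v.assign(0.0))                              # clobber the variable...
    tf.train.Saver().restore(sess, fname)                # ...load_state brings it back
    assert sess.run(v) == 42.0
```
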
@@ -50,6 +50,9 @@ class DummyVecEnv(VecEnv):
     def close(self):
         return
 
+    def render(self):
+        return [e.render() for e in self.envs]
+
     def _save_obs(self, e, obs):
         for k in self.keys:
             if k is None:

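The new `render` simply fans the call out to each wrapped env and collects the results in a list. A toy illustration of that fan-out, with hypothetical minimal envs so no gym install is needed:

```python
class ToyEnv:
    # hypothetical stand-in for a wrapped gym env
    def __init__(self, i):
        self.i = i
    def render(self):
        return 'frame-from-env-%d' % self.i

class ToyVecEnv:
    # just enough of DummyVecEnv to show the render fan-out
    def __init__(self, envs):
        self.envs = envs
    def render(self):
        return [e.render() for e in self.envs]

venv = ToyVecEnv([ToyEnv(0), ToyEnv(1)])
assert venv.render() == ['frame-from-env-0', 'frame-from-env-1']
```
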
@@ -14,6 +14,9 @@ def main():
     parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6)
     parser.add_argument('--dueling', type=int, default=1)
     parser.add_argument('--num-timesteps', type=int, default=int(10e6))
+    parser.add_argument('--checkpoint-freq', type=int, default=10000)
+    parser.add_argument('--checkpoint-path', type=str, default=None)
+
     args = parser.parse_args()
     logger.configure()
     set_global_seeds(args.seed)

@@ -39,7 +42,9 @@ def main():
         target_network_update_freq=1000,
         gamma=0.99,
         prioritized_replay=bool(args.prioritized),
-        prioritized_replay_alpha=args.prioritized_replay_alpha
+        prioritized_replay_alpha=args.prioritized_replay_alpha,
+        checkpoint_freq=args.checkpoint_freq,
+        checkpoint_path=args.checkpoint_path,
     )
 
     env.close()

@@ -6,13 +6,13 @@ import zipfile
 import cloudpickle
 import numpy as np
 
 import gym
 import baselines.common.tf_util as U
+from baselines.common.tf_util import load_state, save_state
 from baselines import logger
 from baselines.common.schedules import LinearSchedule
 from baselines import deepq
 from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer
-from baselines.deepq.utils import BatchInput, load_state, save_state
+from baselines.deepq.utils import BatchInput
 
 
 class ActWrapper(object):

@@ -88,6 +88,7 @@ def learn(env,
           batch_size=32,
           print_freq=100,
           checkpoint_freq=10000,
+          checkpoint_path=None,
           learning_starts=1000,
           gamma=1.0,
           target_network_update_freq=500,

@@ -171,6 +172,7 @@ def learn(env,
     # capture the shape outside the closure so that the env object is not serialized
     # by cloudpickle when serializing make_obs_ph
     observation_space_shape = env.observation_space.shape
+
     def make_obs_ph(name):
         return BatchInput(observation_space_shape, name=name)
 

@@ -216,9 +218,17 @@ def learn(env,
     saved_mean_reward = None
     obs = env.reset()
     reset = True
 
     with tempfile.TemporaryDirectory() as td:
-        model_saved = False
+        td = checkpoint_path or td
+
         model_file = os.path.join(td, "model")
+        model_saved = False
+        if tf.train.latest_checkpoint(td) is not None:
+            load_state(model_file)
+            logger.log('Loaded model from {}'.format(model_file))
+            model_saved = True
+
         for t in range(max_timesteps):
             if callback is not None:
                 if callback(locals(), globals()):

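The effect of the new lines: when `checkpoint_path` is given, it replaces the throwaway temp dir, and any existing checkpoint there is restored before training, so interrupted runs can resume. A sketch of the resume-or-start decision with the restore call stubbed out:

```python
import os
import tempfile
import tensorflow as tf

def resume_or_start(checkpoint_path=None):
    # mirrors the hunk above: fall back to a throwaway temp dir when no path is given
    with tempfile.TemporaryDirectory() as td:
        td = checkpoint_path or td
        model_file = os.path.join(td, "model")
        model_saved = False
        if tf.train.latest_checkpoint(td) is not None:
            # load_state(model_file) would restore the session here
            model_saved = True
        return model_file, model_saved

print(resume_or_start())             # fresh temp dir, so model_saved is False
print(resume_or_start('/tmp/ckpt'))  # hypothetical path; resumes only if it holds a checkpoint
```
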
@@ -1,24 +1,10 @@
-import os
-
 import tensorflow as tf
 
-# ================================================================
-# Saving variables
-# ================================================================
-
-def load_state(fname):
-    saver = tf.train.Saver()
-    saver.restore(tf.get_default_session(), fname)
-
-def save_state(fname):
-    os.makedirs(os.path.dirname(fname), exist_ok=True)
-    saver = tf.train.Saver()
-    saver.save(tf.get_default_session(), fname)
-
 # ================================================================
 # Placeholders
 # ================================================================
 
 
 class TfInput(object):
     def __init__(self, name="(unnamed)"):
         """Generalized Tensorflow placeholder. The main differences are:

@@ -50,6 +36,7 @@ class PlaceholderTfInput(TfInput):
     def make_feed_dict(self, data):
         return {self._placeholder: data}
 
+
 class BatchInput(PlaceholderTfInput):
     def __init__(self, shape, dtype=tf.float32, name=None):
         """Creates a placeholder for a batch of tensors of a given shape and dtype

@@ -65,6 +52,7 @@ class BatchInput(PlaceholderTfInput):
         """
         super().__init__(tf.placeholder(dtype, [None] + list(shape), name=name))
 
+
 class Uint8Input(PlaceholderTfInput):
     def __init__(self, shape, name=None):
         """Takes input in uint8 format which is cast to float32 and divided by 255

@@ -1,7 +1,4 @@
-from copy import deepcopy
 import numpy as np
-import json
-import os
 import gym
 
 from baselines import logger

@@ -10,7 +7,7 @@ from baselines.her.her import make_sample_her_transitions
 
 
 DEFAULT_ENV_PARAMS = {
-    'FetchReach-v0': {
+    'FetchReach-v1': {
         'n_cycles': 10,
     },
 }

@@ -51,6 +48,8 @@ DEFAULT_PARAMS = {
 
 
 CACHED_ENVS = {}
 
 
 def cached_make_env(make_env):
+    """
+    Only creates a new environment from the provided function if one has not yet already been

@@ -68,6 +67,7 @@ def prepare_params(kwargs):
     ddpg_params = dict()
 
     env_name = kwargs['env_name']
+
     def make_env():
         return gym.make(env_name)
     kwargs['make_env'] = make_env

@@ -75,7 +75,7 @@ def prepare_params(kwargs):
     assert hasattr(tmp_env, '_max_episode_steps')
     kwargs['T'] = tmp_env._max_episode_steps
     tmp_env.reset()
-    kwargs['max_u'] = np.array(kwargs['max_u']) if type(kwargs['max_u']) == list else kwargs['max_u']
+    kwargs['max_u'] = np.array(kwargs['max_u']) if isinstance(kwargs['max_u'], list) else kwargs['max_u']
     kwargs['gamma'] = 1. - 1. / kwargs['T']
     if 'lr' in kwargs:
         kwargs['pi_lr'] = kwargs['lr']

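Two details worth calling out here: `isinstance(x, list)` is the idiomatic spelling of the old `type(x) == list` check, and `gamma = 1 - 1/T` ties the discount to the episode horizon (T = 50 gives gamma = 0.98). Verified in a standalone sketch with toy values:

```python
import numpy as np

kwargs = {'max_u': [1., 1., 1.], 'T': 50}  # toy values for illustration
kwargs['max_u'] = np.array(kwargs['max_u']) if isinstance(kwargs['max_u'], list) else kwargs['max_u']
kwargs['gamma'] = 1. - 1. / kwargs['T']

assert kwargs['max_u'].shape == (3,)
assert abs(kwargs['gamma'] - 0.98) < 1e-12  # horizon of 50 steps -> discount 0.98
```
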
@@ -103,6 +103,7 @@ def log_params(params, logger=logger):
 def configure_her(params):
     env = cached_make_env(params['make_env'])
     env.reset()
+
     def reward_fun(ag_2, g, info):  # vectorized
         return env.compute_reward(achieved_goal=ag_2, desired_goal=g, info=info)
 

@@ -13,6 +13,8 @@ import baselines.her.experiment.config as config
 from baselines.her.rollout import RolloutWorker
 from baselines.her.util import mpi_fork
 
+from subprocess import CalledProcessError
+
 
 def mpi_average(value):
     if value == []:

@@ -81,12 +83,17 @@ def train(policy, rollout_worker, evaluator,
 
 
 def launch(
-    env_name, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return,
+    env, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return,
     override_params={}, save_policies=True
 ):
     # Fork for multi-CPU MPI implementation.
     if num_cpu > 1:
-        whoami = mpi_fork(num_cpu)
+        try:
+            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
+        except CalledProcessError:
+            # fancy version of mpi call failed, try simple version
+            whoami = mpi_fork(num_cpu)
+
         if whoami == 'parent':
             sys.exit(0)
         import baselines.common.tf_util as U

@@ -109,10 +116,10 @@ def launch(
 
     # Prepare params.
     params = config.DEFAULT_PARAMS
-    params['env_name'] = env_name
+    params['env_name'] = env
     params['replay_strategy'] = replay_strategy
-    if env_name in config.DEFAULT_ENV_PARAMS:
-        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
+    if env in config.DEFAULT_ENV_PARAMS:
+        params.update(config.DEFAULT_ENV_PARAMS[env])  # merge env-specific parameters in
     params.update(**override_params)  # makes it possible to override any parameter
     with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
         json.dump(params, f)

@@ -168,7 +175,7 @@ def launch(
 
 
 @click.command()
-@click.option('--env_name', type=str, default='FetchReach-v0', help='the name of the OpenAI Gym environment that you want to train on')
+@click.option('--env', type=str, default='FetchReach-v1', help='the name of the OpenAI Gym environment that you want to train on')
 @click.option('--logdir', type=str, default=None, help='the path to where logs and policy pickles should go. If not specified, creates a folder in /tmp/')
 @click.option('--n_epochs', type=int, default=50, help='the number of training epochs to run')
 @click.option('--num_cpu', type=int, default=1, help='the number of CPU cores to use (using MPI)')

@@ -58,12 +58,12 @@ def nn(input, layers_sizes, reuse=None, flatten=False, name=""):
|
||||
"""Creates a simple neural network
|
||||
"""
|
||||
for i, size in enumerate(layers_sizes):
|
||||
activation = tf.nn.relu if i < len(layers_sizes)-1 else None
|
||||
activation = tf.nn.relu if i < len(layers_sizes) - 1 else None
|
||||
input = tf.layers.dense(inputs=input,
|
||||
units=size,
|
||||
kernel_initializer=tf.contrib.layers.xavier_initializer(),
|
||||
reuse=reuse,
|
||||
name=name+'_'+str(i))
|
||||
name=name + '_' + str(i))
|
||||
if activation:
|
||||
input = activation(input)
|
||||
if flatten:
|
||||
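For reference, `nn` builds a stack of dense layers in which every layer but the last is followed by a ReLU, and the final layer stays linear. The activation-selection logic, checked without TF:

```python
def layer_activations(layers_sizes):
    # relu on every layer except the last, which stays linear (as in nn above)
    return ['relu' if i < len(layers_sizes) - 1 else None
            for i, _ in enumerate(layers_sizes)]

assert layer_activations([64, 64, 4]) == ['relu', 'relu', None]
```
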
@@ -85,7 +85,7 @@ def install_mpi_excepthook():
     sys.excepthook = new_hook
 
 
-def mpi_fork(n):
+def mpi_fork(n, extra_mpi_args=[]):
     """Re-launches the current script with workers
     Returns "parent" for original parent, "child" for MPI children
     """

@@ -99,14 +99,10 @@ def mpi_fork(n, extra_mpi_args=[]):
         IN_MPI="1"
     )
-    # "-bind-to core" is crucial for good performance
-    args = [
-        "mpirun",
-        "-np",
-        str(n),
-        "-bind-to",
-        "core",
-        sys.executable
-    ]
+    args = ["mpirun", "-np", str(n)] + \
+        extra_mpi_args + \
+        [sys.executable]
+
     args += sys.argv
     subprocess.check_call(args, env=env)
     return "parent"

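After this change, `-bind-to core` is no longer hard-coded; callers supply it through `extra_mpi_args` (see the `launch` hunk above, which retries without it on `CalledProcessError`). The command-list assembly is easy to verify in isolation; the empty-list default below mirrors the new signature:

```python
import sys

def build_mpirun_args(n, extra_mpi_args=[]):
    # mirrors the hunk above: base command + optional flags + interpreter
    return ["mpirun", "-np", str(n)] + extra_mpi_args + [sys.executable]

assert build_mpirun_args(4, ['--bind-to', 'core'])[:5] == \
    ["mpirun", "-np", "4", "--bind-to", "core"]
assert build_mpirun_args(2) == ["mpirun", "-np", "2", sys.executable]
```
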
@@ -140,5 +136,5 @@ def reshape_for_broadcasting(source, target):
     before broadcasting it with MPI.
     """
     dim = len(target.get_shape())
-    shape = ([1] * (dim-1)) + [-1]
+    shape = ([1] * (dim - 1)) + [-1]
     return tf.reshape(tf.cast(source, target.dtype), shape)

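`reshape_for_broadcasting` reshapes a flat vector to `(1, ..., 1, -1)` so it broadcasts against a rank-`dim` target. The same shape logic expressed in NumPy terms, avoiding a TF session just for illustration:

```python
import numpy as np

def reshape_for_broadcasting(source, target):
    # NumPy rendition of the TF helper above
    dim = target.ndim
    shape = ([1] * (dim - 1)) + [-1]
    return np.reshape(source.astype(target.dtype), shape)

target = np.zeros((8, 16, 3))
source = np.array([1., 2., 3.])
out = reshape_for_broadcasting(source, target)
assert out.shape == (1, 1, 3)
assert (target + out).shape == (8, 16, 3)  # broadcasts cleanly against the target
```
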
@@ -236,6 +236,7 @@ def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr,
             print('Saving to', savepath)
             model.save(savepath)
+    env.close()
     return model
 
 def safemean(xs):
     return np.nan if len(xs) == 0 else np.mean(xs)

@@ -1,8 +1,9 @@
 #!/usr/bin/env python3
 import argparse
+import numpy as np
 from baselines.common.cmd_util import mujoco_arg_parser
 from baselines import bench, logger
 
 
 def train(env_id, num_timesteps, seed):
     from baselines.common import set_global_seeds
     from baselines.common.vec_env.vec_normalize import VecNormalize

@@ -16,27 +17,40 @@ def train(env_id, num_timesteps, seed):
                             intra_op_parallelism_threads=ncpu,
                             inter_op_parallelism_threads=ncpu)
     tf.Session(config=config).__enter__()
+
     def make_env():
         env = gym.make(env_id)
-        env = bench.Monitor(env, logger.get_dir())
+        env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)
         return env
+
     env = DummyVecEnv([make_env])
     env = VecNormalize(env)
 
     set_global_seeds(seed)
     policy = MlpPolicy
-    ppo2.learn(policy=policy, env=env, nsteps=2048, nminibatches=32,
+    model = ppo2.learn(policy=policy, env=env, nsteps=2048, nminibatches=32,
         lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
         ent_coef=0.0,
         lr=3e-4,
         cliprange=0.2,
         total_timesteps=num_timesteps)
+
+    return model, env
 
 
 def main():
     args = mujoco_arg_parser().parse_args()
     logger.configure()
-    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
+    model, env = train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
+
+    if args.play:
+        logger.log("Running trained model")
+        obs = np.zeros((env.num_envs,) + env.observation_space.shape)
+        obs[:] = env.reset()
+        while True:
+            actions = model.step(obs)[0]
+            obs[:] = env.step(actions)[0]
+            env.render()
+
 
 if __name__ == '__main__':

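With `--play`, `train` now returns both the model and the vec env so `main` can roll the trained policy out. Note that `obs` keeps the `(num_envs,) + obs_shape` batch layout of the vectorized env, and `model.step(obs)[0]` picks the actions out of PPO2's `(actions, values, states, neglogpacs)` tuple. A bounded stand-in version of that loop, with a toy model and env for illustration only; assuming the usual module layout, the real loop is reached via `python -m baselines.ppo2.run_mujoco --play`:

```python
import numpy as np

class ToyVecEnv:
    # hypothetical stand-in for the VecNormalize-wrapped env returned by train()
    num_envs = 1
    class observation_space:
        shape = (11,)
    def reset(self):
        return np.zeros((self.num_envs,) + self.observation_space.shape)
    def step(self, actions):
        obs = np.random.randn(self.num_envs, *self.observation_space.shape)
        return obs, None, None, None
    def render(self):
        pass

class ToyModel:
    # hypothetical stand-in for the PPO2 model: step(obs) -> (actions, values, states, neglogpacs)
    def step(self, obs):
        return np.zeros((obs.shape[0], 3)), None, None, None

model, env = ToyModel(), ToyVecEnv()
obs = np.zeros((env.num_envs,) + env.observation_space.shape)
obs[:] = env.reset()
for _ in range(5):  # bounded, unlike the `while True` in the hunk above
    actions = model.step(obs)[0]
    obs[:] = env.step(actions)[0]
    env.render()
```
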