From 217b111c887a80e1350a19d0daad91a2782b303f Mon Sep 17 00:00:00 2001 From: Peter Zhokhov Date: Fri, 10 Aug 2018 14:14:46 -0700 Subject: [PATCH] merged refactor --- .benchmark_pattern | 1 + .gitignore | 2 - .travis.yml | 4 +- Dockerfile | 16 +- README.md | 54 ++++ baselines/a2c/a2c.py | 174 ++++++----- baselines/a2c/policies.py | 146 --------- baselines/a2c/run_atari.py | 30 -- baselines/a2c/runner.py | 60 ++++ baselines/a2c/utils.py | 16 +- baselines/acer/{acer_simple.py => acer.py} | 190 +++++++----- baselines/acer/defaults.py | 4 + baselines/acer/policies.py | 6 +- baselines/acer/run_atari.py | 30 -- baselines/acer/runner.py | 60 ++++ baselines/acktr/acktr.py | 1 + baselines/acktr/acktr_disc.py | 62 ++-- baselines/acktr/run_atari.py | 4 +- baselines/bench/benchmarks.py | 5 +- baselines/bench/monitor.py | 4 +- baselines/common/atari_wrappers.py | 2 + baselines/common/cmd_util.py | 64 +++- baselines/common/distributions.py | 18 +- baselines/common/input.py | 62 ++-- baselines/common/misc_util.py | 13 +- baselines/common/models.py | 177 +++++++++++ baselines/common/mpi_adam_optimizer.py | 31 ++ baselines/common/mpi_util.py | 101 ++++++ baselines/common/policies.py | 179 +++++++++++ baselines/common/retro_wrappers.py | 293 ++++++++++++++++++ baselines/common/runners.py | 3 +- baselines/common/running_mean_std.py | 165 +++++++++- baselines/common/test_identity.py | 44 --- baselines/common/tests/__init__.py | 0 baselines/common/tests/envs/__init__.py | 0 .../common/tests/envs/fixed_sequence_env.py | 44 +++ baselines/common/tests/envs/identity_env.py | 70 +++++ baselines/common/tests/envs/mnist_env.py | 70 +++++ baselines/common/tests/test_cartpole.py | 40 +++ baselines/common/tests/test_fixed_sequence.py | 51 +++ baselines/common/tests/test_identity.py | 55 ++++ baselines/common/tests/test_mnist.py | 50 +++ baselines/common/tests/test_serialization.py | 97 ++++++ baselines/common/tests/util.py | 91 ++++++ baselines/common/tf_util.py | 147 +++++++-- baselines/common/vec_env/dummy_vec_env.py | 21 +- baselines/common/vec_env/subproc_vec_env.py | 42 +-- baselines/common/vec_env/vec_normalize.py | 2 + baselines/ddpg/ddpg.py | 4 +- baselines/deepq/__init__.py | 4 +- baselines/deepq/{simple.py => deepq.py} | 57 +++- baselines/deepq/defaults.py | 21 ++ baselines/deepq/experiments/enjoy_retro.py | 34 ++ baselines/deepq/experiments/run_retro.py | 49 +++ baselines/deepq/models.py | 38 +++ baselines/deepq/test_identity.py | 43 --- baselines/deepq/utils.py | 3 +- baselines/ppo1/run_atari.py | 2 +- baselines/ppo2/defaults.py | 22 ++ baselines/ppo2/policies.py | 146 --------- baselines/ppo2/ppo2.py | 128 ++++++-- baselines/ppo2/run_atari.py | 40 --- baselines/ppo2/run_mujoco.py | 57 ---- baselines/run.py | 230 ++++++++++++++ baselines/trpo_mpi/defaults.py | 30 ++ baselines/trpo_mpi/nosharing_cnn_policy.py | 56 ---- baselines/trpo_mpi/run_atari.py | 43 --- baselines/trpo_mpi/run_mujoco.py | 36 --- baselines/trpo_mpi/trpo_mpi.py | 138 +++++++-- conftest.py | 19 ++ setup.py | 7 +- 71 files changed, 2939 insertions(+), 1069 deletions(-) create mode 100644 .benchmark_pattern delete mode 100644 baselines/a2c/policies.py delete mode 100644 baselines/a2c/run_atari.py create mode 100644 baselines/a2c/runner.py rename baselines/acer/{acer_simple.py => acer.py} (64%) create mode 100644 baselines/acer/defaults.py delete mode 100644 baselines/acer/run_atari.py create mode 100644 baselines/acer/runner.py create mode 100644 baselines/acktr/acktr.py create mode 100644 baselines/common/models.py create mode 100644 
baselines/common/mpi_adam_optimizer.py create mode 100644 baselines/common/mpi_util.py create mode 100644 baselines/common/policies.py create mode 100644 baselines/common/retro_wrappers.py delete mode 100644 baselines/common/test_identity.py create mode 100644 baselines/common/tests/__init__.py create mode 100644 baselines/common/tests/envs/__init__.py create mode 100644 baselines/common/tests/envs/fixed_sequence_env.py create mode 100644 baselines/common/tests/envs/identity_env.py create mode 100644 baselines/common/tests/envs/mnist_env.py create mode 100644 baselines/common/tests/test_cartpole.py create mode 100644 baselines/common/tests/test_fixed_sequence.py create mode 100644 baselines/common/tests/test_identity.py create mode 100644 baselines/common/tests/test_mnist.py create mode 100644 baselines/common/tests/test_serialization.py create mode 100644 baselines/common/tests/util.py rename baselines/deepq/{simple.py => deepq.py} (90%) create mode 100644 baselines/deepq/defaults.py create mode 100644 baselines/deepq/experiments/enjoy_retro.py create mode 100644 baselines/deepq/experiments/run_retro.py delete mode 100644 baselines/deepq/test_identity.py create mode 100644 baselines/ppo2/defaults.py delete mode 100644 baselines/ppo2/policies.py delete mode 100644 baselines/ppo2/run_atari.py delete mode 100644 baselines/ppo2/run_mujoco.py create mode 100644 baselines/run.py create mode 100644 baselines/trpo_mpi/defaults.py delete mode 100644 baselines/trpo_mpi/nosharing_cnn_policy.py delete mode 100644 baselines/trpo_mpi/run_atari.py delete mode 100644 baselines/trpo_mpi/run_mujoco.py create mode 100644 conftest.py diff --git a/.benchmark_pattern b/.benchmark_pattern new file mode 100644 index 0000000..e53df25 --- /dev/null +++ b/.benchmark_pattern @@ -0,0 +1 @@ +ppo2 diff --git a/.gitignore b/.gitignore index 722e942..a41103d 100644 --- a/.gitignore +++ b/.gitignore @@ -34,5 +34,3 @@ src .cache MUJOCO_LOG.TXT - - diff --git a/.travis.yml b/.travis.yml index 5ba3ead..e267785 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,5 +10,5 @@ install: - docker build . -t baselines-test script: - - flake8 --select=F baselines/common - - docker run baselines-test pytest + - flake8 --select=F,E999 baselines/common baselines/trpo_mpi baselines/ppo2 baselines/a2c baselines/deepq baselines/acer + - docker run baselines-test pytest --runslow diff --git a/Dockerfile b/Dockerfile index eeac22a..1d432f3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,20 +1,24 @@ FROM ubuntu:16.04 -RUN apt-get -y update && apt-get -y install git wget python-dev python3-dev libopenmpi-dev python-pip zlib1g-dev cmake +RUN apt-get -y update && apt-get -y install git wget python-dev python3-dev libopenmpi-dev python-pip zlib1g-dev cmake python-opencv ENV CODE_DIR /root/code ENV VENV /root/venv -COPY . $CODE_DIR/baselines RUN \ pip install virtualenv && \ virtualenv $VENV --python=python3 && \ . $VENV/bin/activate && \ - cd $CODE_DIR && \ - pip install --upgrade pip && \ - pip install -e baselines && \ - pip install pytest + pip install --upgrade pip ENV PATH=$VENV/bin:$PATH + +COPY . $CODE_DIR/baselines WORKDIR $CODE_DIR/baselines +# Clean up pycache and pyc files +RUN rm -rf __pycache__ && \ + find . 
-name "*.pyc" -delete && \ + pip install -e .[test] + + CMD /bin/bash diff --git a/README.md b/README.md index 197f01a..e8a4abb 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,60 @@ pip install pytest pytest ``` +## Subpackages + +## Testing the installation +All unit tests in baselines can be run using pytest runner: +``` +pip install pytest +pytest +``` + +## Training models +Most of the algorithms in baselines repo are used as follows: +```bash + python -m baselines.run --alg= --env= [additional arguments] +``` +### Example 1. PPO with MuJoCo Humanoid +For instance, to train a fully-connected network controlling MuJoCo humanoid using a2c for 20M timesteps +```bash + python -m baselines.run --alg=a2c --env=Humanoid-v2 --network=mlp --num_timesteps=2e7 +``` +Note that for mujoco environments fully-connected network is default, so we can omit `--network=mlp` +The hyperparameters for both network and the learning algorithm can be controlled via the command line, for instance: +```bash + python -m baselines.run --alg=a2c --env=Humanoid-v2 --network=mlp --num_timesteps=2e7 --ent_coef=0.1 --num_hidden=32 --num_layers=3 --value_network=copy +``` +will set entropy coeffient to 0.1, and construct fully connected network with 3 layers with 32 hidden units in each, and create a separate network for value function estimation (so that its parameters are not shared with the policy network, but the structure is the same) + +See docstrings in [common/models.py](common/models.py) for description of network parameters for each type of model, and +docstring for [baselines/ppo2/ppo2.py/learn()](ppo2/ppo2.py) fir the description of the ppo2 hyperparamters. + +### Example 2. DQN on Atari +DQN with Atari is at this point a classics of benchmarks. To run the baselines implementation of DQN on Atari Pong: +``` + python -m baselines.run --alg=deepq --env=PongNoFrameskip-v4 --num_timesteps=1e6 +``` + +## Saving, loading and visualizing models +The algorithms serialization API is not properly unified yet; however, there is a simple method to save / restore trained models. +`--save_path` and `--load_path` command-line option loads the tensorflow state from a given path before training, and saves it after the training, respectively. +Let's imagine you'd like to train ppo2 on Atari Pong, save the model and then later visualize what has it learnt. +```bash + python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num-timesteps=2e7 --save_path=~/models/pong_20M_ppo2 +``` +This should get to the mean reward per episode about 5k. To load and visualize the model, we'll do the following - load the model, train it for 0 steps, and then visualize: +```bash + python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num-timesteps=0 --load_path=~/models/pong_20M_ppo2 --play +``` + +*NOTE:* At the moment Mujoco training uses VecNormalize wrapper for the environment which is not being saved correctly; so loading the models trained on Mujoco will not work well if the environment is recreated. If necessary, you can work around that by replacing RunningMeanStd by TfRunningMeanStd in [baselines/common/vec_env/vec_normalize.py](baselines/common/vec_env/vec_normalize.py#L12). 
+ + + + + + ## Subpackages - [A2C](baselines/a2c) diff --git a/baselines/a2c/a2c.py b/baselines/a2c/a2c.py index f1de88a..4c3013d 100644 --- a/baselines/a2c/a2c.py +++ b/baselines/a2c/a2c.py @@ -1,42 +1,48 @@ -import os.path as osp import time -import joblib -import numpy as np import tensorflow as tf + from baselines import logger from baselines.common import set_global_seeds, explained_variance -from baselines.common.runners import AbstractEnvRunner from baselines.common import tf_util +from baselines.common.policies import build_policy -from baselines.a2c.utils import discount_with_dones -from baselines.a2c.utils import Scheduler, make_path, find_trainable_variables -from baselines.a2c.utils import cat_entropy, mse + +from baselines.a2c.utils import Scheduler, find_trainable_variables +from baselines.a2c.runner import Runner + +from tensorflow import losses class Model(object): - def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, + def __init__(self, policy, env, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'): - sess = tf_util.make_session() + sess = tf_util.get_session() + nenvs = env.num_envs nbatch = nenvs*nsteps - A = tf.placeholder(tf.int32, [nbatch]) + + with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE): + step_model = policy(nenvs, 1, sess) + train_model = policy(nbatch, nsteps, sess) + + A = tf.placeholder(train_model.action.dtype, train_model.action.shape) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) - step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) - train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True) + neglogpac = train_model.pd.neglogp(A) + entropy = tf.reduce_mean(train_model.pd.entropy()) - neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A) pg_loss = tf.reduce_mean(ADV * neglogpac) - vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) - entropy = tf.reduce_mean(cat_entropy(train_model.pi)) + vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R) + loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef - params = find_trainable_variables("model") + params = find_trainable_variables("a2c_model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) @@ -50,6 +56,7 @@ class Model(object): advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() + td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr} if states is not None: td_map[train_model.S] = states @@ -60,17 +67,6 @@ class Model(object): ) return policy_loss, value_loss, policy_entropy - def save(save_path): - ps = sess.run(params) - make_path(osp.dirname(save_path)) - joblib.dump(ps, save_path) - - def load(load_path): - loaded_params = joblib.load(load_path) - restores = [] - for p, loaded_p in zip(params, loaded_params): - restores.append(p.assign(loaded_p)) - sess.run(restores) self.train = train self.train_model = train_model @@ -78,66 +74,87 @@ class Model(object): self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state - self.save = save - self.load = load +
self.save = functools.partial(tf_util.save_variables, sess=sess) + self.load = functools.partial(tf_util.load_variables, sess=sess) tf.global_variables_initializer().run(session=sess) -class Runner(AbstractEnvRunner): - def __init__(self, env, model, nsteps=5, gamma=0.99): - super().__init__(env=env, model=model, nsteps=nsteps) - self.gamma = gamma +def learn( + network, + env, + seed=None, + nsteps=5, + total_timesteps=int(80e6), + vf_coef=0.5, + ent_coef=0.01, + max_grad_norm=0.5, + lr=7e-4, + lrschedule='linear', + epsilon=1e-5, + alpha=0.99, + gamma=0.99, + log_interval=100, + load_path=None, + **network_kwargs): + + ''' + Main entrypoint for the A2C algorithm. Train a policy with the given network architecture on a given environment using A2C. + + Parameters: + ----------- + + network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) + specifying the standard network architecture, or a function that takes a tensorflow tensor as input and returns + a tuple (output_tensor, extra_feed) where output_tensor is the last network layer output, extra_feed is None for feed-forward + neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. + See baselines.common/policies.py/lstm for more details on using recurrent nets in policies + + + env: RL environment. Should implement an interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py) + + + seed: seed to make the random number sequence in the algorithm reproducible. Defaults to None, which means the seed is taken from the system noise generator (not reproducible) + + nsteps: int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where + nenv is number of environment copies simulated in parallel) + + total_timesteps: int, total number of timesteps to train on (default: 80M) + + vf_coef: float, coefficient in front of the value function loss in the total loss function (default: 0.5) + + ent_coef: float, coefficient in front of the policy entropy in the total loss function (default: 0.01) + + max_grad_norm: float, gradient is clipped to have global L2 norm no more than this value (default: 0.5) + + lr: float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4) + + lrschedule: schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and + returns fraction of the learning rate (specified as lr) as output + + epsilon: float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5) + + alpha: float, RMSProp decay parameter (default: 0.99) + + gamma: float, reward discounting parameter (default: 0.99) + + log_interval: int, specifies how frequently the logs are printed out (default: 100) + + load_path: str, path to load the model from (default: None) + + **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network + For instance, 'mlp' network architecture has arguments num_hidden and num_layers.
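+    Example (illustrative sketch only; assumes a classic-control gym environment wrapped in
+    DummyVecEnv and a deliberately tiny timestep budget):
+
+        import gym
+        from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
+        from baselines.a2c.a2c import learn
+
+        # train a small fully-connected policy on CartPole
+        env = DummyVecEnv([lambda: gym.make('CartPole-v0')])
+        model = learn(network='mlp', env=env, total_timesteps=10000)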
+ + ''' + - def run(self): - mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[] - mb_states = self.states - for n in range(self.nsteps): - actions, values, states, _ = self.model.step(self.obs, self.states, self.dones) - mb_obs.append(np.copy(self.obs)) - mb_actions.append(actions) - mb_values.append(values) - mb_dones.append(self.dones) - obs, rewards, dones, _ = self.env.step(actions) - self.states = states - self.dones = dones - for n, done in enumerate(dones): - if done: - self.obs[n] = self.obs[n]*0 - self.obs = obs - mb_rewards.append(rewards) - mb_dones.append(self.dones) - #batch of steps to batch of rollouts - mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape) - mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0) - mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0) - mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0) - mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) - mb_masks = mb_dones[:, :-1] - mb_dones = mb_dones[:, 1:] - last_values = self.model.value(self.obs, self.states, self.dones).tolist() - #discount/bootstrap off value fn - for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)): - rewards = rewards.tolist() - dones = dones.tolist() - if dones[-1] == 0: - rewards = discount_with_dones(rewards+[value], dones+[0], self.gamma)[:-1] - else: - rewards = discount_with_dones(rewards, dones, self.gamma) - mb_rewards[n] = rewards - mb_rewards = mb_rewards.flatten() - mb_actions = mb_actions.flatten() - mb_values = mb_values.flatten() - mb_masks = mb_masks.flatten() - return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values -def learn(policy, env, seed, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100): set_global_seeds(seed) nenvs = env.num_envs - ob_space = env.observation_space - ac_space = env.action_space - model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, + policy = build_policy(env, network, **network_kwargs) + + model = Model(policy=policy, env=env, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule) + if load_path is not None: + model.load(load_path) runner = Runner(env, model, nsteps=nsteps, gamma=gamma) nbatch = nenvs*nsteps @@ -158,3 +175,4 @@ def learn(policy, env, seed, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5, e logger.dump_tabular() env.close() return model + diff --git a/baselines/a2c/policies.py b/baselines/a2c/policies.py deleted file mode 100644 index 6fbbb14..0000000 --- a/baselines/a2c/policies.py +++ /dev/null @@ -1,146 +0,0 @@ -import numpy as np -import tensorflow as tf -from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm -from baselines.common.distributions import make_pdtype -from baselines.common.input import observation_input - -def nature_cnn(unscaled_images, **conv_kwargs): - """ - CNN from Nature paper. - """ - scaled_images = tf.cast(unscaled_images, tf.float32) / 255. 
- activ = tf.nn.relu - h = activ(conv(scaled_images, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2), - **conv_kwargs)) - h2 = activ(conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), **conv_kwargs)) - h3 = activ(conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), **conv_kwargs)) - h3 = conv_to_fc(h3) - return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))) - -class LnLstmPolicy(object): - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False): - nenv = nbatch // nsteps - X, processed_x = observation_input(ob_space, nbatch) - M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) - S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states - self.pdtype = make_pdtype(ac_space) - with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(processed_x) - xs = batch_to_seq(h, nenv, nsteps) - ms = batch_to_seq(M, nenv, nsteps) - h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm) - h5 = seq_to_batch(h5) - vf = fc(h5, 'v', 1) - self.pd, self.pi = self.pdtype.pdfromlatent(h5) - - v0 = vf[:, 0] - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32) - - def step(ob, state, mask): - return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask}) - - def value(ob, state, mask): - return sess.run(v0, {X:ob, S:state, M:mask}) - - self.X = X - self.M = M - self.S = S - self.vf = vf - self.step = step - self.value = value - -class LstmPolicy(object): - - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False): - nenv = nbatch // nsteps - self.pdtype = make_pdtype(ac_space) - X, processed_x = observation_input(ob_space, nbatch) - - M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) - S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states - with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(X) - xs = batch_to_seq(h, nenv, nsteps) - ms = batch_to_seq(M, nenv, nsteps) - h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm) - h5 = seq_to_batch(h5) - vf = fc(h5, 'v', 1) - self.pd, self.pi = self.pdtype.pdfromlatent(h5) - - v0 = vf[:, 0] - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32) - - def step(ob, state, mask): - return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask}) - - def value(ob, state, mask): - return sess.run(v0, {X:ob, S:state, M:mask}) - - self.X = X - self.M = M - self.S = S - self.vf = vf - self.step = step - self.value = value - -class CnnPolicy(object): - - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613 - self.pdtype = make_pdtype(ac_space) - X, processed_x = observation_input(ob_space, nbatch) - with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(processed_x, **conv_kwargs) - vf = fc(h, 'v', 1)[:,0] - self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01) - - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = None - - def step(ob, *_args, **_kwargs): - a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob}) - return a, v, self.initial_state, neglogp - - def value(ob, *_args, **_kwargs): - return sess.run(vf, {X:ob}) - - self.X = X - self.vf = vf - self.step = step - self.value = value - -class MlpPolicy(object): - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613 - self.pdtype = make_pdtype(ac_space) - with tf.variable_scope("model", reuse=reuse): - X, processed_x = 
observation_input(ob_space, nbatch) - activ = tf.tanh - processed_x = tf.layers.flatten(processed_x) - pi_h1 = activ(fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2))) - pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2))) - vf_h1 = activ(fc(processed_x, 'vf_fc1', nh=64, init_scale=np.sqrt(2))) - vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2))) - vf = fc(vf_h2, 'vf', 1)[:,0] - - self.pd, self.pi = self.pdtype.pdfromlatent(pi_h2, init_scale=0.01) - - - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = None - - def step(ob, *_args, **_kwargs): - a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob}) - return a, v, self.initial_state, neglogp - - def value(ob, *_args, **_kwargs): - return sess.run(vf, {X:ob}) - - self.X = X - self.vf = vf - self.step = step - self.value = value diff --git a/baselines/a2c/run_atari.py b/baselines/a2c/run_atari.py deleted file mode 100644 index b09d9bb..0000000 --- a/baselines/a2c/run_atari.py +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env python3 - -from baselines import logger -from baselines.common.cmd_util import make_atari_env, atari_arg_parser -from baselines.common.vec_env.vec_frame_stack import VecFrameStack -from baselines.a2c.a2c import learn -from baselines.ppo2.policies import CnnPolicy, LstmPolicy, LnLstmPolicy - -def train(env_id, num_timesteps, seed, policy, lrschedule, num_env): - if policy == 'cnn': - policy_fn = CnnPolicy - elif policy == 'lstm': - policy_fn = LstmPolicy - elif policy == 'lnlstm': - policy_fn = LnLstmPolicy - env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4) - learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule) - env.close() - -def main(): - parser = atari_arg_parser() - parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm'], default='cnn') - parser.add_argument('--lrschedule', help='Learning rate schedule', choices=['constant', 'linear'], default='constant') - args = parser.parse_args() - logger.configure() - train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, - policy=args.policy, lrschedule=args.lrschedule, num_env=16) - -if __name__ == '__main__': - main() diff --git a/baselines/a2c/runner.py b/baselines/a2c/runner.py new file mode 100644 index 0000000..60b5e1d --- /dev/null +++ b/baselines/a2c/runner.py @@ -0,0 +1,60 @@ +import numpy as np +from baselines.a2c.utils import discount_with_dones +from baselines.common.runners import AbstractEnvRunner + +class Runner(AbstractEnvRunner): + + def __init__(self, env, model, nsteps=5, gamma=0.99): + super().__init__(env=env, model=model, nsteps=nsteps) + self.gamma = gamma + self.batch_action_shape = [x if x is not None else -1 for x in model.train_model.action.shape.as_list()] + self.ob_dtype = model.train_model.X.dtype.as_numpy_dtype + + def run(self): + mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[] + mb_states = self.states + for n in range(self.nsteps): + actions, values, states, _ = self.model.step(self.obs, S=self.states, M=self.dones) + mb_obs.append(np.copy(self.obs)) + mb_actions.append(actions) + mb_values.append(values) + mb_dones.append(self.dones) + obs, rewards, dones, _ = self.env.step(actions) + self.states = states + self.dones = dones + for n, done in enumerate(dones): + if done: + self.obs[n] = self.obs[n]*0 + self.obs = obs + mb_rewards.append(rewards) + mb_dones.append(self.dones) + #batch of steps to batch of rollouts + + mb_obs = np.asarray(mb_obs, 
dtype=self.ob_dtype).swapaxes(1, 0).reshape(self.batch_ob_shape) + mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0) + mb_actions = np.asarray(mb_actions, dtype=self.model.train_model.action.dtype.name).swapaxes(1, 0) + mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0) + mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) + mb_masks = mb_dones[:, :-1] + mb_dones = mb_dones[:, 1:] + + + if self.gamma > 0.0: + #discount/bootstrap off value fn + last_values = self.model.value(self.obs, S=self.states, M=self.dones).tolist() + for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)): + rewards = rewards.tolist() + dones = dones.tolist() + if dones[-1] == 0: + rewards = discount_with_dones(rewards+[value], dones+[0], self.gamma)[:-1] + else: + rewards = discount_with_dones(rewards, dones, self.gamma) + + mb_rewards[n] = rewards + + mb_actions = mb_actions.reshape(self.batch_action_shape) + + mb_rewards = mb_rewards.flatten() + mb_values = mb_values.flatten() + mb_masks = mb_masks.flatten() + return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values diff --git a/baselines/a2c/utils.py b/baselines/a2c/utils.py index a7610eb..f38085b 100644 --- a/baselines/a2c/utils.py +++ b/baselines/a2c/utils.py @@ -1,8 +1,6 @@ import os -import gym import numpy as np import tensorflow as tf -from gym import spaces from collections import deque def sample(logits): @@ -10,18 +8,15 @@ def sample(logits): return tf.argmax(logits - tf.log(-tf.log(noise)), 1) def cat_entropy(logits): - a0 = logits - tf.reduce_max(logits, 1, keep_dims=True) + a0 = logits - tf.reduce_max(logits, 1, keepdims=True) ea0 = tf.exp(a0) - z0 = tf.reduce_sum(ea0, 1, keep_dims=True) + z0 = tf.reduce_sum(ea0, 1, keepdims=True) p0 = ea0 / z0 return tf.reduce_sum(p0 * (tf.log(z0) - a0), 1) def cat_entropy_softmax(p0): return - tf.reduce_sum(p0 * tf.log(p0 + 1e-6), axis = 1) -def mse(pred, target): - return tf.square(pred-target)/2. 
- def ortho_init(scale=1.0): def _ortho_init(shape, dtype, partition_info=None): #lasagne ortho init for tf @@ -58,7 +53,7 @@ def conv(x, scope, *, nf, rf, stride, pad='VALID', init_scale=1.0, data_format=' b = tf.get_variable("b", bias_var_shape, initializer=tf.constant_initializer(0.0)) if not one_dim_bias and data_format == 'NHWC': b = tf.reshape(b, bshape) - return b + tf.nn.conv2d(x, w, strides=strides, padding=pad, data_format=data_format) + return tf.nn.conv2d(x, w, strides=strides, padding=pad, data_format=data_format) + b def fc(x, scope, nh, *, init_scale=1.0, init_bias=0.0): with tf.variable_scope(scope): @@ -85,7 +80,6 @@ def seq_to_batch(h, flat = False): def lstm(xs, ms, s, scope, nh, init_scale=1.0): nbatch, nin = [v.value for v in xs[0].get_shape()] - nsteps = len(xs) with tf.variable_scope(scope): wx = tf.get_variable("wx", [nin, nh*4], initializer=ortho_init(init_scale)) wh = tf.get_variable("wh", [nh, nh*4], initializer=ortho_init(init_scale)) @@ -115,7 +109,6 @@ def _ln(x, g, b, e=1e-5, axes=[1]): def lnlstm(xs, ms, s, scope, nh, init_scale=1.0): nbatch, nin = [v.value for v in xs[0].get_shape()] - nsteps = len(xs) with tf.variable_scope(scope): wx = tf.get_variable("wx", [nin, nh*4], initializer=ortho_init(init_scale)) gx = tf.get_variable("gx", [nh*4], initializer=tf.constant_initializer(1.0)) @@ -160,8 +153,7 @@ def discount_with_dones(rewards, dones, gamma): return discounted[::-1] def find_trainable_variables(key): - with tf.variable_scope(key): - return tf.trainable_variables() + return tf.trainable_variables(key) def make_path(f): return os.makedirs(f, exist_ok=True) diff --git a/baselines/acer/acer_simple.py b/baselines/acer/acer.py similarity index 64% rename from baselines/acer/acer_simple.py rename to baselines/acer/acer.py index bed486a..1bb8129 100644 --- a/baselines/acer/acer_simple.py +++ b/baselines/acer/acer.py @@ -1,20 +1,20 @@ import time -import joblib +import functools import numpy as np import tensorflow as tf from baselines import logger from baselines.common import set_global_seeds -from baselines.common.runners import AbstractEnvRunner +from baselines.common.policies import build_policy +from baselines.common.tf_util import get_session, save_variables from baselines.a2c.utils import batch_to_seq, seq_to_batch -from baselines.a2c.utils import Scheduler, make_path, find_trainable_variables from baselines.a2c.utils import cat_entropy_softmax +from baselines.a2c.utils import Scheduler, find_trainable_variables from baselines.a2c.utils import EpisodeStats from baselines.a2c.utils import get_by_index, check_shape, avg_norm, gradient_add, q_explained_variance from baselines.acer.buffer import Buffer - -import os.path as osp +from baselines.acer.runner import Runner # remove last step def strip(var, nenvs, nsteps, flat = False): @@ -59,10 +59,8 @@ class Model(object): ent_coef, q_coef, gamma, max_grad_norm, lr, rprop_alpha, rprop_epsilon, total_timesteps, lrschedule, c, trust_region, alpha, delta): - config = tf.ConfigProto(allow_soft_placement=True, - intra_op_parallelism_threads=num_procs, - inter_op_parallelism_threads=num_procs) - sess = tf.Session(config=config) + + sess = get_session() nact = ac_space.n nbatch = nenvs * nsteps @@ -72,11 +70,16 @@ class Model(object): MU = tf.placeholder(tf.float32, [nbatch, nact]) # mu's LR = tf.placeholder(tf.float32, []) eps = 1e-6 + + step_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs,) + ob_space.shape[:-1] + (ob_space.shape[-1] * nstack,)) + train_ob_placeholder = 
tf.placeholder(dtype=ob_space.dtype, shape=(nenvs*(nsteps+1),) + ob_space.shape[:-1] + (ob_space.shape[-1] * nstack,)) + with tf.variable_scope('acer_model', reuse=tf.AUTO_REUSE): - step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) - train_model = policy(sess, ob_space, ac_space, nenvs, nsteps + 1, nstack, reuse=True) + step_model = policy(observ_placeholder=step_ob_placeholder, sess=sess) + train_model = policy(observ_placeholder=train_ob_placeholder, sess=sess) - params = find_trainable_variables("model") + + params = find_trainable_variables("acer_model") print("Params {}".format(len(params))) for var in params: print(var) @@ -90,14 +93,20 @@ class Model(object): print(v.name) return v - with tf.variable_scope("", custom_getter=custom_getter, reuse=True): - polyak_model = policy(sess, ob_space, ac_space, nenvs, nsteps + 1, nstack, reuse=True) + with tf.variable_scope("acer_model", custom_getter=custom_getter, reuse=True): + polyak_model = policy(observ_placeholder=train_ob_placeholder, sess=sess) # Notation: (var) = batch variable, (var)s = sequence variable, (var)_i = variable indexed by action at step i - v = tf.reduce_sum(train_model.pi * train_model.q, axis = -1) # shape is [nenvs * (nsteps + 1)] + + # action probability distributions according to train_model, polyak_model and step_model + # policy.pi is probability distribution parameters; to obtain a distribution that sums to 1, take the softmax + train_model_p = tf.nn.softmax(train_model.pi) + polyak_model_p = tf.nn.softmax(polyak_model.pi) + step_model_p = tf.nn.softmax(step_model.pi) + v = tf.reduce_sum(train_model_p * train_model.q, axis = -1) # shape is [nenvs * (nsteps + 1)] # strip off last step - f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [train_model.pi, polyak_model.pi, train_model.q]) + f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [train_model_p, polyak_model_p, train_model.q]) # Get pi and q values for actions taken f_i = get_by_index(f, A) q_i = get_by_index(q, A) @@ -110,7 +119,8 @@ class Model(object): qret = q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma) # Calculate losses - # Entropy + # Entropy + # entropy = tf.reduce_mean(strip(train_model.pd.entropy(), nenvs, nsteps)) entropy = tf.reduce_mean(cat_entropy_softmax(f)) # Policy gradient loss, with truncated importance sampling & bias correction @@ -192,80 +202,29 @@ class Model(object): def train(obs, actions, rewards, dones, mus, states, masks, steps): cur_lr = lr.value_steps(steps) td_map = {train_model.X: obs, polyak_model.X: obs, A: actions, R: rewards, D: dones, MU: mus, LR: cur_lr} - if states != []: + if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks td_map[polyak_model.S] = states td_map[polyak_model.M] = masks + return names_ops, sess.run(run_ops, td_map)[1:] # strip off _train - def save(save_path): - ps = sess.run(params) - make_path(osp.dirname(save_path)) - joblib.dump(ps, save_path) + def _step(observation, **kwargs): + return step_model._evaluate([step_model.action, step_model_p, step_model.state], observation, **kwargs) + + self.train = train - self.save = save + self.save = functools.partial(save_variables, sess=sess, variables=params) self.train_model = train_model self.step_model = step_model - self.step = step_model.step + self._step = _step + self.step = self.step_model.step + self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=sess) -class Runner(AbstractEnvRunner): - def __init__(self, env, model, nsteps,
nstack): - super().__init__(env=env, model=model, nsteps=nsteps) - self.nstack = nstack - nh, nw, nc = env.observation_space.shape - self.nc = nc # nc = 1 for atari, but just in case - self.nenv = nenv = env.num_envs - self.nact = env.action_space.n - self.nbatch = nenv * nsteps - self.batch_ob_shape = (nenv*(nsteps+1), nh, nw, nc*nstack) - self.obs = np.zeros((nenv, nh, nw, nc * nstack), dtype=np.uint8) - obs = env.reset() - self.update_obs(obs) - - def update_obs(self, obs, dones=None): - if dones is not None: - self.obs *= (1 - dones.astype(np.uint8))[:, None, None, None] - self.obs = np.roll(self.obs, shift=-self.nc, axis=3) - self.obs[:, :, :, -self.nc:] = obs[:, :, :, :] - - def run(self): - enc_obs = np.split(self.obs, self.nstack, axis=3) # so now list of obs steps - mb_obs, mb_actions, mb_mus, mb_dones, mb_rewards = [], [], [], [], [] - for _ in range(self.nsteps): - actions, mus, states = self.model.step(self.obs, state=self.states, mask=self.dones) - mb_obs.append(np.copy(self.obs)) - mb_actions.append(actions) - mb_mus.append(mus) - mb_dones.append(self.dones) - obs, rewards, dones, _ = self.env.step(actions) - # states information for statefull models like LSTM - self.states = states - self.dones = dones - self.update_obs(obs, dones) - mb_rewards.append(rewards) - enc_obs.append(obs) - mb_obs.append(np.copy(self.obs)) - mb_dones.append(self.dones) - - enc_obs = np.asarray(enc_obs, dtype=np.uint8).swapaxes(1, 0) - mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0) - mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0) - mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0) - mb_mus = np.asarray(mb_mus, dtype=np.float32).swapaxes(1, 0) - - mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) - - mb_masks = mb_dones # Used for statefull models like LSTM's to mask state when done - mb_dones = mb_dones[:, 1:] # Used for calculating returns. The dones array is now aligned with rewards - - # shapes are now [nenv, nsteps, []] - # When pulling from buffer, arrays will now be reshaped in place, preventing a deep copy. - - return enc_obs, mb_obs, mb_actions, mb_rewards, mb_mus, mb_dones, mb_masks class Acer(): def __init__(self, runner, model, buffer, log_interval): @@ -311,19 +270,84 @@ class Acer(): logger.dump_tabular() -def learn(policy, env, seed, nsteps=20, nstack=4, total_timesteps=int(80e6), q_coef=0.5, ent_coef=0.01, +def learn(network, env, seed=None, nsteps=20, nstack=4, total_timesteps=int(80e6), q_coef=0.5, ent_coef=0.01, max_grad_norm=10, lr=7e-4, lrschedule='linear', rprop_epsilon=1e-5, rprop_alpha=0.99, gamma=0.99, log_interval=100, buffer_size=50000, replay_ratio=4, replay_start=10000, c=10.0, - trust_region=True, alpha=0.99, delta=1): + trust_region=True, alpha=0.99, delta=1, load_path=None, **network_kwargs): + + ''' + Main entrypoint for ACER (Actor-Critic with Experience Replay) algorithm (https://arxiv.org/pdf/1611.01224.pdf) + Train an agent with given network architecture on a given environment using ACER. + + Parameters: + ---------- + + network: policy network architecture. 
Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) + specifying the standard network architecture, or a function that takes a tensorflow tensor as input and returns + a tuple (output_tensor, extra_feed) where output_tensor is the last network layer output, extra_feed is None for feed-forward + neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. + See baselines.common/policies.py/lstm for more details on using recurrent nets in policies + + env: environment. Needs to be vectorized for parallel environment simulation. + The environments produced by gym.make can be wrapped using the baselines.common.vec_env.DummyVecEnv class. + + nsteps: int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where + nenv is number of environment copies simulated in parallel) (default: 20) + + nstack: int, size of the frame stack, i.e. number of the frames passed to the step model. Frames are stacked along the channel dimension + (last image dimension) (default: 4) + + total_timesteps: int, number of timesteps (i.e. number of actions taken in the environment) (default: 80M) + + q_coef: float, value function loss coefficient in the optimization objective (analog of vf_coef for other actor-critic methods) (default: 0.5) + + ent_coef: float, policy entropy coefficient in the optimization objective (default: 0.01) + + max_grad_norm: float, gradient norm clipping coefficient. If set to None, no clipping (default: 10) + + lr: float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4) + + lrschedule: schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and + returns fraction of the learning rate (specified as lr) as output + + rprop_epsilon: float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5) + + rprop_alpha: float, RMSProp decay parameter (default: 0.99) + + gamma: float, reward discounting factor (default: 0.99) + + log_interval: int, number of updates between logging events (default: 100) + + buffer_size: int, size of the replay buffer (default: 50k) + + replay_ratio: int, how many (on average) batches of data to sample from the replay buffer per batch taken from the environment (default: 4) + + replay_start: int, sampling from the replay buffer does not start until the replay buffer has at least that many samples (default: 10k) + + c: float, importance weight clipping factor (default: 10) + + trust_region: bool, whether or not the algorithm estimates the KL divergence between the old and updated policy and uses it to determine the step size (default: True) + + delta: float, max KL divergence between the old policy and updated policy (default: 1) + + alpha: float, momentum factor in the Polyak (exponential moving average) averaging of the model parameters (default: 0.99) + + load_path: str, path to load the model from (default: None) + + **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network + For instance, 'mlp' network architecture has arguments num_hidden and num_layers.
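+    Example (illustrative sketch only; assumes Atari-style image observations from
+    make_atari_env and a short run - ACER requires a discrete action space):
+
+        from baselines.common.cmd_util import make_atari_env
+        from baselines.acer.acer import learn
+
+        # 4 parallel copies of Pong; ACER stacks nstack frames internally
+        env = make_atari_env('PongNoFrameskip-v4', num_env=4, seed=0)
+        model = learn(network='cnn', env=env, total_timesteps=100000)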
+ + ''' + print("Running Acer Simple") print(locals()) - tf.reset_default_graph() set_global_seeds(seed) + policy = build_policy(env, network, estimate_q=True, **network_kwargs) nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space - num_procs = len(env.remotes) # HACK + num_procs = len(env.remotes) if hasattr(env, 'remotes') else 1# HACK model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, nstack=nstack, num_procs=num_procs, ent_coef=ent_coef, q_coef=q_coef, gamma=gamma, max_grad_norm=max_grad_norm, lr=lr, rprop_alpha=rprop_alpha, rprop_epsilon=rprop_epsilon, @@ -338,6 +362,7 @@ def learn(policy, env, seed, nsteps=20, nstack=4, total_timesteps=int(80e6), q_c nbatch = nenvs*nsteps acer = Acer(runner, model, buffer, log_interval) acer.tstart = time.time() + for acer.steps in range(0, total_timesteps, nbatch): #nbatch samples, 1 on_policy call and multiple off-policy calls acer.call(on_policy=True) if replay_ratio > 0 and buffer.has_atleast(replay_start): @@ -346,3 +371,4 @@ def learn(policy, env, seed, nsteps=20, nstack=4, total_timesteps=int(80e6), q_c acer.call(on_policy=False) # no simulation steps in this env.close() + return model diff --git a/baselines/acer/defaults.py b/baselines/acer/defaults.py new file mode 100644 index 0000000..0334bae --- /dev/null +++ b/baselines/acer/defaults.py @@ -0,0 +1,4 @@ +def atari(): + return dict( + lrschedule='constant' + ) diff --git a/baselines/acer/policies.py b/baselines/acer/policies.py index 627c400..6dad6f3 100644 --- a/baselines/acer/policies.py +++ b/baselines/acer/policies.py @@ -1,6 +1,6 @@ import numpy as np import tensorflow as tf -from baselines.ppo2.policies import nature_cnn +from baselines.common.policies import nature_cnn from baselines.a2c.utils import fc, batch_to_seq, seq_to_batch, lstm, sample @@ -18,11 +18,13 @@ class AcerCnnPolicy(object): pi = tf.nn.softmax(pi_logits) q = fc(h, 'q', nact) - a = sample(pi_logits) # could change this to use self.pi instead + a = sample(tf.nn.softmax(pi_logits)) # could change this to use self.pi instead self.initial_state = [] # not stateful self.X = X self.pi = pi # actual policy params now + self.pi_logits = pi_logits self.q = q + self.vf = q def step(ob, *args, **kwargs): # returns actions, mus, states diff --git a/baselines/acer/run_atari.py b/baselines/acer/run_atari.py deleted file mode 100644 index cce979e..0000000 --- a/baselines/acer/run_atari.py +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env python3 -from baselines import logger -from baselines.acer.acer_simple import learn -from baselines.acer.policies import AcerCnnPolicy, AcerLstmPolicy -from baselines.common.cmd_util import make_atari_env, atari_arg_parser - -def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu): - env = make_atari_env(env_id, num_cpu, seed) - if policy == 'cnn': - policy_fn = AcerCnnPolicy - elif policy == 'lstm': - policy_fn = AcerLstmPolicy - else: - print("Policy {} not implemented".format(policy)) - return - learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule) - env.close() - -def main(): - parser = atari_arg_parser() - parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm'], default='cnn') - parser.add_argument('--lrschedule', help='Learning rate schedule', choices=['constant', 'linear'], default='constant') - parser.add_argument('--logdir', help ='Directory for logging') - args = parser.parse_args() - logger.configure(args.logdir) - train(args.env, 
num_timesteps=args.num_timesteps, seed=args.seed, - policy=args.policy, lrschedule=args.lrschedule, num_cpu=16) - -if __name__ == '__main__': - main() diff --git a/baselines/acer/runner.py b/baselines/acer/runner.py new file mode 100644 index 0000000..6bc1b4c --- /dev/null +++ b/baselines/acer/runner.py @@ -0,0 +1,60 @@ +import numpy as np +from baselines.common.runners import AbstractEnvRunner + +class Runner(AbstractEnvRunner): + + def __init__(self, env, model, nsteps, nstack): + super().__init__(env=env, model=model, nsteps=nsteps) + self.nstack = nstack + nh, nw, nc = env.observation_space.shape + self.nc = nc # nc = 1 for atari, but just in case + self.nact = env.action_space.n + nenv = self.nenv + self.nbatch = nenv * nsteps + self.batch_ob_shape = (nenv*(nsteps+1), nh, nw, nc*nstack) + self.obs = np.zeros((nenv, nh, nw, nc * nstack), dtype=np.uint8) + obs = env.reset() + self.update_obs(obs) + + def update_obs(self, obs, dones=None): + #self.obs = obs + if dones is not None: + self.obs *= (1 - dones.astype(np.uint8))[:, None, None, None] + self.obs = np.roll(self.obs, shift=-self.nc, axis=3) + self.obs[:, :, :, -self.nc:] = obs[:, :, :, :] + + def run(self): + enc_obs = np.split(self.obs, self.nstack, axis=3) # so now list of obs steps + mb_obs, mb_actions, mb_mus, mb_dones, mb_rewards = [], [], [], [], [] + for _ in range(self.nsteps): + actions, mus, states = self.model._step(self.obs, S=self.states, M=self.dones) + mb_obs.append(np.copy(self.obs)) + mb_actions.append(actions) + mb_mus.append(mus) + mb_dones.append(self.dones) + obs, rewards, dones, _ = self.env.step(actions) + # state information for stateful models like LSTM + self.states = states + self.dones = dones + self.update_obs(obs, dones) + mb_rewards.append(rewards) + enc_obs.append(obs) + mb_obs.append(np.copy(self.obs)) + mb_dones.append(self.dones) + + enc_obs = np.asarray(enc_obs, dtype=np.uint8).swapaxes(1, 0) + mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0) + mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0) + mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0) + mb_mus = np.asarray(mb_mus, dtype=np.float32).swapaxes(1, 0) + + mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) + + mb_masks = mb_dones # Used for stateful models like LSTMs to mask state when done + mb_dones = mb_dones[:, 1:] # Used for calculating returns. The dones array is now aligned with rewards + + # shapes are now [nenv, nsteps, []] + # When pulling from the buffer, arrays will now be reshaped in place, preventing a deep copy.
+ + return enc_obs, mb_obs, mb_actions, mb_rewards, mb_mus, mb_dones, mb_masks + diff --git a/baselines/acktr/acktr.py b/baselines/acktr/acktr.py new file mode 100644 index 0000000..97090b4 --- /dev/null +++ b/baselines/acktr/acktr.py @@ -0,0 +1 @@ +from baselines.acktr.acktr_disc import * diff --git a/baselines/acktr/acktr_disc.py b/baselines/acktr/acktr_disc.py index a8b77b6..cfa028d 100644 --- a/baselines/acktr/acktr_disc.py +++ b/baselines/acktr/acktr_disc.py @@ -1,16 +1,17 @@ import os.path as osp import time -import joblib +import functools import numpy as np import tensorflow as tf from baselines import logger from baselines.common import set_global_seeds, explained_variance +from baselines.common.policies import build_policy +from baselines.common.tf_util import get_session, save_variables, load_variables -from baselines.a2c.a2c import Runner +from baselines.a2c.runner import Runner from baselines.a2c.utils import discount_with_dones from baselines.a2c.utils import Scheduler, find_trainable_variables -from baselines.a2c.utils import cat_entropy, mse from baselines.acktr import kfac @@ -19,11 +20,8 @@ class Model(object): def __init__(self, policy, ob_space, ac_space, nenvs,total_timesteps, nprocs=32, nsteps=20, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, kfac_clip=0.001, lrschedule='linear'): - config = tf.ConfigProto(allow_soft_placement=True, - intra_op_parallelism_threads=nprocs, - inter_op_parallelism_threads=nprocs) - config.gpu_options.allow_growth = True - self.sess = sess = tf.Session(config=config) + + self.sess = sess = get_session() nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) @@ -32,27 +30,28 @@ class Model(object): PG_LR = tf.placeholder(tf.float32, []) VF_LR = tf.placeholder(tf.float32, []) - self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) - self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True) + with tf.variable_scope('acktr_model', reuse=tf.AUTO_REUSE): + self.model = step_model = policy(nenvs, 1, sess=sess) + self.model2 = train_model = policy(nenvs*nsteps, nsteps, sess=sess) - logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A) + neglogpac = train_model.pd.neglogp(A) self.logits = logits = train_model.pi ##training loss - pg_loss = tf.reduce_mean(ADV*logpac) - entropy = tf.reduce_mean(cat_entropy(train_model.pi)) + pg_loss = tf.reduce_mean(ADV*neglogpac) + entropy = tf.reduce_mean(train_model.pd.entropy()) pg_loss = pg_loss - ent_coef * entropy - vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) + vf_loss = tf.losses.mean_squared_error(tf.squeeze(train_model.vf), R) train_loss = pg_loss + vf_coef * vf_loss ##Fisher loss construction - self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac) + self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(neglogpac) sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf)) self.vf_fisher = vf_fisher_loss = - vf_fisher_coef*tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2)) self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss - self.params=params = find_trainable_variables("model") + self.params=params = find_trainable_variables("acktr_model") self.grads_check = grads = tf.gradients(train_loss,params) @@ -82,22 +81,10 @@ class Model(object): ) return policy_loss, value_loss, policy_entropy - def save(save_path): - ps = sess.run(params) - joblib.dump(ps, save_path) - - def 
load(load_path): - loaded_params = joblib.load(load_path) - restores = [] - for p, loaded_p in zip(params, loaded_params): - restores.append(p.assign(loaded_p)) - sess.run(restores) - - self.train = train - self.save = save - self.load = load + self.save = functools.partial(save_variables, sess=sess) + self.load = functools.partial(load_variables, sess=sess) self.train_model = train_model self.step_model = step_model self.step = step_model.step @@ -105,12 +92,17 @@ class Model(object): self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=sess) -def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20, +def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, - kfac_clip=0.001, save_interval=None, lrschedule='linear'): - tf.reset_default_graph() + kfac_clip=0.001, save_interval=None, lrschedule='linear', load_path=None, **network_kwargs): set_global_seeds(seed) + + if network == 'cnn': + network_kwargs['one_dim_bias'] = True + + policy = build_policy(env, network, **network_kwargs) + nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space @@ -123,6 +115,9 @@ def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh: fh.write(cloudpickle.dumps(make_model)) model = make_model() + + if load_path is not None: + model.load(load_path) runner = Runner(env, model, nsteps=nsteps, gamma=gamma) nbatch = nenvs*nsteps @@ -153,3 +148,4 @@ def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval coord.request_stop() coord.join(enqueue_threads) env.close() + return model diff --git a/baselines/acktr/run_atari.py b/baselines/acktr/run_atari.py index 6e398ce..50e1580 100644 --- a/baselines/acktr/run_atari.py +++ b/baselines/acktr/run_atari.py @@ -6,11 +6,11 @@ from baselines import logger from baselines.acktr.acktr_disc import learn from baselines.common.cmd_util import make_atari_env, atari_arg_parser from baselines.common.vec_env.vec_frame_stack import VecFrameStack -from baselines.ppo2.policies import CnnPolicy +from baselines.common.policies import cnn def train(env_id, num_timesteps, seed, num_cpu): env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4) - policy_fn = partial(CnnPolicy, one_dim_bias=True) + policy_fn = cnn(env=env, one_dim_bias=True) learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu) env.close() diff --git a/baselines/bench/benchmarks.py b/baselines/bench/benchmarks.py index a5a35f8..e9328b2 100644 --- a/baselines/bench/benchmarks.py +++ b/baselines/bench/benchmarks.py @@ -59,7 +59,7 @@ register_benchmark({ register_benchmark({ 'name': 'Atari10M', 'description': '7 Atari games from Mnih et al. 
(2013), with pixel observations, 10M timesteps', - 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atari7] + 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 6, 'num_timesteps': int(10e6)} for _game in _atari7] }) register_benchmark({ @@ -84,8 +84,9 @@ _mujocosmall = [ register_benchmark({ 'name': 'Mujoco1M', 'description': 'Some small 2D MuJoCo tasks, run for 1M timesteps', - 'tasks': [{'env_id': _envid, 'trials': 3, 'num_timesteps': int(1e6)} for _envid in _mujocosmall] + 'tasks': [{'env_id': _envid, 'trials': 6, 'num_timesteps': int(1e6)} for _envid in _mujocosmall] }) + register_benchmark({ 'name': 'MujocoWalkers', 'description': 'MuJoCo forward walkers, run for 8M, humanoid 100M', diff --git a/baselines/bench/monitor.py b/baselines/bench/monitor.py index 0da1b4f..bb0c282 100644 --- a/baselines/bench/monitor.py +++ b/baselines/bench/monitor.py @@ -112,6 +112,8 @@ def load_results(dir): with open(fname, 'rt') as fh: if fname.endswith('csv'): firstline = fh.readline() + if not firstline: + continue assert firstline[0] == '#' header = json.loads(firstline[1:]) df = pandas.read_csv(fh, index_col=None) @@ -158,4 +160,4 @@ def test_monitor(): last_logline = pandas.read_csv(f, index_col=None) assert set(last_logline.keys()) == {'l', 't', 'r'}, "Incorrect keys in monitor logline" f.close() - os.remove(mon_file) \ No newline at end of file + os.remove(mon_file) diff --git a/baselines/common/atari_wrappers.py b/baselines/common/atari_wrappers.py index 2aefad7..4598e23 100644 --- a/baselines/common/atari_wrappers.py +++ b/baselines/common/atari_wrappers.py @@ -1,4 +1,6 @@ import numpy as np +import os +os.environ.setdefault('PATH', '') from collections import deque import gym from gym import spaces diff --git a/baselines/common/cmd_util.py b/baselines/common/cmd_util.py index 5707695..681a80c 100644 --- a/baselines/common/cmd_util.py +++ b/baselines/common/cmd_util.py @@ -3,7 +3,11 @@ Helpers for scripts like run_atari.py. """ import os -from mpi4py import MPI +try: + from mpi4py import MPI +except ImportError: + MPI = None + import gym from gym.wrappers import FlattenDictWrapper from baselines import logger @@ -17,25 +21,32 @@ def make_atari_env(env_id, num_env, seed, wrapper_kwargs=None, start_index=0): Create a wrapped, monitored SubprocVecEnv for Atari. """ if wrapper_kwargs is None: wrapper_kwargs = {} + mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0 def make_env(rank): # pylint: disable=C0111 def _thunk(): env = make_atari(env_id) - env.seed(seed + rank) - env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) + env.seed(seed + 10000*mpi_rank + rank if seed is not None else None) + env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(mpi_rank) + '.' + str(rank))) return wrap_deepmind(env, **wrapper_kwargs) return _thunk set_global_seeds(seed) return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)]) -def make_mujoco_env(env_id, seed): +def make_mujoco_env(env_id, seed, reward_scale=1.0): """ Create a wrapped, monitored gym.Env for MuJoCo. 
""" rank = MPI.COMM_WORLD.Get_rank() - set_global_seeds(seed + 10000 * rank) + myseed = seed + 1000 * rank if seed is not None else None + set_global_seeds(myseed) env = gym.make(env_id) - env = Monitor(env, os.path.join(logger.get_dir(), str(rank))) + env = Monitor(env, os.path.join(logger.get_dir(), str(rank)), allow_early_resets=True) env.seed(seed) + + if reward_scale != 1.0: + from baselines.common.retro_wrappers import RewardScaler + env = RewardScaler(env, reward_scale) + return env def make_robotics_env(env_id, seed, rank=0): @@ -62,20 +73,27 @@ def atari_arg_parser(): """ Create an argparse.ArgumentParser for run_atari.py. """ - parser = arg_parser() - parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4') - parser.add_argument('--seed', help='RNG seed', type=int, default=0) - parser.add_argument('--num-timesteps', type=int, default=int(10e6)) - return parser + print('Obsolete - use common_arg_parser instead') + return common_arg_parser() def mujoco_arg_parser(): + print('Obsolete - use common_arg_parser instead') + return common_arg_parser() + +def common_arg_parser(): """ Create an argparse.ArgumentParser for run_mujoco.py. """ parser = arg_parser() parser.add_argument('--env', help='environment ID', type=str, default='Reacher-v2') - parser.add_argument('--seed', help='RNG seed', type=int, default=0) - parser.add_argument('--num-timesteps', type=int, default=int(1e6)) + parser.add_argument('--seed', help='RNG seed', type=int, default=None) + parser.add_argument('--alg', help='Algorithm', type=str, default='ppo2') + parser.add_argument('--num_timesteps', type=float, default=1e6), + parser.add_argument('--network', help='network type (mlp, cnn, lstm, cnn_lstm, conv_only)', default=None) + parser.add_argument('--gamestate', help='game state to load (so far only used in retro games)', default=None) + parser.add_argument('--num_env', help='Number of environment copies being run in parallel. When not specified, set to number of cpus for Atari, and to 1 for Mujoco', default=None, type=int) + parser.add_argument('--reward_scale', help='Reward scale factor. 
Default: 1.0', default=1.0, type=float) + parser.add_argument('--save_path', help='Path to save trained model to', default=None, type=str) parser.add_argument('--play', default=False, action='store_true') return parser @@ -85,6 +103,24 @@ def robotics_arg_parser(): """ parser = arg_parser() parser.add_argument('--env', help='environment ID', type=str, default='FetchReach-v0') - parser.add_argument('--seed', help='RNG seed', type=int, default=0) + parser.add_argument('--seed', help='RNG seed', type=int, default=None) parser.add_argument('--num-timesteps', type=int, default=int(1e6)) return parser + + +def parse_unknown_args(args): + """ + Parse arguments not consumed by arg parser into a dicitonary + """ + retval = {} + for arg in args: + assert arg.startswith('--') + assert '=' in arg, 'cannot parse arg {}'.format(arg) + key = arg.split('=')[0][2:] + value = arg.split('=')[1] + retval[key] = value + + return retval + + + diff --git a/baselines/common/distributions.py b/baselines/common/distributions.py index 8a57c37..29f3632 100644 --- a/baselines/common/distributions.py +++ b/baselines/common/distributions.py @@ -85,7 +85,7 @@ class DiagGaussianPdType(PdType): def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0): mean = fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias) - logstd = tf.get_variable(name='logstd', shape=[1, self.size], initializer=tf.zeros_initializer()) + logstd = tf.get_variable(name='pi/logstd', shape=[1, self.size], initializer=tf.zeros_initializer()) pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) return self.pdfromflat(pdparam), mean @@ -143,26 +143,26 @@ class CategoricalPd(Pd): # Note: we can't use sparse_softmax_cross_entropy_with_logits because # the implementation does not allow second-order derivatives... 
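Editor's note: the comment above is the reason for switching to dense cross-entropy with one-hot labels; the sparse op does not support the second-order gradients that TRPO/ACKTR need. A minimal equivalence sketch, assuming TF 1.x; the logits and action indices are arbitrary example values:

    import numpy as np
    import tensorflow as tf

    logits = tf.constant(np.random.randn(4, 6).astype(np.float32))
    actions = tf.constant([1, 3, 0, 5])
    one_hot = tf.one_hot(actions, 6)
    dense = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=one_hot)
    sparse = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=actions)
    with tf.Session() as sess:
        d, s = sess.run([dense, sparse])
        np.testing.assert_allclose(d, s, rtol=1e-5, atol=1e-6)   # same negative log-likelihood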
one_hot_actions = tf.one_hot(x, self.logits.get_shape().as_list()[-1]) - return tf.nn.softmax_cross_entropy_with_logits( + return tf.nn.softmax_cross_entropy_with_logits_v2( logits=self.logits, labels=one_hot_actions) def kl(self, other): - a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keep_dims=True) - a1 = other.logits - tf.reduce_max(other.logits, axis=-1, keep_dims=True) + a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keepdims=True) + a1 = other.logits - tf.reduce_max(other.logits, axis=-1, keepdims=True) ea0 = tf.exp(a0) ea1 = tf.exp(a1) - z0 = tf.reduce_sum(ea0, axis=-1, keep_dims=True) - z1 = tf.reduce_sum(ea1, axis=-1, keep_dims=True) + z0 = tf.reduce_sum(ea0, axis=-1, keepdims=True) + z1 = tf.reduce_sum(ea1, axis=-1, keepdims=True) p0 = ea0 / z0 return tf.reduce_sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=-1) def entropy(self): - a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keep_dims=True) + a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keepdims=True) ea0 = tf.exp(a0) - z0 = tf.reduce_sum(ea0, axis=-1, keep_dims=True) + z0 = tf.reduce_sum(ea0, axis=-1, keepdims=True) p0 = ea0 / z0 return tf.reduce_sum(p0 * (tf.log(z0) - a0), axis=-1) def sample(self): - u = tf.random_uniform(tf.shape(self.logits)) + u = tf.random_uniform(tf.shape(self.logits), dtype=self.logits.dtype) return tf.argmax(self.logits - tf.log(-tf.log(u)), axis=-1) @classmethod def fromflat(cls, flat): diff --git a/baselines/common/input.py b/baselines/common/input.py index 7fbf9fc..dff9480 100644 --- a/baselines/common/input.py +++ b/baselines/common/input.py @@ -1,30 +1,56 @@ import tensorflow as tf from gym.spaces import Discrete, Box -def observation_input(ob_space, batch_size=None, name='Ob'): - ''' - Build observation input with encoding depending on the - observation space type - Params: +def observation_placeholder(ob_space, batch_size=None, name='Ob'): + ''' + Create placeholder to feed observations into of the size appropriate to the observation space - ob_space: observation space (should be one of gym.spaces) - batch_size: batch size for input (default is None, so that resulting input placeholder can take tensors with any batch size) - name: tensorflow variable name for input placeholder + Parameters: + ---------- - returns: tuple (input_placeholder, processed_input_tensor) + ob_space: gym.Space observation space + + batch_size: int size of the batch to be fed into input. Can be left None in most cases. + + name: str name of the placeholder + + Returns: + ------- + + tensorflow placeholder tensor + ''' + + assert isinstance(ob_space, Discrete) or isinstance(ob_space, Box), \ + 'Can only deal with Discrete and Box observation spaces for now' + + return tf.placeholder(shape=(batch_size,) + ob_space.shape, dtype=ob_space.dtype, name=name) + + +def observation_input(ob_space, batch_size=None, name='Ob'): + ''' + Create placeholder to feed observations into of the size appropriate to the observation space, and add input + encoder of the appropriate type. 
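Editor's note: the rewritten input helpers split the old observation_input into two composable pieces. A short sketch of how they fit together, assuming TF 1.x; the Box space below is chosen purely as an example:

    import tensorflow as tf
    from gym.spaces import Box
    from baselines.common.input import observation_placeholder, encode_observation, observation_input

    ob_space = Box(low=-1.0, high=1.0, shape=(4,), dtype='float32')   # example space
    X = observation_placeholder(ob_space)        # tf.placeholder of shape (None, 4)
    encoded = encode_observation(ob_space, X)    # cast to float32 (one-hot for Discrete spaces)
    X2, encoded2 = observation_input(ob_space)   # the old one-call form, now built from the two above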
+ ''' + + placeholder = observation_placeholder(ob_space, batch_size, name) + return placeholder, encode_observation(ob_space, placeholder) + +def encode_observation(ob_space, placeholder): + ''' + Encode input in the way that is appropriate to the observation space + + Parameters: + ---------- + + ob_space: gym.Space observation space + + placeholder: tf.placeholder observation input placeholder ''' if isinstance(ob_space, Discrete): - input_x = tf.placeholder(shape=(batch_size,), dtype=tf.int32, name=name) - processed_x = tf.to_float(tf.one_hot(input_x, ob_space.n)) - return input_x, processed_x + return tf.to_float(tf.one_hot(placeholder, ob_space.n)) elif isinstance(ob_space, Box): - input_shape = (batch_size,) + ob_space.shape - input_x = tf.placeholder(shape=input_shape, dtype=ob_space.dtype, name=name) - processed_x = tf.to_float(input_x) - return input_x, processed_x - + return tf.to_float(placeholder) else: raise NotImplementedError - diff --git a/baselines/common/misc_util.py b/baselines/common/misc_util.py index 9985dea..451de1c 100644 --- a/baselines/common/misc_util.py +++ b/baselines/common/misc_util.py @@ -67,14 +67,21 @@ class EzPickle(object): def set_global_seeds(i): + try: + import MPI + rank = MPI.COMM_WORLD.Get_rank() + except ImportError: + rank = 0 + + myseed = i + 1000 * rank if i is not None else None try: import tensorflow as tf except ImportError: pass else: - tf.set_random_seed(i) - np.random.seed(i) - random.seed(i) + tf.set_random_seed(myseed) + np.random.seed(myseed) + random.seed(myseed) def pretty_eta(seconds_left): diff --git a/baselines/common/models.py b/baselines/common/models.py new file mode 100644 index 0000000..0763095 --- /dev/null +++ b/baselines/common/models.py @@ -0,0 +1,177 @@ +import numpy as np +import tensorflow as tf +from baselines.a2c import utils +from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch +from baselines.common.mpi_running_mean_std import RunningMeanStd +import tensorflow.contrib.layers as layers + + +def nature_cnn(unscaled_images, **conv_kwargs): + """ + CNN from Nature paper. + """ + scaled_images = tf.cast(unscaled_images, tf.float32) / 255. + activ = tf.nn.relu + h = activ(conv(scaled_images, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2), + **conv_kwargs)) + h2 = activ(conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), **conv_kwargs)) + h3 = activ(conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), **conv_kwargs)) + h3 = conv_to_fc(h3) + return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))) + + +def mlp(num_layers=2, num_hidden=64, activation=tf.tanh): + """ + Simple fully connected layer policy. Separate stacks of fully-connected layers are used for policy and value function estimation. + More customized fully-connected policies can be obtained by using PolicyWithV class directly. 
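Editor's note: the nature_cnn feature extractor defined above is reused by several of these builders; for orientation, this sketch traces its output shape for a standard 84x84x4 Atari observation (TF 1.x assumed, batch size symbolic):

    import tensorflow as tf
    from baselines.common.models import nature_cnn

    X = tf.placeholder(tf.uint8, (None, 84, 84, 4))   # stacked, warped Atari frames
    h = nature_cnn(X)   # 8x8/4 -> 4x4/2 -> 3x3/1 convolutions, then a 512-unit fully connected layer
    print(h.shape)      # (?, 512)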
+ + Parameters: + ---------- + + num_layers: int number of fully-connected layers (default: 2) + + num_hidden: int size of fully-connected layers (default: 64) + + activation: activation function (default: tf.tanh) + + Returns: + ------- + + function that builds fully connected network with a given input placeholder + """ + def network_fn(X): + h = tf.layers.flatten(X) + for i in range(num_layers): + h = activation(fc(h, 'mlp_fc{}'.format(i), nh=num_hidden, init_scale=np.sqrt(2))) + return h, None + + return network_fn + + +def cnn(**conv_kwargs): + def network_fn(X): + return nature_cnn(X, **conv_kwargs), None + return network_fn + +def cnn_small(**conv_kwargs): + def network_fn(X): + h = tf.cast(X, tf.float32) / 255. + + activ = tf.nn.relu + h = activ(conv(h, 'c1', nf=8, rf=8, stride=4, init_scale=np.sqrt(2), **conv_kwargs)) + h = activ(conv(h, 'c2', nf=16, rf=4, stride=2, init_scale=np.sqrt(2), **conv_kwargs)) + h = conv_to_fc(h) + h = activ(fc(h, 'fc1', nh=128, init_scale=np.sqrt(2))) + return h, None + return network_fn + + + +def lstm(nlstm=128, layer_norm=False): + def network_fn(X, nenv=1): + nbatch = X.shape[0] + nsteps = nbatch // nenv + + h = tf.layers.flatten(X) + + M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) + S = tf.placeholder(tf.float32, [nenv, 2*nlstm]) #states + + xs = batch_to_seq(h, nenv, nsteps) + ms = batch_to_seq(M, nenv, nsteps) + + if layer_norm: + h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm) + else: + h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm) + + h = seq_to_batch(h5) + initial_state = np.zeros(S.shape.as_list(), dtype=float) + + return h, {'S':S, 'M':M, 'state':snew, 'initial_state':initial_state} + + return network_fn + + +def cnn_lstm(nlstm=128, layer_norm=False, **conv_kwargs): + def network_fn(X, nenv=1): + nbatch = X.shape[0] + nsteps = nbatch // nenv + + h = nature_cnn(X, **conv_kwargs) + + M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) + S = tf.placeholder(tf.float32, [nenv, 2*nlstm]) #states + + xs = batch_to_seq(h, nenv, nsteps) + ms = batch_to_seq(M, nenv, nsteps) + + if layer_norm: + h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm) + else: + h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm) + + h = seq_to_batch(h5) + initial_state = np.zeros(S.shape.as_list(), dtype=float) + + return h, {'S':S, 'M':M, 'state':snew, 'initial_state':initial_state} + + return network_fn + +def cnn_lnlstm(nlstm=128, **conv_kwargs): + return cnn_lstm(nlstm, layer_norm=True, **conv_kwargs) + + +def conv_only(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], **conv_kwargs): + ''' + convolutions-only net + + Parameters: + ---------- + + conv: list of triples (filter_number, filter_size, stride) specifying parameters for each layer. + + Returns: + + function that takes tensorflow tensor as input and returns the output of the last convolutional layer + + ''' + + def network_fn(X): + out = X + with tf.variable_scope("convnet"): + for num_outputs, kernel_size, stride in convs: + out = layers.convolution2d(out, + num_outputs=num_outputs, + kernel_size=kernel_size, + stride=stride, + activation_fn=tf.nn.relu, + **conv_kwargs) + + return out, None + return network_fn + +def _normalize_clip_observation(x, clip_range=[-5.0, 5.0]): + rms = RunningMeanStd(shape=x.shape[1:]) + norm_x = tf.clip_by_value((x - rms.mean) / rms.std, min(clip_range), max(clip_range)) + return norm_x, rms + + +def get_network_builder(name): + # TODO: replace with reflection? 
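Editor's note: get_network_builder here is the registry that build_policy and the new learn() entry points consult when given a network name. The contract, sketched with illustrative sizes (TF 1.x assumed): the registered factory returns a network_fn, and network_fn maps an observation tensor to a (latent, recurrent_tensors) pair, with None in the second slot for feed-forward networks:

    import tensorflow as tf
    from baselines.common.models import get_network_builder

    network_fn = get_network_builder('mlp')(num_layers=3, num_hidden=32)
    X = tf.placeholder(tf.float32, (None, 8))   # example flat observation batch
    latent, recurrent = network_fn(X)
    print(latent.shape, recurrent)              # (?, 32) None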
+ if name == 'cnn': + return cnn + elif name == 'cnn_small': + return cnn_small + elif name == 'conv_only': + return conv_only + elif name == 'mlp': + return mlp + elif name == 'lstm': + return lstm + elif name == 'cnn_lstm': + return cnn_lstm + elif name == 'cnn_lnlstm': + return cnn_lnlstm + else: + raise ValueError('Unknown network type: {}'.format(name)) diff --git a/baselines/common/mpi_adam_optimizer.py b/baselines/common/mpi_adam_optimizer.py new file mode 100644 index 0000000..8cf09c4 --- /dev/null +++ b/baselines/common/mpi_adam_optimizer.py @@ -0,0 +1,31 @@ +import numpy as np +import tensorflow as tf +from mpi4py import MPI + +class MpiAdamOptimizer(tf.train.AdamOptimizer): + """Adam optimizer that averages gradients across mpi processes.""" + def __init__(self, comm, **kwargs): + self.comm = comm + tf.train.AdamOptimizer.__init__(self, **kwargs) + def compute_gradients(self, loss, var_list, **kwargs): + grads_and_vars = tf.train.AdamOptimizer.compute_gradients(self, loss, var_list, **kwargs) + grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None] + flat_grad = tf.concat([tf.reshape(g, (-1,)) for g, v in grads_and_vars], axis=0) + shapes = [v.shape.as_list() for g, v in grads_and_vars] + sizes = [int(np.prod(s)) for s in shapes] + + num_tasks = self.comm.Get_size() + buf = np.zeros(sum(sizes), np.float32) + + def _collect_grads(flat_grad): + self.comm.Allreduce(flat_grad, buf, op=MPI.SUM) + np.divide(buf, float(num_tasks), out=buf) + return buf + + avg_flat_grad = tf.py_func(_collect_grads, [flat_grad], tf.float32) + avg_flat_grad.set_shape(flat_grad.shape) + avg_grads = tf.split(avg_flat_grad, sizes, axis=0) + avg_grads_and_vars = [(tf.reshape(g, v.shape), v) + for g, (_, v) in zip(avg_grads, grads_and_vars)] + + return avg_grads_and_vars diff --git a/baselines/common/mpi_util.py b/baselines/common/mpi_util.py new file mode 100644 index 0000000..f04187b --- /dev/null +++ b/baselines/common/mpi_util.py @@ -0,0 +1,101 @@ +from collections import defaultdict +from mpi4py import MPI +import os, numpy as np +import platform +import shutil +import subprocess + +def sync_from_root(sess, variables, comm=None): + """ + Send the root node's parameters to every worker. + Arguments: + sess: the TensorFlow session. + variables: all parameter variables including optimizer's + """ + if comm is None: comm = MPI.COMM_WORLD + rank = comm.Get_rank() + for var in variables: + if rank == 0: + comm.Bcast(sess.run(var)) + else: + import tensorflow as tf + returned_var = np.empty(var.shape, dtype='float32') + comm.Bcast(returned_var) + sess.run(tf.assign(var, returned_var)) + +def gpu_count(): + """ + Count the GPUs on this machine. + """ + if shutil.which('nvidia-smi') is None: + return 0 + output = subprocess.check_output(['nvidia-smi', '--query-gpu=gpu_name', '--format=csv']) + return max(0, len(output.split(b'\n')) - 2) + +def setup_mpi_gpus(): + """ + Set CUDA_VISIBLE_DEVICES using MPI. + """ + num_gpus = gpu_count() + if num_gpus == 0: + return + local_rank, _ = get_local_rank_size(MPI.COMM_WORLD) + os.environ['CUDA_VISIBLE_DEVICES'] = str(local_rank % num_gpus) + +def get_local_rank_size(comm): + """ + Returns the rank of each process on its machine + The processes on a given machine will be assigned ranks + 0, 1, 2, ..., N-1, + where N is the number of processes on this machine. 
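Editor's note: a construction sketch for the MpiAdamOptimizer introduced above. It assumes mpi4py is available and the script is launched under mpirun; the toy loss is illustrative only. Each rank computes its own gradient, compute_gradients() flattens it, Allreduce-sums it, and divides by the worker count, so every rank applies the same averaged update:

    import tensorflow as tf
    from mpi4py import MPI
    from baselines.common.mpi_adam_optimizer import MpiAdamOptimizer

    w = tf.get_variable('w', initializer=tf.zeros([10]))
    loss = tf.reduce_sum(tf.square(w - 1.0))    # toy objective
    train_op = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=1e-3).minimize(loss)

    sess = tf.InteractiveSession()
    sess.run(tf.global_variables_initializer())
    sess.run(train_op)   # gradient is averaged across all MPI ranks inside compute_gradients()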
+ + Useful if you want to assign one gpu per machine + """ + this_node = platform.node() + ranks_nodes = comm.allgather((comm.Get_rank(), this_node)) + node2rankssofar = defaultdict(int) + local_rank = None + for (rank, node) in ranks_nodes: + if rank == comm.Get_rank(): + local_rank = node2rankssofar[node] + node2rankssofar[node] += 1 + assert local_rank is not None + return local_rank, node2rankssofar[this_node] + +def share_file(comm, path): + """ + Copies the file from rank 0 to all other ranks + Puts it in the same place on all machines + """ + localrank, _ = get_local_rank_size(comm) + if comm.Get_rank() == 0: + with open(path, 'rb') as fh: + data = fh.read() + comm.bcast(data) + else: + data = comm.bcast(None) + if localrank == 0: + os.makedirs(os.path.dirname(path), exist_ok=True) + with open(path, 'wb') as fh: + fh.write(data) + comm.Barrier() + +def dict_gather(comm, d, op='mean', assert_all_have_data=True): + if comm is None: return d + alldicts = comm.allgather(d) + size = comm.size + k2li = defaultdict(list) + for d in alldicts: + for (k,v) in d.items(): + k2li[k].append(v) + result = {} + for (k,li) in k2li.items(): + if assert_all_have_data: + assert len(li)==size, "only %i out of %i MPI workers have sent '%s'" % (len(li), size, k) + if op=='mean': + result[k] = np.mean(li, axis=0) + elif op=='sum': + result[k] = np.sum(li, axis=0) + else: + assert 0, op + return result diff --git a/baselines/common/policies.py b/baselines/common/policies.py new file mode 100644 index 0000000..4ad41cb --- /dev/null +++ b/baselines/common/policies.py @@ -0,0 +1,179 @@ +import tensorflow as tf +from baselines.common import tf_util +from baselines.a2c.utils import fc +from baselines.common.distributions import make_pdtype +from baselines.common.input import observation_placeholder, encode_observation +from baselines.common.tf_util import adjust_shape +from baselines.common.mpi_running_mean_std import RunningMeanStd +from baselines.common.models import get_network_builder + +import gym + + +class PolicyWithValue(object): + """ + Encapsulates fields and methods for RL policy and value function estimation with shared parameters + """ + + def __init__(self, env, observations, latent, estimate_q=False, vf_latent=None, sess=None, **tensors): + """ + Parameters: + ---------- + env RL environment + + observations tensorflow placeholder in which the observations will be fed + + latent latent state from which policy distribution parameters should be inferred + + vf_latent latent state from which value function should be inferred (if None, then latent is used) + + sess tensorflow session to run calculations in (if None, default session is used) + + **tensors tensorflow tensors for additional attributes such as state or mask + + """ + + self.X = observations + self.state = tf.constant([]) + self.initial_state = None + self.__dict__.update(tensors) + + vf_latent = vf_latent if vf_latent is not None else latent + + vf_latent = tf.layers.flatten(vf_latent) + latent = tf.layers.flatten(latent) + + self.pdtype = make_pdtype(env.action_space) + + self.pd, self.pi = self.pdtype.pdfromlatent(latent, init_scale=0.01) + + self.action = self.pd.sample() + self.neglogp = self.pd.neglogp(self.action) + self.sess = sess + + if estimate_q: + assert isinstance(env.action_space, gym.spaces.Discrete) + self.q = fc(vf_latent, 'q', env.action_space.n) + self.vf = self.q + else: + self.vf = fc(vf_latent, 'vf', 1) + self.vf = self.vf[:,0] + + def _evaluate(self, variables, observation, **extra_feed): + sess = self.sess or 
tf.get_default_session() + feed_dict = {self.X: adjust_shape(self.X, observation)} + for inpt_name, data in extra_feed.items(): + if inpt_name in self.__dict__.keys(): + inpt = self.__dict__[inpt_name] + if isinstance(inpt, tf.Tensor) and inpt._op.type == 'Placeholder': + feed_dict[inpt] = adjust_shape(inpt, data) + + return sess.run(variables, feed_dict) + + def step(self, observation, **extra_feed): + """ + Compute next action(s) given the observaion(s) + + Parameters: + ---------- + + observation observation data (either single or a batch) + + **extra_feed additional data such as state or mask (names of the arguments should match the ones in constructor, see __init__) + + Returns: + ------- + (action, value estimate, next state, negative log likelihood of the action under current policy parameters) tuple + """ + + a, v, state, neglogp = self._evaluate([self.action, self.vf, self.state, self.neglogp], observation, **extra_feed) + if state.size == 0: + state = None + return a, v, state, neglogp + + def value(self, ob, *args, **kwargs): + """ + Compute value estimate(s) given the observaion(s) + + Parameters: + ---------- + + observation observation data (either single or a batch) + + **extra_feed additional data such as state or mask (names of the arguments should match the ones in constructor, see __init__) + + Returns: + ------- + value estimate + """ + return self._evaluate(self.vf, ob, *args, **kwargs) + + def save(self, save_path): + tf_util.save_state(save_path, sess=self.sess) + + def load(self, load_path): + tf_util.load_state(load_path, sess=self.sess) + +def build_policy(env, policy_network, value_network=None, normalize_observations=False, estimate_q=False, **policy_kwargs): + if isinstance(policy_network, str): + network_type = policy_network + policy_network = get_network_builder(network_type)(**policy_kwargs) + + def policy_fn(nbatch=None, nsteps=None, sess=None, observ_placeholder=None): + ob_space = env.observation_space + + X = observ_placeholder if observ_placeholder is not None else observation_placeholder(ob_space, batch_size=nbatch) + + extra_tensors = {} + + if normalize_observations and X.dtype == tf.float32: + encoded_x, rms = _normalize_clip_observation(X) + extra_tensors['rms'] = rms + else: + encoded_x = X + + encoded_x = encode_observation(ob_space, encoded_x) + + with tf.variable_scope('pi', reuse=tf.AUTO_REUSE): + policy_latent, recurrent_tensors = policy_network(encoded_x) + + if recurrent_tensors is not None: + # recurrent architecture, need a few more steps + nenv = nbatch // nsteps + assert nenv > 0, 'Bad input for recurrent policy: batch size {} smaller than nsteps {}'.format(nbatch, nsteps) + policy_latent, recurrent_tensors = policy_network(encoded_x, nenv) + extra_tensors.update(recurrent_tensors) + + + _v_net = value_network + + if _v_net is None or _v_net == 'shared': + vf_latent = policy_latent + else: + if _v_net == 'copy': + _v_net = policy_network + else: + assert callable(_v_net) + + with tf.variable_scope('vf', reuse=tf.AUTO_REUSE): + vf_latent, _ = _v_net(encoded_x) + + policy = PolicyWithValue( + env=env, + observations=X, + latent=policy_latent, + vf_latent=vf_latent, + sess=sess, + estimate_q=estimate_q, + **extra_tensors + ) + return policy + + return policy_fn + + +def _normalize_clip_observation(x, clip_range=[-5.0, 5.0]): + rms = RunningMeanStd(shape=x.shape[1:]) + norm_x = tf.clip_by_value((x - rms.mean) / rms.std, min(clip_range), max(clip_range)) + return norm_x, rms + diff --git a/baselines/common/retro_wrappers.py 
b/baselines/common/retro_wrappers.py new file mode 100644 index 0000000..3eb2eb3 --- /dev/null +++ b/baselines/common/retro_wrappers.py @@ -0,0 +1,293 @@ + # flake8: noqa F403, F405 +from .atari_wrappers import * +import numpy as np +import gym + +class TimeLimit(gym.Wrapper): + def __init__(self, env, max_episode_steps=None): + super(TimeLimit, self).__init__(env) + self._max_episode_steps = max_episode_steps + self._elapsed_steps = 0 + + def step(self, ac): + observation, reward, done, info = self.env.step(ac) + self._elapsed_steps += 1 + if self._elapsed_steps >= self._max_episode_steps: + done = True + info['TimeLimit.truncated'] = True + return observation, reward, done, info + + def reset(self, **kwargs): + self._elapsed_steps = 0 + return self.env.reset(**kwargs) + +class StochasticFrameSkip(gym.Wrapper): + def __init__(self, env, n, stickprob): + gym.Wrapper.__init__(self, env) + self.n = n + self.stickprob = stickprob + self.curac = None + self.rng = np.random.RandomState() + self.supports_want_render = hasattr(env, "supports_want_render") + + def reset(self, **kwargs): + self.curac = None + return self.env.reset(**kwargs) + + def step(self, ac): + done = False + totrew = 0 + for i in range(self.n): + # First step after reset, use action + if self.curac is None: + self.curac = ac + # First substep, delay with probability=stickprob + elif i==0: + if self.rng.rand() > self.stickprob: + self.curac = ac + # Second substep, new action definitely kicks in + elif i==1: + self.curac = ac + if self.supports_want_render and i self.channel + for _ in range(self.k): + self.frames.append(ob) + return self._get_ob() + + def step(self, ac): + ob, reward, done, info = self.env.step(ac) + self.frames.append(ob) + return self._get_ob(), reward, done, info + + def _get_ob(self): + assert len(self.frames) == self.k + return np.concatenate([frame if i==self.k-1 else frame[:,:,self.channel:self.channel+1] + for (i, frame) in enumerate(self.frames)], axis=2) + +class Downsample(gym.ObservationWrapper): + def __init__(self, env, ratio): + """ + Downsample images by a factor of ratio + """ + gym.ObservationWrapper.__init__(self, env) + (oldh, oldw, oldc) = env.observation_space.shape + newshape = (oldh//ratio, oldw//ratio, oldc) + self.observation_space = spaces.Box(low=0, high=255, + shape=newshape, dtype=np.uint8) + + def observation(self, frame): + height, width, _ = self.observation_space.shape + frame = cv2.resize(frame, (width, height), interpolation=cv2.INTER_AREA) + if frame.ndim == 2: + frame = frame[:,:,None] + return frame + +class Rgb2gray(gym.ObservationWrapper): + def __init__(self, env): + """ + Downsample images by a factor of ratio + """ + gym.ObservationWrapper.__init__(self, env) + (oldh, oldw, _oldc) = env.observation_space.shape + self.observation_space = spaces.Box(low=0, high=255, + shape=(oldh, oldw, 1), dtype=np.uint8) + + def observation(self, frame): + frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) + return frame[:,:,None] + + +class MovieRecord(gym.Wrapper): + def __init__(self, env, savedir, k): + gym.Wrapper.__init__(self, env) + self.savedir = savedir + self.k = k + self.epcount = 0 + def reset(self): + if self.epcount % self.k == 0: + print('saving movie this episode', self.savedir) + self.env.unwrapped.movie_path = self.savedir + else: + print('not saving this episode') + self.env.unwrapped.movie_path = None + self.env.unwrapped.movie = None + self.epcount += 1 + return self.env.reset() + +class AppendTimeout(gym.Wrapper): + def __init__(self, env): + 
gym.Wrapper.__init__(self, env) + self.action_space = env.action_space + self.timeout_space = gym.spaces.Box(low=np.array([0.0]), high=np.array([1.0]), dtype=np.float32) + self.original_os = env.observation_space + if isinstance(self.original_os, gym.spaces.Dict): + import copy + ordered_dict = copy.deepcopy(self.original_os.spaces) + ordered_dict['value_estimation_timeout'] = self.timeout_space + self.observation_space = gym.spaces.Dict(ordered_dict) + self.dict_mode = True + else: + self.observation_space = gym.spaces.Dict({ + 'original': self.original_os, + 'value_estimation_timeout': self.timeout_space + }) + self.dict_mode = False + self.ac_count = None + while 1: + if not hasattr(env, "_max_episode_steps"): # Looking for TimeLimit wrapper that has this field + env = env.env + continue + break + self.timeout = env._max_episode_steps + + def step(self, ac): + self.ac_count += 1 + ob, rew, done, info = self.env.step(ac) + return self._process(ob), rew, done, info + + def reset(self): + self.ac_count = 0 + return self._process(self.env.reset()) + + def _process(self, ob): + fracmissing = 1 - self.ac_count / self.timeout + if self.dict_mode: + ob['value_estimation_timeout'] = fracmissing + else: + return { 'original': ob, 'value_estimation_timeout': fracmissing } + +class StartDoingRandomActionsWrapper(gym.Wrapper): + """ + Warning: can eat info dicts, not good if you depend on them + """ + def __init__(self, env, max_random_steps, on_startup=True, every_episode=False): + gym.Wrapper.__init__(self, env) + self.on_startup = on_startup + self.every_episode = every_episode + self.random_steps = max_random_steps + self.last_obs = None + if on_startup: + self.some_random_steps() + + def some_random_steps(self): + self.last_obs = self.env.reset() + n = np.random.randint(self.random_steps) + #print("running for random %i frames" % n) + for _ in range(n): + self.last_obs, _, done, _ = self.env.step(self.env.action_space.sample()) + if done: self.last_obs = self.env.reset() + + def reset(self): + return self.last_obs + + def step(self, a): + self.last_obs, rew, done, info = self.env.step(a) + if done: + self.last_obs = self.env.reset() + if self.every_episode: + self.some_random_steps() + return self.last_obs, rew, done, info + +def make_retro(*, game, state, max_episode_steps, **kwargs): + import retro + env = retro.make(game, state, **kwargs) + env = StochasticFrameSkip(env, n=4, stickprob=0.25) + if max_episode_steps is not None: + env = TimeLimit(env, max_episode_steps=max_episode_steps) + return env + +def wrap_deepmind_retro(env, scale=True, frame_stack=4): + """ + Configure environment for retro games, using config similar to DeepMind-style Atari in wrap_deepmind + """ + env = WarpFrame(env) + env = ClipRewardEnv(env) + env = FrameStack(env, frame_stack) + if scale: + env = ScaledFloatFrame(env) + return env + +class SonicDiscretizer(gym.ActionWrapper): + """ + Wrap a gym-retro environment and make it use discrete + actions for the Sonic game. 
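Editor's note: the helpers above are meant to be chained; a typical composition is sketched below. It assumes gym-retro and a Sonic ROM are installed, and the game/state names are examples rather than anything this patch pins down:

    from baselines.common.retro_wrappers import make_retro, wrap_deepmind_retro, SonicDiscretizer

    env = make_retro(game='SonicTheHedgehog-Genesis', state='GreenHillZone.Act1',
                     max_episode_steps=4500)   # StochasticFrameSkip(4, 0.25) + TimeLimit
    env = SonicDiscretizer(env)                # 12-button pad reduced to 7 discrete actions
    env = wrap_deepmind_retro(env)             # WarpFrame, ClipRewardEnv, FrameStack(4), ScaledFloatFrame
    obs = env.reset()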
+ """ + def __init__(self, env): + super(SonicDiscretizer, self).__init__(env) + buttons = ["B", "A", "MODE", "START", "UP", "DOWN", "LEFT", "RIGHT", "C", "Y", "X", "Z"] + actions = [['LEFT'], ['RIGHT'], ['LEFT', 'DOWN'], ['RIGHT', 'DOWN'], ['DOWN'], + ['DOWN', 'B'], ['B']] + self._actions = [] + for action in actions: + arr = np.array([False] * 12) + for button in action: + arr[buttons.index(button)] = True + self._actions.append(arr) + self.action_space = gym.spaces.Discrete(len(self._actions)) + + def action(self, a): # pylint: disable=W0221 + return self._actions[a].copy() + +class RewardScaler(gym.RewardWrapper): + """ + Bring rewards to a reasonable scale for PPO. + This is incredibly important and effects performance + drastically. + """ + def __init__(self, env, scale=0.01): + super(RewardScaler, self).__init__(env) + self.scale = scale + + def reward(self, reward): + return reward * self.scale + +class AllowBacktracking(gym.Wrapper): + """ + Use deltas in max(X) as the reward, rather than deltas + in X. This way, agents are not discouraged too heavily + from exploring backwards if there is no way to advance + head-on in the level. + """ + def __init__(self, env): + super(AllowBacktracking, self).__init__(env) + self._cur_x = 0 + self._max_x = 0 + + def reset(self, **kwargs): # pylint: disable=E0202 + self._cur_x = 0 + self._max_x = 0 + return self.env.reset(**kwargs) + + def step(self, action): # pylint: disable=E0202 + obs, rew, done, info = self.env.step(action) + self._cur_x += rew + rew = max(0, self._cur_x - self._max_x) + self._max_x = max(self._max_x, self._cur_x) + return obs, rew, done, info diff --git a/baselines/common/runners.py b/baselines/common/runners.py index 0a4b221..c30e322 100644 --- a/baselines/common/runners.py +++ b/baselines/common/runners.py @@ -5,7 +5,7 @@ class AbstractEnvRunner(ABC): def __init__(self, *, env, model, nsteps): self.env = env self.model = model - nenv = env.num_envs + self.nenv = nenv = env.num_envs if hasattr(env, 'num_envs') else 1 self.batch_ob_shape = (nenv*nsteps,) + env.observation_space.shape self.obs = np.zeros((nenv,) + env.observation_space.shape, dtype=env.observation_space.dtype.name) self.obs[:] = env.reset() @@ -16,3 +16,4 @@ class AbstractEnvRunner(ABC): @abstractmethod def run(self): raise NotImplementedError + diff --git a/baselines/common/running_mean_std.py b/baselines/common/running_mean_std.py index 06ba8d8..504c7c9 100644 --- a/baselines/common/running_mean_std.py +++ b/baselines/common/running_mean_std.py @@ -1,4 +1,7 @@ +import tensorflow as tf import numpy as np +from baselines.common.tf_util import get_session + class RunningMeanStd(object): # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm def __init__(self, epsilon=1e-4, shape=()): @@ -13,20 +16,71 @@ class RunningMeanStd(object): self.update_from_moments(batch_mean, batch_var, batch_count) def update_from_moments(self, batch_mean, batch_var, batch_count): - delta = batch_mean - self.mean - tot_count = self.count + batch_count + self.mean, self.var, self.count = update_mean_var_count_from_moments( + self.mean, self.var, self.count, batch_mean, batch_var, batch_count) - new_mean = self.mean + delta * batch_count / tot_count - m_a = self.var * (self.count) - m_b = batch_var * (batch_count) - M2 = m_a + m_b + np.square(delta) * self.count * batch_count / (self.count + batch_count) - new_var = M2 / (self.count + batch_count) +def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var, batch_count): + delta = 
batch_mean - mean + tot_count = count + batch_count - new_count = batch_count + self.count + new_mean = mean + delta * batch_count / tot_count + m_a = var * count + m_b = batch_var * batch_count + M2 = m_a + m_b + np.square(delta) * count * batch_count / (count + batch_count) + new_var = M2 / (count + batch_count) + new_count = batch_count + count + + return new_mean, new_var, new_count + - self.mean = new_mean - self.var = new_var - self.count = new_count +class TfRunningMeanStd(object): + # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm + ''' + TensorFlow variables-based implmentation of computing running mean and std + Benefit of this implementation is that it can be saved / loaded together with the tensorflow model + ''' + def __init__(self, epsilon=1e-4, shape=(), scope=''): + sess = get_session() + + self._new_mean = tf.placeholder(shape=shape, dtype=tf.float64) + self._new_var = tf.placeholder(shape=shape, dtype=tf.float64) + self._new_count = tf.placeholder(shape=(), dtype=tf.float64) + + + with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): + self._mean = tf.get_variable('mean', initializer=np.zeros(shape, 'float64'), dtype=tf.float64) + self._var = tf.get_variable('std', initializer=np.ones(shape, 'float64'), dtype=tf.float64) + self._count = tf.get_variable('count', initializer=np.full((), epsilon, 'float64'), dtype=tf.float64) + + self.update_ops = tf.group([ + self._var.assign(self._new_var), + self._mean.assign(self._new_mean), + self._count.assign(self._new_count) + ]) + + sess.run(tf.variables_initializer([self._mean, self._var, self._count])) + self.sess = sess + self._set_mean_var_count() + + def _set_mean_var_count(self): + self.mean, self.var, self.count = self.sess.run([self._mean, self._var, self._count]) + + def update(self, x): + batch_mean = np.mean(x, axis=0) + batch_var = np.var(x, axis=0) + batch_count = x.shape[0] + + new_mean, new_var, new_count = update_mean_var_count_from_moments(self.mean, self.var, self.count, batch_mean, batch_var, batch_count) + + self.sess.run(self.update_ops, feed_dict={ + self._new_mean: new_mean, + self._new_var: new_var, + self._new_count: new_count + }) + + self._set_mean_var_count() + + def test_runningmeanstd(): for (x1, x2, x3) in [ @@ -43,4 +97,91 @@ def test_runningmeanstd(): rms.update(x3) ms2 = [rms.mean, rms.var] - assert np.allclose(ms1, ms2) + np.testing.assert_allclose(ms1, ms2) + +def test_tf_runningmeanstd(): + for (x1, x2, x3) in [ + (np.random.randn(3), np.random.randn(4), np.random.randn(5)), + (np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)), + ]: + + rms = TfRunningMeanStd(epsilon=0.0, shape=x1.shape[1:], scope='running_mean_std' + str(np.random.randint(0, 128))) + + x = np.concatenate([x1, x2, x3], axis=0) + ms1 = [x.mean(axis=0), x.var(axis=0)] + rms.update(x1) + rms.update(x2) + rms.update(x3) + ms2 = [rms.mean, rms.var] + + np.testing.assert_allclose(ms1, ms2) + + +def profile_tf_runningmeanstd(): + import time + from baselines.common import tf_util + + tf_util.get_session( config=tf.ConfigProto( + inter_op_parallelism_threads=1, + intra_op_parallelism_threads=1, + allow_soft_placement=True + )) + + x = np.random.random((376,)) + + n_trials = 10000 + rms = RunningMeanStd() + tfrms = TfRunningMeanStd() + + tic1 = time.time() + for _ in range(n_trials): + rms.update(x) + + tic2 = time.time() + for _ in range(n_trials): + tfrms.update(x) + + tic3 = time.time() + + print('rms update time ({} trials): {} s'.format(n_trials, tic2 - tic1)) + print('tfrms update 
time ({} trials): {} s'.format(n_trials, tic3 - tic2)) + + + tic1 = time.time() + for _ in range(n_trials): + z1 = rms.mean + + tic2 = time.time() + for _ in range(n_trials): + z2 = tfrms.mean + + assert z1 == z2 + + tic3 = time.time() + + print('rms get mean time ({} trials): {} s'.format(n_trials, tic2 - tic1)) + print('tfrms get mean time ({} trials): {} s'.format(n_trials, tic3 - tic2)) + + + + ''' + options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) #pylint: disable=E1101 + run_metadata = tf.RunMetadata() + profile_opts = dict(options=options, run_metadata=run_metadata) + + + + from tensorflow.python.client import timeline + fetched_timeline = timeline.Timeline(run_metadata.step_stats) #pylint: disable=E1101 + chrome_trace = fetched_timeline.generate_chrome_trace_format() + outfile = '/tmp/timeline.json' + with open(outfile, 'wt') as f: + f.write(chrome_trace) + print(f'Successfully saved profile to {outfile}. Exiting.') + exit(0) + ''' + + + +if __name__ == '__main__': + profile_tf_runningmeanstd() diff --git a/baselines/common/test_identity.py b/baselines/common/test_identity.py deleted file mode 100644 index a429e0c..0000000 --- a/baselines/common/test_identity.py +++ /dev/null @@ -1,44 +0,0 @@ -import pytest -import tensorflow as tf -import random -import numpy as np -from gym.spaces import np_random - -from baselines.a2c import a2c -from baselines.ppo2 import ppo2 -from baselines.common.identity_env import IdentityEnv -from baselines.common.vec_env.dummy_vec_env import DummyVecEnv -from baselines.ppo2.policies import MlpPolicy - - -learn_func_list = [ - lambda e: a2c.learn(policy=MlpPolicy, env=e, seed=0, total_timesteps=50000), - lambda e: ppo2.learn(policy=MlpPolicy, env=e, total_timesteps=50000, lr=1e-3, nsteps=128, ent_coef=0.01) -] - - -@pytest.mark.slow -@pytest.mark.parametrize("learn_func", learn_func_list) -def test_identity(learn_func): - ''' - Test if the algorithm (with a given policy) - can learn an identity transformation (i.e. 
return observation as an action) - ''' - np.random.seed(0) - np_random.seed(0) - random.seed(0) - - env = DummyVecEnv([lambda: IdentityEnv(10)]) - - with tf.Graph().as_default(), tf.Session().as_default(): - tf.set_random_seed(0) - model = learn_func(env) - - N_TRIALS = 1000 - sum_rew = 0 - obs = env.reset() - for i in range(N_TRIALS): - obs, rew, done, _ = env.step(model.step(obs)[0]) - sum_rew += rew - - assert sum_rew > 0.9 * N_TRIALS diff --git a/baselines/common/tests/__init__.py b/baselines/common/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/baselines/common/tests/envs/__init__.py b/baselines/common/tests/envs/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/baselines/common/tests/envs/fixed_sequence_env.py b/baselines/common/tests/envs/fixed_sequence_env.py new file mode 100644 index 0000000..9f1b03d --- /dev/null +++ b/baselines/common/tests/envs/fixed_sequence_env.py @@ -0,0 +1,44 @@ +import numpy as np +from gym import Env +from gym.spaces import Discrete + + +class FixedSequenceEnv(Env): + def __init__( + self, + n_actions=10, + seed=0, + episode_len=100 + ): + self.np_random = np.random.RandomState() + self.np_random.seed(seed) + self.sequence = [self.np_random.randint(0, n_actions-1) for _ in range(episode_len)] + + self.action_space = Discrete(n_actions) + self.observation_space = Discrete(1) + + self.episode_len = episode_len + self.time = 0 + self.reset() + + def reset(self): + self.time = 0 + return 0 + + def step(self, actions): + rew = self._get_reward(actions) + self._choose_next_state() + done = False + if self.episode_len and self.time >= self.episode_len: + rew = 0 + done = True + + return 0, rew, done, {} + + def _choose_next_state(self): + self.time += 1 + + def _get_reward(self, actions): + return 1 if actions == self.sequence[self.time] else 0 + + diff --git a/baselines/common/tests/envs/identity_env.py b/baselines/common/tests/envs/identity_env.py new file mode 100644 index 0000000..005d3ff --- /dev/null +++ b/baselines/common/tests/envs/identity_env.py @@ -0,0 +1,70 @@ +import numpy as np +from abc import abstractmethod +from gym import Env +from gym.spaces import Discrete, Box + + +class IdentityEnv(Env): + def __init__( + self, + episode_len=None + ): + + self.episode_len = episode_len + self.time = 0 + self.reset() + + def reset(self): + self._choose_next_state() + self.time = 0 + self.observation_space = self.action_space + + return self.state + + def step(self, actions): + rew = self._get_reward(actions) + self._choose_next_state() + done = False + if self.episode_len and self.time >= self.episode_len: + rew = 0 + done = True + + return self.state, rew, done, {} + + def _choose_next_state(self): + self.state = self.action_space.sample() + self.time += 1 + + @abstractmethod + def _get_reward(self, actions): + raise NotImplementedError + + +class DiscreteIdentityEnv(IdentityEnv): + def __init__( + self, + dim, + episode_len=None, + ): + + self.action_space = Discrete(dim) + super().__init__(episode_len=episode_len) + + def _get_reward(self, actions): + return 1 if self.state == actions else 0 + + +class BoxIdentityEnv(IdentityEnv): + def __init__( + self, + shape, + episode_len=None, + ): + + self.action_space = Box(low=-1.0, high=1.0, shape=shape) + super().__init__(episode_len=episode_len) + + def _get_reward(self, actions): + diff = actions - self.state + diff = diff[:] + return -0.5 * np.dot(diff, diff) diff --git a/baselines/common/tests/envs/mnist_env.py b/baselines/common/tests/envs/mnist_env.py new 
file mode 100644 index 0000000..563e215 --- /dev/null +++ b/baselines/common/tests/envs/mnist_env.py @@ -0,0 +1,70 @@ +import os.path as osp +import numpy as np +import tempfile +import filelock +from gym import Env +from gym.spaces import Discrete, Box + + + +class MnistEnv(Env): + def __init__( + self, + seed=0, + episode_len=None, + no_images=None + ): + from tensorflow.examples.tutorials.mnist import input_data + # we could use temporary directory for this with a context manager and + # TemporaryDirecotry, but then each test that uses mnist would re-download the data + # this way the data is not cleaned up, but we only download it once per machine + mnist_path = osp.join(tempfile.gettempdir(), 'MNIST_data') + with filelock.FileLock(mnist_path + '.lock'): + self.mnist = input_data.read_data_sets(mnist_path) + + self.np_random = np.random.RandomState() + self.np_random.seed(seed) + + self.observation_space = Box(low=0.0, high=1.0, shape=(28,28,1)) + self.action_space = Discrete(10) + self.episode_len = episode_len + self.time = 0 + self.no_images = no_images + + self.train_mode() + self.reset() + + def reset(self): + self._choose_next_state() + self.time = 0 + + return self.state[0] + + def step(self, actions): + rew = self._get_reward(actions) + self._choose_next_state() + done = False + if self.episode_len and self.time >= self.episode_len: + rew = 0 + done = True + + return self.state[0], rew, done, {} + + def train_mode(self): + self.dataset = self.mnist.train + + def test_mode(self): + self.dataset = self.mnist.test + + def _choose_next_state(self): + max_index = (self.no_images if self.no_images is not None else self.dataset.num_examples) - 1 + index = self.np_random.randint(0, max_index) + image = self.dataset.images[index].reshape(28,28,1)*255 + label = self.dataset.labels[index] + self.state = (image, label) + self.time += 1 + + def _get_reward(self, actions): + return 1 if self.state[1] == actions else 0 + + diff --git a/baselines/common/tests/test_cartpole.py b/baselines/common/tests/test_cartpole.py new file mode 100644 index 0000000..359006c --- /dev/null +++ b/baselines/common/tests/test_cartpole.py @@ -0,0 +1,40 @@ +import pytest +import gym + +from baselines.run import get_learn_function +from baselines.common.tests.util import reward_per_episode_test + +common_kwargs = dict( + total_timesteps=30000, + network='mlp', + gamma=1.0, + seed=0, +) + +learn_kwargs = { + 'a2c' : dict(nsteps=32, value_network='copy', lr=0.05), + 'acktr': dict(nsteps=32, value_network='copy'), + 'deepq': {}, + 'ppo2': dict(value_network='copy'), + 'trpo_mpi': {} +} + +@pytest.mark.slow +@pytest.mark.parametrize("alg", learn_kwargs.keys()) +def test_cartpole(alg): + ''' + Test if the algorithm (with an mlp policy) + can learn to balance the cartpole + ''' + + kwargs = common_kwargs.copy() + kwargs.update(learn_kwargs[alg]) + + learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) + def env_fn(): + + env = gym.make('CartPole-v0') + env.seed(0) + return env + + reward_per_episode_test(env_fn, learn_fn, 100) diff --git a/baselines/common/tests/test_fixed_sequence.py b/baselines/common/tests/test_fixed_sequence.py new file mode 100644 index 0000000..f15ce0f --- /dev/null +++ b/baselines/common/tests/test_fixed_sequence.py @@ -0,0 +1,51 @@ +import pytest +from baselines.common.tests.envs.fixed_sequence_env import FixedSequenceEnv + +from baselines.common.tests.util import simple_test +from baselines.run import get_learn_function + +common_kwargs = dict( + seed=0, + total_timesteps=50000, +) + 
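Editor's note: these tests all lean on the same pattern; get_learn_function(alg) returns the algorithm's learn(...), which after this refactor takes an env, a network name and keyword overrides, and returns a model whose step() can be queried. A stand-alone sketch of that pattern (CartPole and the hyperparameters are illustrative):

    import gym
    from baselines.run import get_learn_function
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    env = DummyVecEnv([lambda: gym.make('CartPole-v0')])
    learn = get_learn_function('ppo2')
    model = learn(env=env, network='mlp', total_timesteps=30000, seed=0)
    actions, values, state, neglogp = model.step(env.reset())   # state is None for an mlp policy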
+learn_kwargs = { + 'a2c': {}, + 'ppo2': dict(nsteps=10, ent_coef=0.0, nminibatches=1), + # TODO enable sequential models for trpo_mpi (proper handling of nbatch and nsteps) + # github issue: https://github.com/openai/baselines/issues/188 + # 'trpo_mpi': lambda e, p: trpo_mpi.learn(policy_fn=p(env=e), env=e, max_timesteps=30000, timesteps_per_batch=100, cg_iters=10, gamma=0.9, lam=1.0, max_kl=0.001) +} + + +alg_list = learn_kwargs.keys() +rnn_list = ['lstm'] + +@pytest.mark.slow +@pytest.mark.parametrize("alg", alg_list) +@pytest.mark.parametrize("rnn", rnn_list) +def test_fixed_sequence(alg, rnn): + ''' + Test if the algorithm (with a given policy) + can learn an identity transformation (i.e. return observation as an action) + ''' + + kwargs = learn_kwargs[alg] + kwargs.update(common_kwargs) + + episode_len = 5 + env_fn = lambda: FixedSequenceEnv(10, episode_len=episode_len) + learn = lambda e: get_learn_function(alg)( + env=e, + network=rnn, + **kwargs + ) + + simple_test(env_fn, learn, 0.7) + + +if __name__ == '__main__': + test_fixed_sequence('ppo2', 'lstm') + + + diff --git a/baselines/common/tests/test_identity.py b/baselines/common/tests/test_identity.py new file mode 100644 index 0000000..71d5a3e --- /dev/null +++ b/baselines/common/tests/test_identity.py @@ -0,0 +1,55 @@ +import pytest +from baselines.common.tests.envs.identity_env import DiscreteIdentityEnv, BoxIdentityEnv +from baselines.run import get_learn_function +from baselines.common.tests.util import simple_test + +common_kwargs = dict( + total_timesteps=30000, + network='mlp', + gamma=0.9, + seed=0, +) + +learn_kwargs = { + 'a2c' : {}, + 'acktr': {}, + 'deepq': {}, + 'ppo2': dict(lr=1e-3, nsteps=64, ent_coef=0.0), + 'trpo_mpi': dict(timesteps_per_batch=100, cg_iters=10, gamma=0.9, lam=1.0, max_kl=0.01) +} + + +@pytest.mark.slow +@pytest.mark.parametrize("alg", learn_kwargs.keys()) +def test_discrete_identity(alg): + ''' + Test if the algorithm (with an mlp policy) + can learn an identity transformation (i.e. return observation as an action) + ''' + + kwargs = learn_kwargs[alg] + kwargs.update(common_kwargs) + + learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) + env_fn = lambda: DiscreteIdentityEnv(10, episode_len=100) + simple_test(env_fn, learn_fn, 0.9) + +@pytest.mark.slow +@pytest.mark.parametrize("alg", ['a2c', 'ppo2', 'trpo_mpi']) +def test_continuous_identity(alg): + ''' + Test if the algorithm (with an mlp policy) + can learn an identity transformation (i.e. return observation as an action) + to a required precision + ''' + + kwargs = learn_kwargs[alg] + kwargs.update(common_kwargs) + learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) + + env_fn = lambda: BoxIdentityEnv((1,), episode_len=100) + simple_test(env_fn, learn_fn, -0.1) + +if __name__ == '__main__': + test_continuous_identity('a2c') + diff --git a/baselines/common/tests/test_mnist.py b/baselines/common/tests/test_mnist.py new file mode 100644 index 0000000..5489c3a --- /dev/null +++ b/baselines/common/tests/test_mnist.py @@ -0,0 +1,50 @@ +import pytest + +# from baselines.acer import acer_simple as acer +from baselines.common.tests.envs.mnist_env import MnistEnv +from baselines.common.tests.util import simple_test +from baselines.run import get_learn_function + + +# TODO investigate a2c and ppo2 failures - is it due to bad hyperparameters for this problem? 
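Editor's note: for what the identity tests above actually exercise, here is the environment contract in isolation (the size and episode length are illustrative): the observation is itself the correct action, so echoing it back earns a reward of 1.

    from baselines.common.tests.envs.identity_env import DiscreteIdentityEnv

    env = DiscreteIdentityEnv(10, episode_len=100)
    obs = env.reset()                   # an integer in [0, 10)
    _, rew, done, _ = env.step(obs)     # acting with the observation is the "identity" answer
    assert rew == 1 and not done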
+# GitHub issue https://github.com/openai/baselines/issues/189 +common_kwargs = { + 'seed': 0, + 'network':'cnn', + 'gamma':0.9, + 'pad':'SAME' +} + +learn_args = { + 'a2c': dict(total_timesteps=50000), + # TODO need to resolve inference (step) API differences for acer; also slow + # 'acer': dict(seed=0, total_timesteps=1000), + 'deepq': dict(total_timesteps=5000), + 'acktr': dict(total_timesteps=30000), + 'ppo2': dict(total_timesteps=50000, lr=1e-3, nsteps=128, ent_coef=0.0), + 'trpo_mpi': dict(total_timesteps=80000, timesteps_per_batch=100, cg_iters=10, lam=1.0, max_kl=0.001) +} + + +#tests pass, but are too slow on travis. Same algorithms are covered +# by other tests with less compute-hungry nn's and by benchmarks +@pytest.mark.skip +@pytest.mark.slow +@pytest.mark.parametrize("alg", learn_args.keys()) +def test_mnist(alg): + ''' + Test if the algorithm can learn to classify MNIST digits. + Uses CNN policy. + ''' + + learn_kwargs = learn_args[alg] + learn_kwargs.update(common_kwargs) + + learn = get_learn_function(alg) + learn_fn = lambda e: learn(env=e, **learn_kwargs) + env_fn = lambda: MnistEnv(seed=0, episode_len=100) + + simple_test(env_fn, learn_fn, 0.6) + +if __name__ == '__main__': + test_mnist('deepq') diff --git a/baselines/common/tests/test_serialization.py b/baselines/common/tests/test_serialization.py new file mode 100644 index 0000000..ca3d222 --- /dev/null +++ b/baselines/common/tests/test_serialization.py @@ -0,0 +1,97 @@ +import os +import tempfile +import pytest +import tensorflow as tf +import numpy as np + +from baselines.common.tests.envs.mnist_env import MnistEnv +from baselines.common.vec_env.dummy_vec_env import DummyVecEnv +from baselines.run import get_learn_function +from baselines.common.tf_util import make_session, get_session + +from functools import partial + + +learn_kwargs = { + 'deepq': {}, + 'a2c': {}, + 'acktr': {}, + 'ppo2': {'nminibatches': 1, 'nsteps': 10}, + 'trpo_mpi': {}, +} + +network_kwargs = { + 'mlp': {}, + 'cnn': {'pad': 'SAME'}, + 'lstm': {}, + 'cnn_lnlstm': {'pad': 'SAME'} +} + + +@pytest.mark.parametrize("learn_fn", learn_kwargs.keys()) +@pytest.mark.parametrize("network_fn", network_kwargs.keys()) +def test_serialization(learn_fn, network_fn): + ''' + Test if the trained model can be serialized + ''' + + + if network_fn.endswith('lstm') and learn_fn in ['acktr', 'trpo_mpi', 'deepq']: + # TODO make acktr work with recurrent policies + # and test + # github issue: https://github.com/openai/baselines/issues/194 + return + + env = DummyVecEnv([lambda: MnistEnv(10, episode_len=100)]) + ob = env.reset().copy() + learn = get_learn_function(learn_fn) + + kwargs = {} + kwargs.update(network_kwargs[network_fn]) + kwargs.update(learn_kwargs[learn_fn]) + + + learn = partial(learn, env=env, network=network_fn, seed=0, **kwargs) + + with tempfile.TemporaryDirectory() as td: + model_path = os.path.join(td, 'serialization_test_model') + + with tf.Graph().as_default(), make_session().as_default(): + model = learn(total_timesteps=100) + model.save(model_path) + mean1, std1 = _get_action_stats(model, ob) + variables_dict1 = _serialize_variables() + + with tf.Graph().as_default(), make_session().as_default(): + model = learn(total_timesteps=0, load_path=model_path) + mean2, std2 = _get_action_stats(model, ob) + variables_dict2 = _serialize_variables() + + for k, v in variables_dict1.items(): + np.testing.assert_allclose(v, variables_dict2[k], atol=0.01, + err_msg='saved and loaded variable {} value mismatch'.format(k)) + + 
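Editor's note: outside the test harness, the same save/load round trip looks like the sketch below; the path and hyperparameters are illustrative, and each phase uses a fresh graph exactly as test_serialization does above:

    import gym
    import tensorflow as tf
    from baselines.run import get_learn_function
    from baselines.common.tf_util import make_session
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    learn = get_learn_function('ppo2')
    env_fn = lambda: DummyVecEnv([lambda: gym.make('CartPole-v0')])

    with tf.Graph().as_default(), make_session().as_default():
        model = learn(env=env_fn(), network='mlp', total_timesteps=2048, seed=0)
        model.save('/tmp/cartpole_ppo2')   # joblib dump of the trainable variables

    with tf.Graph().as_default(), make_session().as_default():
        model = learn(env=env_fn(), network='mlp', total_timesteps=0, seed=0,
                      load_path='/tmp/cartpole_ppo2')   # rebuild the graph, then restore weights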
np.testing.assert_allclose(mean1, mean2, atol=0.5) + np.testing.assert_allclose(std1, std2, atol=0.5) + + + +def _serialize_variables(): + sess = get_session() + variables = tf.trainable_variables() + values = sess.run(variables) + return {var.name: value for var, value in zip(variables, values)} + + +def _get_action_stats(model, ob): + ntrials = 1000 + if model.initial_state is None or model.initial_state == []: + actions = np.array([model.step(ob)[0] for _ in range(ntrials)]) + else: + actions = np.array([model.step(ob, S=model.initial_state, M=[False])[0] for _ in range(ntrials)]) + + mean = np.mean(actions, axis=0) + std = np.std(actions, axis=0) + + return mean, std + diff --git a/baselines/common/tests/util.py b/baselines/common/tests/util.py new file mode 100644 index 0000000..30b8954 --- /dev/null +++ b/baselines/common/tests/util.py @@ -0,0 +1,91 @@ +import tensorflow as tf +import numpy as np +from gym.spaces import np_random +from baselines.common.vec_env.dummy_vec_env import DummyVecEnv + +N_TRIALS = 10000 +N_EPISODES = 100 + +def simple_test(env_fn, learn_fn, min_reward_fraction, n_trials=N_TRIALS): + np.random.seed(0) + np_random.seed(0) + + env = DummyVecEnv([env_fn]) + + + with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default(): + tf.set_random_seed(0) + + model = learn_fn(env) + + sum_rew = 0 + done = True + + for i in range(n_trials): + if done: + obs = env.reset() + state = model.initial_state + + if state is not None: + a, v, state, _ = model.step(obs, S=state, M=[False]) + else: + a, v, _, _ = model.step(obs) + + obs, rew, done, _ = env.step(a) + sum_rew += float(rew) + + print("Reward in {} trials is {}".format(n_trials, sum_rew)) + assert sum_rew > min_reward_fraction * n_trials, \ + 'sum of rewards {} is less than {} of the total number of trials {}'.format(sum_rew, min_reward_fraction, n_trials) + + + +def reward_per_episode_test(env_fn, learn_fn, min_avg_reward, n_trials=N_EPISODES): + env = DummyVecEnv([env_fn]) + + with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default(): + model = learn_fn(env) + + N_TRIALS = 100 + + observations, actions, rewards = rollout(env, model, N_TRIALS) + rewards = [sum(r) for r in rewards] + + avg_rew = sum(rewards) / N_TRIALS + print("Average reward in {} episodes is {}".format(n_trials, avg_rew)) + assert avg_rew > min_avg_reward, \ + 'average reward in {} episodes ({}) is less than {}'.format(n_trials, avg_rew, min_avg_reward) + +def rollout(env, model, n_trials): + rewards = [] + actions = [] + observations = [] + + for i in range(n_trials): + obs = env.reset() + state = model.initial_state + episode_rew = [] + episode_actions = [] + episode_obs = [] + + while True: + if state is not None: + a, v, state, _ = model.step(obs, S=state, M=[False]) + else: + a,v, _, _ = model.step(obs) + + obs, rew, done, _ = env.step(a) + + episode_rew.append(rew) + episode_actions.append(a) + episode_obs.append(obs) + + if done: + break + + rewards.append(episode_rew) + actions.append(episode_actions) + observations.append(episode_obs) + + return observations, actions, rewards + diff --git a/baselines/common/tf_util.py b/baselines/common/tf_util.py index afcd593..733a15e 100644 --- a/baselines/common/tf_util.py +++ b/baselines/common/tf_util.py @@ -1,3 +1,4 @@ +import joblib import numpy as np import tensorflow as tf # pylint: ignore-module import copy @@ -48,17 +49,28 @@ def huber_loss(x, delta=1.0): # Global session # 
================================================================ -def make_session(num_cpu=None, make_default=False, graph=None): +def get_session(config=None): + """Get default session or create one with a given config""" + sess = tf.get_default_session() + if sess is None: + sess = make_session(config=config, make_default=True) + return sess + +def make_session(config=None, num_cpu=None, make_default=False, graph=None): """Returns a session that will use CPU's only""" if num_cpu is None: num_cpu = int(os.getenv('RCALL_NUM_CPU', multiprocessing.cpu_count())) - tf_config = tf.ConfigProto( - inter_op_parallelism_threads=num_cpu, - intra_op_parallelism_threads=num_cpu) + if config is None: + config = tf.ConfigProto( + allow_soft_placement=True, + inter_op_parallelism_threads=num_cpu, + intra_op_parallelism_threads=num_cpu) + config.gpu_options.allow_growth = True + if make_default: - return tf.InteractiveSession(config=tf_config, graph=graph) + return tf.InteractiveSession(config=config, graph=graph) else: - return tf.Session(config=tf_config, graph=graph) + return tf.Session(config=config, graph=graph) def single_threaded_session(): """Returns a session which will only use a single CPU""" @@ -76,7 +88,7 @@ ALREADY_INITIALIZED = set() def initialize(): """Initialize all the uninitialized variables in the global scope.""" new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED - tf.get_default_session().run(tf.variables_initializer(new_variables)) + get_session().run(tf.variables_initializer(new_variables)) ALREADY_INITIALIZED.update(new_variables) # ================================================================ @@ -85,7 +97,7 @@ def initialize(): def normc_initializer(std=1.0, axis=0): def _initializer(shape, dtype=None, partition_info=None): # pylint: disable=W0613 - out = np.random.randn(*shape).astype(np.float32) + out = np.random.randn(*shape).astype(dtype.as_numpy_dtype) out *= std / np.sqrt(np.square(out).sum(axis=axis, keepdims=True)) return tf.constant(out) return _initializer @@ -179,7 +191,7 @@ class _Function(object): if hasattr(inpt, 'make_feed_dict'): feed_dict.update(inpt.make_feed_dict(value)) else: - feed_dict[inpt] = value + feed_dict[inpt] = adjust_shape(inpt, value) def __call__(self, *args): assert len(args) <= len(self.inputs), "Too many arguments provided" @@ -189,8 +201,8 @@ class _Function(object): self._feed_input(feed_dict, inpt, value) # Update feed dict with givens. 
for inpt in self.givens: - feed_dict[inpt] = feed_dict.get(inpt, self.givens[inpt]) - results = tf.get_default_session().run(self.outputs_update, feed_dict=feed_dict)[:-1] + feed_dict[inpt] = adjust_shape(inpt, feed_dict.get(inpt, self.givens[inpt])) + results = get_session().run(self.outputs_update, feed_dict=feed_dict)[:-1] return results # ================================================================ @@ -243,27 +255,34 @@ class GetFlat(object): def __call__(self): return tf.get_default_session().run(self.op) +def flattenallbut0(x): + return tf.reshape(x, [-1, intprod(x.get_shape().as_list()[1:])]) + +# ============================================================= +# TF placeholders management +# ============================================================ + _PLACEHOLDER_CACHE = {} # name -> (placeholder, dtype, shape) def get_placeholder(name, dtype, shape): if name in _PLACEHOLDER_CACHE: out, dtype1, shape1 = _PLACEHOLDER_CACHE[name] - assert dtype1 == dtype and shape1 == shape - return out - else: - out = tf.placeholder(dtype=dtype, shape=shape, name=name) - _PLACEHOLDER_CACHE[name] = (out, dtype, shape) - return out + if out.graph == tf.get_default_graph(): + assert dtype1 == dtype and shape1 == shape, \ + 'Placeholder with name {} has already been registered and has shape {}, different from requested {}'.format(name, shape1, shape) + return out + + out = tf.placeholder(dtype=dtype, shape=shape, name=name) + _PLACEHOLDER_CACHE[name] = (out, dtype, shape) + return out def get_placeholder_cached(name): return _PLACEHOLDER_CACHE[name][0] -def flattenallbut0(x): - return tf.reshape(x, [-1, intprod(x.get_shape().as_list()[1:])]) # ================================================================ -# Diagnostics +# Diagnostics # ================================================================ def display_var_info(vars): @@ -283,7 +302,7 @@ def display_var_info(vars): def get_available_gpus(): # recipe from here: # https://stackoverflow.com/questions/38559755/how-to-get-current-available-gpus-in-tensorflow?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa - + from tensorflow.python.client import device_lib local_device_protos = device_lib.list_local_devices() return [x.name for x in local_device_protos if x.device_type == 'GPU'] @@ -292,13 +311,95 @@ def get_available_gpus(): # Saving variables # ================================================================ -def load_state(fname): +def load_state(fname, sess=None): + sess = sess or get_session() saver = tf.train.Saver() saver.restore(tf.get_default_session(), fname) -def save_state(fname): +def save_state(fname, sess=None): + sess = sess or get_session() os.makedirs(os.path.dirname(fname), exist_ok=True) saver = tf.train.Saver() saver.save(tf.get_default_session(), fname) +# The methods above and below are clearly doing the same thing, and in a rather similar way +# TODO: ensure there is no subtle differences and remove one + +def save_variables(save_path, variables=None, sess=None): + sess = sess or get_session() + variables = variables or tf.trainable_variables() + + ps = sess.run(variables) + save_dict = {v.name: value for v, value in zip(variables, ps)} + os.makedirs(os.path.dirname(save_path), exist_ok=True) + joblib.dump(save_dict, save_path) + +def load_variables(load_path, variables=None, sess=None): + sess = sess or get_session() + variables = variables or tf.trainable_variables() + + loaded_params = joblib.load(os.path.expanduser(load_path)) + restores = [] + for v in variables: + 
restores.append(v.assign(loaded_params[v.name])) + sess.run(restores) + + +# ================================================================ +# Shape adjustment for feeding into tf placeholders +# ================================================================ +def adjust_shape(placeholder, data): + ''' + adjust shape of the data to the shape of the placeholder if possible. + If shape is incompatible, AssertionError is thrown + + Parameters: + placeholder tensorflow input placeholder + + data input data to be (potentially) reshaped to be fed into placeholder + + Returns: + reshaped data + ''' + + if not isinstance(data, np.ndarray) and not isinstance(data, list): + return data + if isinstance(data, list): + data = np.array(data) + + placeholder_shape = [x or -1 for x in placeholder.shape.as_list()] + + assert _check_shape(placeholder_shape, data.shape), \ + 'Shape of data {} is not compatible with shape of the placeholder {}'.format(data.shape, placeholder_shape) + + return np.reshape(data, placeholder_shape) + + +def _check_shape(placeholder_shape, data_shape): + ''' check if two shapes are compatible (i.e. differ only by dimensions of size 1, or by the batch dimension)''' + + return True + squeezed_placeholder_shape = _squeeze_shape(placeholder_shape) + squeezed_data_shape = _squeeze_shape(data_shape) + + for i, s_data in enumerate(squeezed_data_shape): + s_placeholder = squeezed_placeholder_shape[i] + if s_placeholder != -1 and s_data != s_placeholder: + return False + + return True + + +def _squeeze_shape(shape): + return [x for x in shape if x != 1] + +# Tensorboard interfacing +# ================================================================ + +def launch_tensorboard_in_background(log_dir): + from tensorboard import main as tb + import threading + tf.flags.FLAGS.logdir = log_dir + t = threading.Thread(target=tb.main, args=([])) + t.start() diff --git a/baselines/common/vec_env/dummy_vec_env.py b/baselines/common/vec_env/dummy_vec_env.py index d0ae455..477bf30 100644 --- a/baselines/common/vec_env/dummy_vec_env.py +++ b/baselines/common/vec_env/dummy_vec_env.py @@ -30,15 +30,30 @@ class DummyVecEnv(VecEnv): self.actions = None def step_async(self, actions): - self.actions = actions + listify = True + try: + if len(actions) == self.num_envs: + listify = False + except TypeError: + pass + + if not listify: + self.actions = actions + else: + assert self.num_envs == 1, "actions {} is either not a list or has a wrong size - cannot match to {} environments".format(actions, self.num_envs) + self.actions = [actions] def step_wait(self): for e in range(self.num_envs): - obs, self.buf_rews[e], self.buf_dones[e], self.buf_infos[e] = self.envs[e].step(self.actions[e]) + action = self.actions[e] + if isinstance(self.envs[e].action_space, spaces.Discrete): + action = int(action) + + obs, self.buf_rews[e], self.buf_dones[e], self.buf_infos[e] = self.envs[e].step(action) if self.buf_dones[e]: obs = self.envs[e].reset() self._save_obs(e, obs) - return (self._obs_from_buf(), np.copy(self.buf_rews), np.copy(self.buf_dones), + return (np.copy(self._obs_from_buf()), np.copy(self.buf_rews), np.copy(self.buf_dones), self.buf_infos.copy()) def reset(self): diff --git a/baselines/common/vec_env/subproc_vec_env.py b/baselines/common/vec_env/subproc_vec_env.py index fb55df4..e5b5b32 100644 --- a/baselines/common/vec_env/subproc_vec_env.py +++ b/baselines/common/vec_env/subproc_vec_env.py @@ -7,26 +7,30 @@ from baselines.common.tile_images import tile_images def worker(remote, parent_remote, env_fn_wrapper): 
parent_remote.close() env = env_fn_wrapper.x() - while True: - cmd, data = remote.recv() - if cmd == 'step': - ob, reward, done, info = env.step(data) - if done: + try: + while True: + cmd, data = remote.recv() + if cmd == 'step': + ob, reward, done, info = env.step(data) + if done: + ob = env.reset() + remote.send((ob, reward, done, info)) + elif cmd == 'reset': ob = env.reset() - remote.send((ob, reward, done, info)) - elif cmd == 'reset': - ob = env.reset() - remote.send(ob) - elif cmd == 'render': - remote.send(env.render(mode='rgb_array')) - elif cmd == 'close': - remote.close() - break - elif cmd == 'get_spaces': - remote.send((env.observation_space, env.action_space)) - else: - raise NotImplementedError - + remote.send(ob) + elif cmd == 'render': + remote.send(env.render(mode='rgb_array')) + elif cmd == 'close': + remote.close() + break + elif cmd == 'get_spaces': + remote.send((env.observation_space, env.action_space)) + else: + raise NotImplementedError + except KeyboardInterrupt: + print('SubprocVecEnv worker: got KeyboardInterrupt') + finally: + env.close() class SubprocVecEnv(VecEnv): def __init__(self, env_fns, spaces=None): diff --git a/baselines/common/vec_env/vec_normalize.py b/baselines/common/vec_env/vec_normalize.py index dda767d..5d5c5ad 100644 --- a/baselines/common/vec_env/vec_normalize.py +++ b/baselines/common/vec_env/vec_normalize.py @@ -10,6 +10,8 @@ class VecNormalize(VecEnvWrapper): VecEnvWrapper.__init__(self, venv) self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None self.ret_rms = RunningMeanStd(shape=()) if ret else None + #self.ob_rms = TfRunningMeanStd(shape=self.observation_space.shape, scope='observation_running_mean_std') if ob else None + #self.ret_rms = TfRunningMeanStd(shape=(), scope='return_running_mean_std') if ret else None self.clipob = clipob self.cliprew = cliprew self.ret = np.zeros(self.num_envs) diff --git a/baselines/ddpg/ddpg.py b/baselines/ddpg/ddpg.py index e2d4950..6664cc4 100644 --- a/baselines/ddpg/ddpg.py +++ b/baselines/ddpg/ddpg.py @@ -26,9 +26,9 @@ def reduce_std(x, axis=None, keepdims=False): return tf.sqrt(reduce_var(x, axis=axis, keepdims=keepdims)) def reduce_var(x, axis=None, keepdims=False): - m = tf.reduce_mean(x, axis=axis, keep_dims=True) + m = tf.reduce_mean(x, axis=axis, keepdims=True) devs_squared = tf.square(x - m) - return tf.reduce_mean(devs_squared, axis=axis, keep_dims=keepdims) + return tf.reduce_mean(devs_squared, axis=axis, keepdims=keepdims) def get_target_updates(vars, target_vars, tau): logger.info('setting up target updates ...') diff --git a/baselines/deepq/__init__.py b/baselines/deepq/__init__.py index 4472399..6859c05 100644 --- a/baselines/deepq/__init__.py +++ b/baselines/deepq/__init__.py @@ -1,8 +1,8 @@ from baselines.deepq import models # noqa from baselines.deepq.build_graph import build_act, build_train # noqa -from baselines.deepq.simple import learn, load # noqa +from baselines.deepq.deepq import learn, load_act # noqa from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer # noqa def wrap_atari_dqn(env): from baselines.common.atari_wrappers import wrap_deepmind - return wrap_deepmind(env, frame_stack=True, scale=True) \ No newline at end of file + return wrap_deepmind(env, frame_stack=True, scale=True) diff --git a/baselines/deepq/simple.py b/baselines/deepq/deepq.py similarity index 90% rename from baselines/deepq/simple.py rename to baselines/deepq/deepq.py index 4bad145..7d44acf 100644 --- a/baselines/deepq/simple.py +++ 
b/baselines/deepq/deepq.py @@ -10,20 +10,24 @@ import baselines.common.tf_util as U from baselines.common.tf_util import load_state, save_state from baselines import logger from baselines.common.schedules import LinearSchedule -from baselines.common.input import observation_input +from baselines.common import set_global_seeds from baselines import deepq from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer from baselines.deepq.utils import ObservationInput +from baselines.common.tf_util import get_session +from baselines.deepq.models import build_q_func + class ActWrapper(object): def __init__(self, act, act_params): self._act = act self._act_params = act_params + self.initial_state = None @staticmethod - def load(path): + def load_act(self, path): with open(path, "rb") as f: model_data, act_params = cloudpickle.load(f) act = deepq.build_act(**act_params) @@ -42,7 +46,10 @@ class ActWrapper(object): def __call__(self, *args, **kwargs): return self._act(*args, **kwargs) - def save(self, path=None): + def step(self, observation, **kwargs): + return self._act([observation], **kwargs), None, None, None + + def save_act(self, path=None): """Save model to a pickle located at `path`""" if path is None: path = os.path.join(logger.get_dir(), "model.pkl") @@ -61,8 +68,11 @@ class ActWrapper(object): with open(path, "wb") as f: cloudpickle.dump((model_data, self._act_params), f) + def save(self, path): + save_state(path) -def load(path): + +def load_act(path): """Load act function that was returned by learn function. Parameters @@ -76,13 +86,14 @@ def load(path): function that takes a batch of observations and returns actions. """ - return ActWrapper.load(path) + return ActWrapper.load_act(path) def learn(env, - q_func, + network, + seed=None, lr=5e-4, - max_timesteps=100000, + total_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, @@ -100,7 +111,10 @@ def learn(env, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, - callback=None): + callback=None, + load_path=None, + **network_kwargs + ): """Train a deepq model. Parameters @@ -119,7 +133,7 @@ def learn(env, and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer - max_timesteps: int + total_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer @@ -153,12 +167,16 @@ def learn(env, initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value - to 1.0. If set to None equals to max_timesteps. + to 1.0. If set to None equals to total_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. + load_path: str + path to load the model from. (default: None) + **network_kwargs + additional keyword arguments to pass to the network builder. 
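# An illustrative call (not part of the patch) of the refactored learn() documented
# above: the Q-network is now chosen by name ('mlp', 'cnn', 'conv_only', ...) instead
# of passing a q_func, and total_timesteps replaces max_timesteps. The CartPole env and
# the hyperparameters below are examples, not values prescribed by this change.
import gym
from baselines import deepq

env = gym.make('CartPole-v0')
act = deepq.learn(env, network='mlp', seed=0, lr=1e-3, total_timesteps=100000,
                  exploration_fraction=0.1, exploration_final_eps=0.02, print_freq=10)
act.save('cartpole_model')        # checkpoints TF variables (save_state)
act.save_act('cartpole_act.pkl')  # pickles the act function itself (ActWrapper.save_act)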
Returns ------- @@ -168,8 +186,10 @@ def learn(env, """ # Create all the functions necessary to train the model - sess = tf.Session() - sess.__enter__() + sess = get_session() + set_global_seeds(seed) + + q_func = build_q_func(network, **network_kwargs) # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph @@ -194,12 +214,12 @@ def learn(env, } act = ActWrapper(act, act_params) - + # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: - prioritized_replay_beta_iters = max_timesteps + prioritized_replay_beta_iters = total_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) @@ -207,7 +227,7 @@ def learn(env, replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. - exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), + exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps), initial_p=1.0, final_p=exploration_final_eps) @@ -225,12 +245,17 @@ def learn(env, model_file = os.path.join(td, "model") model_saved = False + if tf.train.latest_checkpoint(td) is not None: load_state(model_file) logger.log('Loaded model from {}'.format(model_file)) model_saved = True + elif load_path is not None: + load_state(load_path) + logger.log('Loaded model from {}'.format(load_path)) + - for t in range(max_timesteps): + for t in range(total_timesteps): if callback is not None: if callback(locals(), globals()): break diff --git a/baselines/deepq/defaults.py b/baselines/deepq/defaults.py new file mode 100644 index 0000000..d41fb18 --- /dev/null +++ b/baselines/deepq/defaults.py @@ -0,0 +1,21 @@ +def atari(): + return dict( + network='conv_only', + lr=1e-4, + buffer_size=10000, + exploration_fraction=0.1, + exploration_final_eps=0.01, + train_freq=4, + learning_starts=10000, + target_network_update_freq=1000, + gamma=0.99, + prioritized_replay=True, + prioritized_replay_alpha=0.6, + checkpoint_freq=10000, + checkpoint_path=None, + dueling=True + ) + +def retro(): + return atari() + diff --git a/baselines/deepq/experiments/enjoy_retro.py b/baselines/deepq/experiments/enjoy_retro.py new file mode 100644 index 0000000..526af16 --- /dev/null +++ b/baselines/deepq/experiments/enjoy_retro.py @@ -0,0 +1,34 @@ +import argparse + +import numpy as np + +from baselines import deepq +from baselines.common import retro_wrappers + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--env', help='environment ID', default='SuperMarioBros-Nes') + parser.add_argument('--gamestate', help='game state to load', default='Level1-1') + parser.add_argument('--model', help='model pickle file from ActWrapper.save', default='model.pkl') + args = parser.parse_args() + + env = retro_wrappers.make_retro(game=args.env, state=args.gamestate, max_episode_steps=None) + env = retro_wrappers.wrap_deepmind_retro(env) + act = deepq.load(args.model) + + while True: + obs, done = env.reset(), False + episode_rew = 0 + while not done: + env.render() + action = act(obs[None])[0] + env_action = np.zeros(env.action_space.n) + env_action[action] = 1 + obs, rew, done, _ = env.step(env_action) + episode_rew += rew + print('Episode reward', episode_rew) + + +if __name__ == '__main__': + main() diff --git a/baselines/deepq/experiments/run_retro.py 
b/baselines/deepq/experiments/run_retro.py new file mode 100644 index 0000000..0338361 --- /dev/null +++ b/baselines/deepq/experiments/run_retro.py @@ -0,0 +1,49 @@ +import argparse + +from baselines import deepq +from baselines.common import set_global_seeds +from baselines import bench +from baselines import logger +from baselines.common import retro_wrappers +import retro + + +def main(): + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('--env', help='environment ID', default='SuperMarioBros-Nes') + parser.add_argument('--gamestate', help='game state to load', default='Level1-1') + parser.add_argument('--seed', help='seed', type=int, default=0) + parser.add_argument('--num-timesteps', type=int, default=int(10e6)) + args = parser.parse_args() + logger.configure() + set_global_seeds(args.seed) + env = retro_wrappers.make_retro(game=args.env, state=args.gamestate, max_episode_steps=10000, use_restricted_actions=retro.Actions.DISCRETE) + env.seed(args.seed) + env = bench.Monitor(env, logger.get_dir()) + env = retro_wrappers.wrap_deepmind_retro(env) + + model = deepq.models.cnn_to_mlp( + convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], + hiddens=[256], + dueling=True + ) + act = deepq.learn( + env, + q_func=model, + lr=1e-4, + max_timesteps=args.num_timesteps, + buffer_size=10000, + exploration_fraction=0.1, + exploration_final_eps=0.01, + train_freq=4, + learning_starts=10000, + target_network_update_freq=1000, + gamma=0.99, + prioritized_replay=True + ) + act.save() + env.close() + + +if __name__ == '__main__': + main() diff --git a/baselines/deepq/models.py b/baselines/deepq/models.py index 198d795..c41b707 100644 --- a/baselines/deepq/models.py +++ b/baselines/deepq/models.py @@ -89,3 +89,41 @@ def cnn_to_mlp(convs, hiddens, dueling=False, layer_norm=False): return lambda *args, **kwargs: _cnn_to_mlp(convs, hiddens, dueling, layer_norm=layer_norm, *args, **kwargs) + + +def build_q_func(network, hiddens=[256], dueling=True, layer_norm=False, **network_kwargs): + if isinstance(network, str): + from baselines.common.models import get_network_builder + network = get_network_builder(network)(**network_kwargs) + + def q_func_builder(input_placeholder, num_actions, scope, reuse=False): + with tf.variable_scope(scope, reuse=reuse): + latent, _ = network(input_placeholder) + latent = layers.flatten(latent) + + with tf.variable_scope("action_value"): + action_out = latent + for hidden in hiddens: + action_out = layers.fully_connected(action_out, num_outputs=hidden, activation_fn=None) + if layer_norm: + action_out = layers.layer_norm(action_out, center=True, scale=True) + action_out = tf.nn.relu(action_out) + action_scores = layers.fully_connected(action_out, num_outputs=num_actions, activation_fn=None) + + if dueling: + with tf.variable_scope("state_value"): + state_out = latent + for hidden in hiddens: + state_out = layers.fully_connected(state_out, num_outputs=hidden, activation_fn=None) + if layer_norm: + state_out = layers.layer_norm(state_out, center=True, scale=True) + state_out = tf.nn.relu(state_out) + state_score = layers.fully_connected(state_out, num_outputs=1, activation_fn=None) + action_scores_mean = tf.reduce_mean(action_scores, 1) + action_scores_centered = action_scores - tf.expand_dims(action_scores_mean, 1) + q_out = state_score + action_scores_centered + else: + q_out = action_scores + return q_out + + return q_func_builder diff --git a/baselines/deepq/test_identity.py b/baselines/deepq/test_identity.py deleted file 
mode 100644 index ef57e70..0000000 --- a/baselines/deepq/test_identity.py +++ /dev/null @@ -1,43 +0,0 @@ -import tensorflow as tf -import random - -from baselines import deepq -from baselines.common.identity_env import IdentityEnv - - -def test_identity(): - - with tf.Graph().as_default(): - env = IdentityEnv(10) - random.seed(0) - - tf.set_random_seed(0) - - param_noise = False - model = deepq.models.mlp([32]) - act = deepq.learn( - env, - q_func=model, - lr=1e-3, - max_timesteps=10000, - buffer_size=50000, - exploration_fraction=0.1, - exploration_final_eps=0.02, - print_freq=10, - param_noise=param_noise, - ) - - tf.set_random_seed(0) - - N_TRIALS = 1000 - sum_rew = 0 - obs = env.reset() - for i in range(N_TRIALS): - obs, rew, done, _ = env.step(act([obs])) - sum_rew += rew - - assert sum_rew > 0.9 * N_TRIALS - - -if __name__ == '__main__': - test_identity() diff --git a/baselines/deepq/utils.py b/baselines/deepq/utils.py index 90b932e..2914f43 100644 --- a/baselines/deepq/utils.py +++ b/baselines/deepq/utils.py @@ -1,4 +1,5 @@ from baselines.common.input import observation_input +from baselines.common.tf_util import adjust_shape import tensorflow as tf @@ -36,7 +37,7 @@ class PlaceholderTfInput(TfInput): return self._placeholder def make_feed_dict(self, data): - return {self._placeholder: data} + return {self._placeholder: adjust_shape(self._placeholder, data)} class Uint8Input(PlaceholderTfInput): diff --git a/baselines/ppo1/run_atari.py b/baselines/ppo1/run_atari.py index 17941c6..96e3482 100644 --- a/baselines/ppo1/run_atari.py +++ b/baselines/ppo1/run_atari.py @@ -18,7 +18,7 @@ def train(env_id, num_timesteps, seed): logger.configure() else: logger.configure(format_strs=[]) - workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() + workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() if seed is not None else None set_global_seeds(workerseed) env = make_atari(env_id) def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613 diff --git a/baselines/ppo2/defaults.py b/baselines/ppo2/defaults.py new file mode 100644 index 0000000..447a68d --- /dev/null +++ b/baselines/ppo2/defaults.py @@ -0,0 +1,22 @@ +def mujoco(): + return dict( + nsteps=2048, + nminibatches=32, + lam=0.95, + gamma=0.99, + noptepochs=10, + log_interval=1, + ent_coef=0.0, + lr=lambda f: 3e-4 * f, + cliprange=0.2, + value_network='copy' + ) + +def atari(): + return dict( + nsteps=128, nminibatches=4, + lam=0.95, gamma=0.99, noptepochs=4, log_interval=1, + ent_coef=.01, + lr=lambda f : f * 2.5e-4, + cliprange=lambda f : f * 0.1, + ) diff --git a/baselines/ppo2/policies.py b/baselines/ppo2/policies.py deleted file mode 100644 index 6fbbb14..0000000 --- a/baselines/ppo2/policies.py +++ /dev/null @@ -1,146 +0,0 @@ -import numpy as np -import tensorflow as tf -from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm -from baselines.common.distributions import make_pdtype -from baselines.common.input import observation_input - -def nature_cnn(unscaled_images, **conv_kwargs): - """ - CNN from Nature paper. - """ - scaled_images = tf.cast(unscaled_images, tf.float32) / 255. 
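# A sketch (not part of the patch) of how the ppo2/defaults.py entries above are
# consumed: baselines/run.py (added later in this patch) calls the function named
# after the env type and feeds the resulting dict into learn(); command-line extra
# args can then override individual values. The override below is hypothetical.
from baselines.run import get_learn_function, get_learn_function_defaults

kwargs = get_learn_function_defaults('ppo2', 'atari')  # {'nsteps': 128, 'nminibatches': 4, ...}
kwargs['ent_coef'] = 0.001                             # hypothetical override
ppo2_learn = get_learn_function('ppo2')
# with a frame-stacked Atari VecEnv `env` in hand:
#     model = ppo2_learn(env=env, total_timesteps=int(1e6), **kwargs)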
- activ = tf.nn.relu - h = activ(conv(scaled_images, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2), - **conv_kwargs)) - h2 = activ(conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), **conv_kwargs)) - h3 = activ(conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), **conv_kwargs)) - h3 = conv_to_fc(h3) - return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))) - -class LnLstmPolicy(object): - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False): - nenv = nbatch // nsteps - X, processed_x = observation_input(ob_space, nbatch) - M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) - S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states - self.pdtype = make_pdtype(ac_space) - with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(processed_x) - xs = batch_to_seq(h, nenv, nsteps) - ms = batch_to_seq(M, nenv, nsteps) - h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm) - h5 = seq_to_batch(h5) - vf = fc(h5, 'v', 1) - self.pd, self.pi = self.pdtype.pdfromlatent(h5) - - v0 = vf[:, 0] - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32) - - def step(ob, state, mask): - return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask}) - - def value(ob, state, mask): - return sess.run(v0, {X:ob, S:state, M:mask}) - - self.X = X - self.M = M - self.S = S - self.vf = vf - self.step = step - self.value = value - -class LstmPolicy(object): - - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False): - nenv = nbatch // nsteps - self.pdtype = make_pdtype(ac_space) - X, processed_x = observation_input(ob_space, nbatch) - - M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) - S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states - with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(X) - xs = batch_to_seq(h, nenv, nsteps) - ms = batch_to_seq(M, nenv, nsteps) - h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm) - h5 = seq_to_batch(h5) - vf = fc(h5, 'v', 1) - self.pd, self.pi = self.pdtype.pdfromlatent(h5) - - v0 = vf[:, 0] - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32) - - def step(ob, state, mask): - return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask}) - - def value(ob, state, mask): - return sess.run(v0, {X:ob, S:state, M:mask}) - - self.X = X - self.M = M - self.S = S - self.vf = vf - self.step = step - self.value = value - -class CnnPolicy(object): - - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613 - self.pdtype = make_pdtype(ac_space) - X, processed_x = observation_input(ob_space, nbatch) - with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(processed_x, **conv_kwargs) - vf = fc(h, 'v', 1)[:,0] - self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01) - - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = None - - def step(ob, *_args, **_kwargs): - a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob}) - return a, v, self.initial_state, neglogp - - def value(ob, *_args, **_kwargs): - return sess.run(vf, {X:ob}) - - self.X = X - self.vf = vf - self.step = step - self.value = value - -class MlpPolicy(object): - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613 - self.pdtype = make_pdtype(ac_space) - with tf.variable_scope("model", reuse=reuse): - X, processed_x = 
observation_input(ob_space, nbatch) - activ = tf.tanh - processed_x = tf.layers.flatten(processed_x) - pi_h1 = activ(fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2))) - pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2))) - vf_h1 = activ(fc(processed_x, 'vf_fc1', nh=64, init_scale=np.sqrt(2))) - vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2))) - vf = fc(vf_h2, 'vf', 1)[:,0] - - self.pd, self.pi = self.pdtype.pdfromlatent(pi_h2, init_scale=0.01) - - - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = None - - def step(ob, *_args, **_kwargs): - a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob}) - return a, v, self.initial_state, neglogp - - def value(ob, *_args, **_kwargs): - return sess.run(vf, {X:ob}) - - self.X = X - self.vf = vf - self.step = step - self.value = value diff --git a/baselines/ppo2/ppo2.py b/baselines/ppo2/ppo2.py index fd34f52..3e14f26 100644 --- a/baselines/ppo2/ppo2.py +++ b/baselines/ppo2/ppo2.py @@ -1,21 +1,29 @@ import os import time -import joblib +import functools import numpy as np import os.path as osp import tensorflow as tf from baselines import logger from collections import deque -from baselines.common import explained_variance +from baselines.common import explained_variance, set_global_seeds +from baselines.common.policies import build_policy from baselines.common.runners import AbstractEnvRunner +from baselines.common.tf_util import get_session, save_variables, load_variables +from baselines.common.mpi_adam_optimizer import MpiAdamOptimizer + +from mpi4py import MPI +from baselines.common.tf_util import initialize +from baselines.common.mpi_util import sync_from_root class Model(object): def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm): - sess = tf.get_default_session() + sess = get_session() - act_model = policy(sess, ob_space, ac_space, nbatch_act, 1, reuse=False) - train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps, reuse=True) + with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): + act_model = policy(nbatch_act, 1, sess) + train_model = policy(nbatch_train, nsteps, sess) A = train_model.pdtype.sample_placeholder([None]) ADV = tf.placeholder(tf.float32, [None]) @@ -40,14 +48,16 @@ class Model(object): approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef - with tf.variable_scope('model'): - params = tf.trainable_variables() - grads = tf.gradients(loss, params) + params = tf.trainable_variables('ppo2_model') + trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) + grads_and_var = trainer.compute_gradients(loss, params) + grads, var = zip(*grads_and_var) + if max_grad_norm is not None: grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) - grads = list(zip(grads, params)) - trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) - _train = trainer.apply_gradients(grads) + grads_and_var = list(zip(grads, var)) + + _train = trainer.apply_gradients(grads_and_var) def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None): advs = returns - values @@ -63,17 +73,6 @@ class Model(object): )[:-1] self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac'] - def save(save_path): - ps = sess.run(params) - joblib.dump(ps, save_path) - - def load(load_path): - 
loaded_params = joblib.load(load_path) - restores = [] - for p, loaded_p in zip(params, loaded_params): - restores.append(p.assign(loaded_p)) - sess.run(restores) - # If you want to load weights, also save/load observation scaling inside VecNormalize self.train = train self.train_model = train_model @@ -81,9 +80,14 @@ class Model(object): self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state - self.save = save - self.load = load - tf.global_variables_initializer().run(session=sess) #pylint: disable=E1101 + + self.save = functools.partial(save_variables, sess=sess) + self.load = functools.partial(load_variables, sess=sess) + + if MPI.COMM_WORLD.Get_rank() == 0: + initialize() + global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") + sync_from_root(sess, global_variables) #pylint: disable=E1101 class Runner(AbstractEnvRunner): @@ -97,7 +101,7 @@ class Runner(AbstractEnvRunner): mb_states = self.states epinfos = [] for _ in range(self.nsteps): - actions, values, self.states, neglogpacs = self.model.step(self.obs, self.states, self.dones) + actions, values, self.states, neglogpacs = self.model.step(self.obs, S=self.states, M=self.dones) mb_obs.append(self.obs.copy()) mb_actions.append(actions) mb_values.append(values) @@ -115,7 +119,7 @@ class Runner(AbstractEnvRunner): mb_values = np.asarray(mb_values, dtype=np.float32) mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32) mb_dones = np.asarray(mb_dones, dtype=np.bool) - last_values = self.model.value(self.obs, self.states, self.dones) + last_values = self.model.value(self.obs, S=self.states, M=self.dones) #discount/bootstrap off value fn mb_returns = np.zeros_like(mb_rewards) mb_advs = np.zeros_like(mb_rewards) @@ -145,10 +149,65 @@ def constfn(val): return val return f -def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr, +def learn(*, network, env, total_timesteps, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, - save_interval=0, load_path=None): + save_interval=0, load_path=None, **network_kwargs): + ''' + Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347) + + Parameters: + ---------- + + network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) + specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns + tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward + neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. + See baselines.common/policies.py/lstm for more details on using recurrent nets in policies + + env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation. + The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. + + + nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where + nenv is number of environment copies simulated in parallel) + + total_timesteps: int number of timesteps (i.e. 
number of actions taken in the environment) + + ent_coef: float policy entropy coefficient in the optimization objective + + lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the + training and 0 is the end of the training. + + vf_coef: float value function loss coefficient in the optimization objective + + max_grad_norm: float or None gradient norm clipping coefficient + + gamma: float discounting factor + + lam: float advantage estimation discounting factor (lambda in the paper) + + log_interval: int number of timesteps between logging events + + nminibatches: int number of training minibatches per update + + noptepochs: int number of training epochs per update + + cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training + and 0 is the end of the training + + save_interval: int number of timesteps between saving events + + load_path: str path to load the model from + + **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network + For instance, 'mlp' network architecture has arguments num_hidden and num_layers. + + + + ''' + + set_global_seeds(seed) if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) @@ -156,6 +215,8 @@ def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr, else: assert callable(cliprange) total_timesteps = int(total_timesteps) + policy = build_policy(env, network, **network_kwargs) + nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space @@ -180,7 +241,6 @@ def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr, nupdates = total_timesteps//nbatch for update in range(1, nupdates+1): assert nbatch % nminibatches == 0 - nbatch_train = nbatch // nminibatches tstart = time.time() frac = 1.0 - (update - 1.0) / nupdates lrnow = lr(frac) @@ -228,8 +288,9 @@ def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr, logger.logkv('time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv(lossname, lossval) - logger.dumpkvs() - if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir(): + if MPI.COMM_WORLD.Get_rank() == 0: + logger.dumpkvs() + if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and MPI.COMM_WORLD.Get_rank() == 0: checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i'%update) @@ -240,3 +301,6 @@ def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr, def safemean(xs): return np.nan if len(xs) == 0 else np.mean(xs) + + + diff --git a/baselines/ppo2/run_atari.py b/baselines/ppo2/run_atari.py deleted file mode 100644 index 322837a..0000000 --- a/baselines/ppo2/run_atari.py +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env python3 -import sys -from baselines import logger -from baselines.common.cmd_util import make_atari_env, atari_arg_parser -from baselines.common.vec_env.vec_frame_stack import VecFrameStack -from baselines.ppo2 import ppo2 -from baselines.ppo2.policies import CnnPolicy, LstmPolicy, LnLstmPolicy, MlpPolicy -import multiprocessing -import tensorflow as tf - - -def train(env_id, num_timesteps, seed, policy): - - ncpu = multiprocessing.cpu_count() - if sys.platform == 'darwin': ncpu //= 2 - config = tf.ConfigProto(allow_soft_placement=True, - intra_op_parallelism_threads=ncpu, - 
inter_op_parallelism_threads=ncpu) - config.gpu_options.allow_growth = True #pylint: disable=E1101 - tf.Session(config=config).__enter__() - - env = VecFrameStack(make_atari_env(env_id, 8, seed), 4) - policy = {'cnn' : CnnPolicy, 'lstm' : LstmPolicy, 'lnlstm' : LnLstmPolicy, 'mlp': MlpPolicy}[policy] - ppo2.learn(policy=policy, env=env, nsteps=128, nminibatches=4, - lam=0.95, gamma=0.99, noptepochs=4, log_interval=1, - ent_coef=.01, - lr=lambda f : f * 2.5e-4, - cliprange=lambda f : f * 0.1, - total_timesteps=int(num_timesteps * 1.1)) - -def main(): - parser = atari_arg_parser() - parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm', 'mlp'], default='cnn') - args = parser.parse_args() - logger.configure() - train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, - policy=args.policy) - -if __name__ == '__main__': - main() diff --git a/baselines/ppo2/run_mujoco.py b/baselines/ppo2/run_mujoco.py deleted file mode 100644 index 282aa3f..0000000 --- a/baselines/ppo2/run_mujoco.py +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env python3 -import numpy as np -from baselines.common.cmd_util import mujoco_arg_parser -from baselines import bench, logger - - -def train(env_id, num_timesteps, seed): - from baselines.common import set_global_seeds - from baselines.common.vec_env.vec_normalize import VecNormalize - from baselines.ppo2 import ppo2 - from baselines.ppo2.policies import MlpPolicy - import gym - import tensorflow as tf - from baselines.common.vec_env.dummy_vec_env import DummyVecEnv - ncpu = 1 - config = tf.ConfigProto(allow_soft_placement=True, - intra_op_parallelism_threads=ncpu, - inter_op_parallelism_threads=ncpu) - tf.Session(config=config).__enter__() - - def make_env(): - env = gym.make(env_id) - env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True) - return env - - env = DummyVecEnv([make_env]) - env = VecNormalize(env) - - set_global_seeds(seed) - policy = MlpPolicy - model = ppo2.learn(policy=policy, env=env, nsteps=2048, nminibatches=32, - lam=0.95, gamma=0.99, noptepochs=10, log_interval=1, - ent_coef=0.0, - lr=3e-4, - cliprange=0.2, - total_timesteps=num_timesteps) - - return model, env - - -def main(): - args = mujoco_arg_parser().parse_args() - logger.configure() - model, env = train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) - - if args.play: - logger.log("Running trained model") - obs = np.zeros((env.num_envs,) + env.observation_space.shape) - obs[:] = env.reset() - while True: - actions = model.step(obs)[0] - obs[:] = env.step(actions)[0] - env.render() - - -if __name__ == '__main__': - main() diff --git a/baselines/run.py b/baselines/run.py new file mode 100644 index 0000000..cba8515 --- /dev/null +++ b/baselines/run.py @@ -0,0 +1,230 @@ +import sys +import multiprocessing +import os +import os.path as osp +import gym +from collections import defaultdict +import tensorflow as tf + +from baselines.common.vec_env.vec_frame_stack import VecFrameStack +from baselines.common.cmd_util import common_arg_parser, parse_unknown_args, make_mujoco_env, make_atari_env +from baselines.common.tf_util import save_state, load_state, get_session +from baselines import bench, logger +from importlib import import_module + +from baselines.common.vec_env.vec_normalize import VecNormalize +from baselines.common.vec_env.dummy_vec_env import DummyVecEnv +from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv +from baselines.common import atari_wrappers, retro_wrappers + +try: + from mpi4py import MPI +except 
ImportError: + MPI = None + +_game_envs = defaultdict(set) +for env in gym.envs.registry.all(): + # solve this with regexes + env_type = env._entry_point.split(':')[0].split('.')[-1] + _game_envs[env_type].add(env.id) + +# reading benchmark names directly from retro requires +# importing retro here, and for some reason that crashes tensorflow +# in ubuntu +_game_envs['retro'] = set([ + 'BubbleBobble-Nes', + 'SuperMarioBros-Nes', + 'TwinBee3PokoPokoDaimaou-Nes', + 'SpaceHarrier-Nes', + 'SonicTheHedgehog-Genesis', + 'Vectorman-Genesis', + 'FinalFight-Snes', + 'SpaceInvaders-Snes', +]) + + +def train(args, extra_args): + env_type, env_id = get_env_type(args.env) + + total_timesteps = int(args.num_timesteps) + seed = args.seed + + learn = get_learn_function(args.alg) + alg_kwargs = get_learn_function_defaults(args.alg, env_type) + alg_kwargs.update(extra_args) + + env = build_env(args) + + if args.network: + alg_kwargs['network'] = args.network + else: + if alg_kwargs.get('network') is None: + alg_kwargs['network'] = get_default_network(env_type) + + + + print('Training {} on {}:{} with arguments \n{}'.format(args.alg, env_type, env_id, alg_kwargs)) + + model = learn( + env=env, + seed=seed, + total_timesteps=total_timesteps, + **alg_kwargs + ) + + return model, env + + +def build_env(args, render=False): + ncpu = multiprocessing.cpu_count() + if sys.platform == 'darwin': ncpu //= 2 + nenv = args.num_env or ncpu if not render else 1 + alg = args.alg + rank = MPI.COMM_WORLD.Get_rank() if MPI else 0 + seed = args.seed + + env_type, env_id = get_env_type(args.env) + if env_type == 'mujoco': + get_session(tf.ConfigProto(allow_soft_placement=True, + intra_op_parallelism_threads=1, + inter_op_parallelism_threads=1)) + + if args.num_env: + env = SubprocVecEnv([lambda: make_mujoco_env(env_id, seed + i if seed is not None else None, args.reward_scale) for i in range(args.num_env)]) + else: + env = DummyVecEnv([lambda: make_mujoco_env(env_id, seed, args.reward_scale)]) + + env = VecNormalize(env) + + elif env_type == 'atari': + if alg == 'acer': + env = make_atari_env(env_id, nenv, seed) + elif alg == 'deepq': + env = atari_wrappers.make_atari(env_id) + env.seed(seed) + env = bench.Monitor(env, logger.get_dir()) + env = atari_wrappers.wrap_deepmind(env, frame_stack=True, scale=True) + elif alg == 'trpo_mpi': + env = atari_wrappers.make_atari(env_id) + env.seed(seed) + env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank))) + env = atari_wrappers.wrap_deepmind(env) + # TODO check if the second seeding is necessary, and eventually remove + env.seed(seed) + else: + frame_stack_size = 4 + env = VecFrameStack(make_atari_env(env_id, nenv, seed), frame_stack_size) + + elif env_type == 'retro': + import retro + gamestate = args.gamestate or 'Level1-1' + env = retro_wrappers.make_retro(game=args.env, state=gamestate, max_episode_steps=10000, use_restricted_actions=retro.Actions.DISCRETE) + env.seed(args.seed) + env = bench.Monitor(env, logger.get_dir()) + env = retro_wrappers.wrap_deepmind_retro(env) + + elif env_type == 'classic': + def make_env(): + e = gym.make(env_id) + e.seed(seed) + return e + + env = DummyVecEnv([make_env]) + + return env + + +def get_env_type(env_id): + if env_id in _game_envs.keys(): + env_type = env_id + env_id = [g for g in _game_envs[env_type]][0] + else: + env_type = None + for g, e in _game_envs.items(): + if env_id in e: + env_type = g + break + assert env_type is not None, 'env_id {} is not recognized in env types'.format(env_id, _game_envs.keys()) + 
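# An illustrative programmatic use (not part of the patch) of the helpers that
# train() above relies on; the Pong env id and algorithm choice are examples only.
from baselines.run import get_env_type, get_learn_function, get_learn_function_defaults

env_type, env_id = get_env_type('PongNoFrameskip-v4')       # -> ('atari', 'PongNoFrameskip-v4')
alg_kwargs = get_learn_function_defaults('ppo2', env_type)  # per-env-type defaults, if defined
ppo2_learn = get_learn_function('ppo2')                     # resolves to baselines.ppo2.ppo2.learn
# build_env(args) would normally construct the vectorized (and, for atari, frame-stacked)
# env from the parsed CLI args before calling ppo2_learn(env=env, seed=0, **alg_kwargs).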
+ return env_type, env_id + +def get_default_network(env_type): + if env_type == 'mujoco' or env_type=='classic': + return 'mlp' + if env_type == 'atari': + return 'cnn' + + raise ValueError('Unknown env_type {}'.format(env_type)) + +def get_alg_module(alg, submodule=None): + submodule = submodule or alg + try: + # first try to import the alg module from baselines + alg_module = import_module('.'.join(['baselines', alg, submodule])) + except ImportError: + # then from rl_algs + alg_module = import_module('.'.join(['rl_' + 'algs', alg, submodule])) + + return alg_module + + +def get_learn_function(alg): + return get_alg_module(alg).learn + +def get_learn_function_defaults(alg, env_type): + try: + alg_defaults = get_alg_module(alg, 'defaults') + kwargs = getattr(alg_defaults, env_type)() + except (ImportError, AttributeError): + kwargs = {} + return kwargs + +def parse(v): + ''' + convert value of a command-line arg to a python object if possible, othewise, keep as string + ''' + + assert isinstance(v, str) + try: + return eval(v) + except (NameError, SyntaxError): + return v + + +def main(): + # configure logger, disable logging in child MPI processes (with rank > 0) + + arg_parser = common_arg_parser() + args, unknown_args = arg_parser.parse_known_args() + extra_args = {k: parse(v) for k,v in parse_unknown_args(unknown_args).items()} + + + if MPI is None or MPI.COMM_WORLD.Get_rank() == 0: + rank = 0 + logger.configure() + else: + logger.configure(format_strs = []) + rank = MPI.COMM_WORLD.Get_rank() + + model, _ = train(args, extra_args) + + if args.save_path is not None and rank == 0: + save_path = osp.expanduser(args.save_path) + model.save(save_path) + + + if args.play: + logger.log("Running trained model") + env = build_env(args, render=True) + obs = env.reset() + while True: + actions = model.step(obs)[0] + obs, _, done, _ = env.step(actions) + env.render() + if done: + obs = env.reset() + + + +if __name__ == '__main__': + main() diff --git a/baselines/trpo_mpi/defaults.py b/baselines/trpo_mpi/defaults.py new file mode 100644 index 0000000..96b6cb3 --- /dev/null +++ b/baselines/trpo_mpi/defaults.py @@ -0,0 +1,30 @@ +from rl_common.models import mlp, cnn_small + + +def atari(): + return dict( + network = cnn_small(), + timesteps_per_batch=512, + max_kl=0.001, + cg_iters=10, + cg_damping=1e-3, + gamma=0.98, + lam=1.0, + vf_iters=3, + vf_stepsize=1e-4, + entcoeff=0.00, + ) + +def mujoco(): + return dict( + network = mlp(num_hidden=32, num_layers=2), + timesteps_per_batch=1024, + max_kl=0.01, + cg_iters=10, + cg_damping=0.1, + gamma=0.99, + lam=0.98, + vf_iters=5, + vf_stepsize=1e-3, + normalize_observations=True, + ) diff --git a/baselines/trpo_mpi/nosharing_cnn_policy.py b/baselines/trpo_mpi/nosharing_cnn_policy.py deleted file mode 100644 index 97b2dcd..0000000 --- a/baselines/trpo_mpi/nosharing_cnn_policy.py +++ /dev/null @@ -1,56 +0,0 @@ -import baselines.common.tf_util as U -import tensorflow as tf -import gym -from baselines.common.distributions import make_pdtype - -class CnnPolicy(object): - recurrent = False - def __init__(self, name, ob_space, ac_space): - with tf.variable_scope(name): - self._init(ob_space, ac_space) - self.scope = tf.get_variable_scope().name - - def _init(self, ob_space, ac_space): - assert isinstance(ob_space, gym.spaces.Box) - - self.pdtype = pdtype = make_pdtype(ac_space) - sequence_length = None - - ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) - - obscaled = ob / 255.0 - - with 
tf.variable_scope("pol"): - x = obscaled - x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID")) - x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID")) - x = U.flattenallbut0(x) - x = tf.nn.relu(tf.layers.dense(x, 128, name='lin', kernel_initializer=U.normc_initializer(1.0))) - logits = tf.layers.dense(x, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01)) - self.pd = pdtype.pdfromflat(logits) - with tf.variable_scope("vf"): - x = obscaled - x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID")) - x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID")) - x = U.flattenallbut0(x) - x = tf.nn.relu(tf.layers.dense(x, 128, name='lin', kernel_initializer=U.normc_initializer(1.0))) - self.vpred = tf.layers.dense(x, 1, name='value', kernel_initializer=U.normc_initializer(1.0)) - self.vpredz = self.vpred - - self.state_in = [] - self.state_out = [] - - stochastic = tf.placeholder(dtype=tf.bool, shape=()) - ac = self.pd.sample() - self._act = U.function([stochastic, ob], [ac, self.vpred]) - - def act(self, stochastic, ob): - ac1, vpred1 = self._act(stochastic, ob[None]) - return ac1[0], vpred1[0] - def get_variables(self): - return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) - def get_trainable_variables(self): - return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) - def get_initial_state(self): - return [] - diff --git a/baselines/trpo_mpi/run_atari.py b/baselines/trpo_mpi/run_atari.py deleted file mode 100644 index f31ebfd..0000000 --- a/baselines/trpo_mpi/run_atari.py +++ /dev/null @@ -1,43 +0,0 @@ - #!/usr/bin/env python3 -from mpi4py import MPI -from baselines.common import set_global_seeds -import os.path as osp -import gym, logging -from baselines import logger -from baselines import bench -from baselines.common.atari_wrappers import make_atari, wrap_deepmind -from baselines.common.cmd_util import atari_arg_parser - -def train(env_id, num_timesteps, seed): - from baselines.trpo_mpi.nosharing_cnn_policy import CnnPolicy - from baselines.trpo_mpi import trpo_mpi - import baselines.common.tf_util as U - rank = MPI.COMM_WORLD.Get_rank() - sess = U.single_threaded_session() - sess.__enter__() - if rank == 0: - logger.configure() - else: - logger.configure(format_strs=[]) - - workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() - set_global_seeds(workerseed) - env = make_atari(env_id) - def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613 - return CnnPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space) - env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank))) - env.seed(workerseed) - - env = wrap_deepmind(env) - env.seed(workerseed) - - trpo_mpi.learn(env, policy_fn, timesteps_per_batch=512, max_kl=0.001, cg_iters=10, cg_damping=1e-3, - max_timesteps=int(num_timesteps * 1.1), gamma=0.98, lam=1.0, vf_iters=3, vf_stepsize=1e-4, entcoeff=0.00) - env.close() - -def main(): - args = atari_arg_parser().parse_args() - train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) - -if __name__ == "__main__": - main() diff --git a/baselines/trpo_mpi/run_mujoco.py b/baselines/trpo_mpi/run_mujoco.py deleted file mode 100644 index 220bb91..0000000 --- a/baselines/trpo_mpi/run_mujoco.py +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env python3 -# noinspection PyUnresolvedReferences -from mpi4py import MPI -from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser -from baselines import logger -from 
baselines.ppo1.mlp_policy import MlpPolicy -from baselines.trpo_mpi import trpo_mpi - -def train(env_id, num_timesteps, seed): - import baselines.common.tf_util as U - sess = U.single_threaded_session() - sess.__enter__() - - rank = MPI.COMM_WORLD.Get_rank() - if rank == 0: - logger.configure() - else: - logger.configure(format_strs=[]) - logger.set_level(logger.DISABLED) - workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() - def policy_fn(name, ob_space, ac_space): - return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, - hid_size=32, num_hid_layers=2) - env = make_mujoco_env(env_id, workerseed) - trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1, - max_timesteps=num_timesteps, gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3) - env.close() - -def main(): - args = mujoco_arg_parser().parse_args() - train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) - - -if __name__ == '__main__': - main() - diff --git a/baselines/trpo_mpi/trpo_mpi.py b/baselines/trpo_mpi/trpo_mpi.py index e23d9ac..d84b0fc 100644 --- a/baselines/trpo_mpi/trpo_mpi.py +++ b/baselines/trpo_mpi/trpo_mpi.py @@ -6,8 +6,11 @@ import time from baselines.common import colorize from mpi4py import MPI from collections import deque +from baselines.common import set_global_seeds from baselines.common.mpi_adam import MpiAdam from baselines.common.cg import cg +from baselines.common.input import observation_placeholder +from baselines.common.policies import build_policy from contextlib import contextmanager def traj_segment_generator(pi, env, horizon, stochastic): @@ -33,7 +36,7 @@ def traj_segment_generator(pi, env, horizon, stochastic): while True: prevac = ac - ac, vpred = pi.act(stochastic, ob) + ac, vpred, _, _ = pi.step(ob, stochastic=stochastic) # Slight weirdness here because we need value function at time T # before returning segment [0, T-1] so we get the correct # terminal value @@ -41,7 +44,7 @@ def traj_segment_generator(pi, env, horizon, stochastic): yield {"ob" : obs, "rew" : rews, "vpred" : vpreds, "new" : news, "ac" : acs, "prevac" : prevacs, "nextvpred": vpred * (1 - new), "ep_rets" : ep_rets, "ep_lens" : ep_lens} - _, vpred = pi.act(stochastic, ob) + _, vpred, _, _ = pi.step(ob, stochastic=stochastic) # Be careful!!! if you change the downstream algorithm to aggregate # several of these batches, then be sure to do a deepcopy ep_rets = [] @@ -79,30 +82,100 @@ def add_vtarg_and_adv(seg, gamma, lam): gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam seg["tdlamret"] = seg["adv"] + seg["vpred"] -def learn(env, policy_fn, *, - timesteps_per_batch, # what to train on - max_kl, cg_iters, - gamma, lam, # advantage estimation +def learn(*, + network, + env, + total_timesteps, + timesteps_per_batch=1024, # what to train on + max_kl=0.001, + cg_iters=10, + gamma=0.99, + lam=1.0, # advantage estimation + seed=None, entcoeff=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters =3, - max_timesteps=0, max_episodes=0, max_iters=0, # time constraint - callback=None + max_episodes=0, max_iters=0, # time constraint + callback=None, + load_path=None, + **network_kwargs ): + ''' + learn a policy function with TRPO algorithm + + Parameters: + ---------- + + network neural network to learn. 
+                              Can be either a string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types)
+                              or a function that takes an input placeholder and returns a tuple (output, None) for feedforward nets,
+                              or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets
+
+    env                       environment (one of the gym environments, or wrapped via a baselines.common.vec_env.VecEnv-type class)
+
+    timesteps_per_batch       timesteps per gradient estimation batch
+
+    max_kl                    max KL divergence between old policy and new policy (KL(pi_old || pi))
+
+    entcoeff                  coefficient of policy entropy term in the optimization objective
+
+    cg_iters                  number of iterations of conjugate gradient algorithm
+
+    cg_damping                conjugate gradient damping
+
+    vf_stepsize               learning rate for adam optimizer used to optimize value function loss
+
+    vf_iters                  number of value function optimization iterations per policy optimization step
+
+    total_timesteps           max number of timesteps
+
+    max_episodes              max number of episodes
+
+    max_iters                 maximum number of policy optimization iterations
+
+    callback                  function to be called with (locals(), globals()) at each policy optimization step
+
+    load_path                 str, path to load the model from (default: None, i.e. no model is loaded)
+
+    **network_kwargs          keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy
+                              and the arguments to a particular type of network
+
+    Returns:
+    -------
+
+    learnt model
+
+    '''
+
+
     nworkers = MPI.COMM_WORLD.Get_size()
     rank = MPI.COMM_WORLD.Get_rank()
+
+    cpus_per_worker = 1
+    U.get_session(config=tf.ConfigProto(
+            allow_soft_placement=True,
+            inter_op_parallelism_threads=cpus_per_worker,
+            intra_op_parallelism_threads=cpus_per_worker
+    ))
+
+
+    policy = build_policy(env, network, value_network='copy', **network_kwargs)
+    set_global_seeds(seed)
+
     np.set_printoptions(precision=3)
     # Setup losses and stuff
     # ----------------------------------------
     ob_space = env.observation_space
     ac_space = env.action_space
-    pi = policy_fn("pi", ob_space, ac_space)
-    oldpi = policy_fn("oldpi", ob_space, ac_space)
+
+    ob = observation_placeholder(ob_space)
+    with tf.variable_scope("pi"):
+        pi = policy(observ_placeholder=ob)
+    with tf.variable_scope("oldpi"):
+        oldpi = policy(observ_placeholder=ob)
+
     atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable)
     ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return
 
-    ob = U.get_placeholder_cached(name="ob")
     ac = pi.pdtype.sample_placeholder([None])
 
     kloldnew = oldpi.pd.kl(pi.pd)
@@ -111,7 +184,7 @@ def learn(env, policy_fn, *,
     meanent = tf.reduce_mean(ent)
     entbonus = entcoeff * meanent
 
-    vferr = tf.reduce_mean(tf.square(pi.vpred - ret))
+    vferr = tf.reduce_mean(tf.square(pi.vf - ret))
 
     ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold
     surrgain = tf.reduce_mean(ratio * atarg)
@@ -122,9 +195,12 @@ def learn(env, policy_fn, *,
 
     dist = meankl
 
-    all_var_list = pi.get_trainable_variables()
-    var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
-    vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
+    all_var_list = get_trainable_variables("pi")
+    # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
+    # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
+    var_list = get_pi_trainable_variables("pi")
+    vf_var_list = get_vf_trainable_variables("pi")
+
     vfadam = MpiAdam(vf_var_list)
 
     get_flat = U.GetFlat(var_list)
@@ -142,7 +218,8 @@ def learn(env, policy_fn, *,
     fvp = U.flatgrad(gvp, var_list)
 
     assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv)
-        for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
+        for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi"))])
+
     compute_losses = U.function([ob, ac, atarg], losses)
     compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)])
     compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
@@ -166,6 +243,9 @@ def learn(env, policy_fn, *,
         return out
 
     U.initialize()
+    if load_path is not None:
+        pi.load(load_path)
+
     th_init = get_flat()
     MPI.COMM_WORLD.Bcast(th_init, root=0)
     set_from_flat(th_init)
@@ -183,11 +263,16 @@ def learn(env, policy_fn, *,
     lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths
     rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards
 
-    assert sum([max_iters>0, max_timesteps>0, max_episodes>0])==1
+    if sum([max_iters>0, total_timesteps>0, max_episodes>0])==0:
+        # nothing to be done
+        return pi
+
+    assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \
+        'out of max_iters, total_timesteps, and max_episodes only one should be specified'
 
     while True:
         if callback: callback(locals(), globals())
-        if max_timesteps and timesteps_so_far >= max_timesteps:
+        if total_timesteps and timesteps_so_far >= total_timesteps:
             break
         elif max_episodes and episodes_so_far >= max_episodes:
             break
@@ -287,5 +372,20 @@ def learn(env, policy_fn, *,
         if rank==0:
             logger.dump_tabular()
 
+    return pi
+
 def flatten_lists(listoflists):
-    return [el for list_ in listoflists for el in list_]
\ No newline at end of file
+    return [el for list_ in listoflists for el in list_]
+
+def get_variables(scope):
+    return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope)
+
+def get_trainable_variables(scope):
+    return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
+
+def get_vf_trainable_variables(scope):
+    return [v for v in get_trainable_variables(scope) if 'vf' in v.name[len(scope):].split('/')]
+
+def get_pi_trainable_variables(scope):
+    return [v for v in get_trainable_variables(scope) if 'pi' in v.name[len(scope):].split('/')]
+
diff --git a/conftest.py b/conftest.py
new file mode 100644
index 0000000..3493c45
--- /dev/null
+++ b/conftest.py
@@ -0,0 +1,19 @@
+import pytest
+
+
+def pytest_addoption(parser):
+    parser.addoption('--runslow', action='store_true', default=False, help='run slow tests')
+
+
+def pytest_collection_modifyitems(config, items):
+    if config.getoption('--runslow'):
+        # --runslow given in cli: do not skip slow tests
+        return
+    skip_slow = pytest.mark.skip(reason='need --runslow option to run')
+    slow_tests = []
+    for item in items:
+        if 'slow' in item.keywords:
+            slow_tests.append(item.name)
+            item.add_marker(skip_slow)
+
+    print('skipping slow tests', ' '.join(slow_tests), 'use --runslow to run them')
diff --git a/setup.py b/setup.py
index bf8badc..35673ce 100644
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,6 @@ setup(name='baselines',
          'scipy',
          'tqdm',
          'joblib',
-          'zmq',
          'dill',
          'progressbar2',
          'mpi4py',
@@ -23,6 +22,12 @@ setup(name='baselines',
          'click',
          'opencv-python'
      ],
+      extras_require={
+          'test': [
+              'filelock',
+              'pytest'
+          ]
+      },
      description='OpenAI baselines: high quality implementations of reinforcement learning algorithms',
      author='OpenAI',
      url='https://github.com/openai/baselines',
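
For reference, a minimal sketch of calling the refactored, keyword-only trpo_mpi.learn described in the docstring above. The environment id, hyperparameter values, and the DummyVecEnv wrapper are illustrative assumptions rather than part of this patch; in normal use the environment construction is handled by the new launcher script instead of by hand.

    # Illustrative sketch only -- assumes the post-refactor keyword-only learn() signature shown above.
    import gym
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
    from baselines.trpo_mpi import trpo_mpi

    # The docstring accepts either a plain gym env or a VecEnv-type wrapper;
    # here a single env is wrapped in a DummyVecEnv.
    env = DummyVecEnv([lambda: gym.make('Hopper-v2')])

    # network='mlp' selects the built-in MLP policy builder; any extra keyword
    # arguments would be forwarded to it via **network_kwargs.
    pi = trpo_mpi.learn(
        network='mlp',
        env=env,
        total_timesteps=int(1e6),
        timesteps_per_batch=1024,
        max_kl=0.01,
        gamma=0.99,
        lam=0.98,
        seed=0,
    )
    env.close()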
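The new conftest.py gates slow tests behind --runslow: a test opts in by carrying the pytest 'slow' marker, which is what the 'slow' in item.keywords check looks for. A minimal sketch of such a test (the test name is hypothetical):

    import pytest

    @pytest.mark.slow
    def test_full_training_run():
        # Collected as usual, but skipped unless pytest is invoked with --runslow,
        # per the marker check in conftest.py above.
        assert True

The test-only dependencies declared under extras_require can be installed with pip install -e .[test].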