Compare commits

...

49 Commits

Author SHA1 Message Date
Peter Zhokhov
ea68f3b7e6 dummy commit to RUN BENCHMARKS 2018-08-10 09:46:43 -07:00
Peter Zhokhov
ca721a4be6 Merge branch 'observation-dtype' of github.com:openai/baselines into peterz_benchmarks 2018-08-10 09:45:50 -07:00
Peter Zhokhov
72f3572a10 fixed syntax in conv_only RUN BENCHMARKS 2018-08-08 16:24:39 -07:00
Peter Zhokhov
b9cd941471 dummy commit to RUN BENCHMARKS 2018-08-08 15:59:59 -07:00
Peter Zhokhov
0899b71ede scale the images in conv_only RUN BENCHMARKS 2018-08-08 15:15:03 -07:00
Peter Zhokhov
cc8c9541fb dummy commit to RUN BENCHMARKS 2018-08-08 15:10:39 -07:00
Peter Zhokhov
cb32522394 enable all benchmarks 2018-08-08 15:10:00 -07:00
Peter Zhokhov
1e40ec22be dummy commit to RUN BENCHMARKS 2018-08-08 10:45:18 -07:00
Peter Zhokhov
701a36cdfa added a note in README about TfRunningMeanStd and serialization of VecNormalize 2018-08-08 10:44:58 -07:00
Peter Zhokhov
5a7f9847d8 flake8 complaints 2018-08-03 13:59:58 -07:00
Peter Zhokhov
b63134e5c5 added acer runner (missing import) 2018-08-03 13:31:37 -07:00
Peter Zhokhov
db314cdeda Merge branch 'peterz_profile_vec_normalize' into peterz_migrate_rlalgs 2018-08-03 11:47:36 -07:00
Peter Zhokhov
b08c083d91 use VecNormalize with regular RunningMeanStd 2018-08-03 11:44:12 -07:00
Peter Zhokhov
bfbbe66d9e profiling wip 2018-08-02 11:23:12 -07:00
Peter Zhokhov
1c5c6563b7 reverted VecNormalize to use RunningMeanStd (no tf) 2018-08-02 10:55:09 -07:00
Peter Zhokhov
1fa8c58da5 reverted VecNormalize to use RunningMeanStd (no tf) 2018-08-02 10:54:07 -07:00
Peter Zhokhov
f6d1115ead reverted running_mean_std to user property decorators for mean, var, count 2018-08-02 10:32:22 -07:00
Peter Zhokhov
f6d5a47bed use ncpu=1 for mujoco sessions - gives a bit of a performance speedup 2018-08-02 10:24:21 -07:00
Peter Zhokhov
c2df27bee4 non-tf normalization benchmark RUN BENCHMARKS 2018-08-02 09:41:41 -07:00
Peter Zhokhov
974c15756e changed default ppo2 lr schedule to linear RUN BENCHMARKS 2018-08-01 16:24:44 -07:00
Peter Zhokhov
ad43fd9a35 add defaults 2018-08-01 16:15:59 -07:00
Peter Zhokhov
72c357c638 hardcode names of retro environments 2018-08-01 15:18:59 -07:00
Peter Zhokhov
e00e5ca016 run ppo_mpi benchmarks only RUN BENCHMARKS 2018-08-01 14:56:08 -07:00
Peter Zhokhov
705797f2f0 Merge branch 'peterz_migrate_rlalgs' into peterz_benchmarks 2018-08-01 14:46:40 -07:00
Peter Zhokhov
fcd84aa831 make_atari_env compatible with mpi 2018-08-01 14:46:18 -07:00
Peter Zhokhov
390b51597a benchmarks on ppo2 only RUN BENCHMARKS 2018-08-01 11:01:50 -07:00
Peter Zhokhov
95104a3592 Merge branch 'peterz_migrate_rlalgs' into peterz_benchmarks 2018-08-01 10:50:29 -07:00
Peter Zhokhov
3528f7b992 save all variables to make sure we save the vec_normalize normalization 2018-08-01 10:12:19 -07:00
Peter Zhokhov
151e48009e flake8 complaints 2018-07-31 16:25:12 -07:00
Peter Zhokhov
92f33335e9 dummy commit to RUN BENCHMARKS 2018-07-31 15:53:18 -07:00
Peter Zhokhov
af729cff15 dummy commit to RUN BENCHMARKS 2018-07-31 15:37:00 -07:00
Peter Zhokhov
10f815fe1d fixed import in vec_normalize 2018-07-31 15:19:43 -07:00
Peter Zhokhov
8c4adac898 running_mean_std uses tensorflow variables 2018-07-31 14:45:55 -07:00
Peter Zhokhov
2a93ea8782 serialize variables as a dict, not as a list 2018-07-31 11:13:31 -07:00
Peter Zhokhov
9c48f9fad5 very dummy commit to RUN BENCHMARKS 2018-07-31 10:23:43 -07:00
Peter Zhokhov
348cbb4b71 dummy commit to RUN BENCHMARKS 2018-07-31 09:42:23 -07:00
Peter Zhokhov
a1602ab15f dummy commit to RUN BENCHMARKS 2018-07-30 17:51:16 -07:00
Peter Zhokhov
e63e69bb14 dummy commit to RUN BENCHMARKS 2018-07-30 17:39:22 -07:00
Peter Zhokhov
385e7e5c0d dummy commit to RUN BENCHMARKS 2018-07-30 17:21:05 -07:00
Peter Zhokhov
d112a2e49f added approximate humanoid reward with ppo2 into the README for reference 2018-07-30 16:58:31 -07:00
Peter Zhokhov
e662dd6409 run.py can run algos from both baselines and rl_algs 2018-07-30 16:09:48 -07:00
Peter Zhokhov
efc6bffce3 replaced atari_arg_parser with common_arg_parser 2018-07-30 15:58:56 -07:00
Peter Zhokhov
872181d4c3 re-exported rl_algs - fixed problems with serialization test and test_cartpole 2018-07-30 15:49:48 -07:00
Peter Zhokhov
628ddecf6a re-exported rl_algs 2018-07-30 12:15:46 -07:00
peter
83a4a4be65 run slow tests 2018-07-26 14:39:25 -07:00
peter
7edac38c73 more stuff from rl-algs 2018-07-26 14:26:57 -07:00
peter
a6dca44115 exported rl-algs 2018-07-26 14:02:04 -07:00
Karl Cobbe
622915c473 fix dtype for wrapper observation spaces 2018-06-12 14:48:39 -07:00
Karl Cobbe
a1d3c18ec0 fix dtype for wrapper observation spaces 2018-06-11 13:35:47 -07:00
71 changed files with 2942 additions and 1070 deletions

.benchmark_pattern

@@ -0,0 +1 @@

.gitignore

@@ -34,5 +34,3 @@ src
.cache .cache
MUJOCO_LOG.TXT MUJOCO_LOG.TXT


@@ -10,5 +10,5 @@ install:
- docker build . -t baselines-test - docker build . -t baselines-test
script: script:
- flake8 --select=F baselines/common - flake8 --select=F,E999 baselines/common baselines/trpo_mpi baselines/ppo2 baselines/a2c baselines/deepq baselines/acer
- docker run baselines-test pytest - docker run baselines-test pytest --runslow


@@ -1,20 +1,24 @@
FROM ubuntu:16.04 FROM ubuntu:16.04
RUN apt-get -y update && apt-get -y install git wget python-dev python3-dev libopenmpi-dev python-pip zlib1g-dev cmake RUN apt-get -y update && apt-get -y install git wget python-dev python3-dev libopenmpi-dev python-pip zlib1g-dev cmake python-opencv
ENV CODE_DIR /root/code ENV CODE_DIR /root/code
ENV VENV /root/venv ENV VENV /root/venv
COPY . $CODE_DIR/baselines
RUN \ RUN \
pip install virtualenv && \ pip install virtualenv && \
virtualenv $VENV --python=python3 && \ virtualenv $VENV --python=python3 && \
. $VENV/bin/activate && \ . $VENV/bin/activate && \
cd $CODE_DIR && \ pip install --upgrade pip
pip install --upgrade pip && \
pip install -e baselines && \
pip install pytest
ENV PATH=$VENV/bin:$PATH ENV PATH=$VENV/bin:$PATH
COPY . $CODE_DIR/baselines
WORKDIR $CODE_DIR/baselines WORKDIR $CODE_DIR/baselines
# Clean up pycache and pyc files
RUN rm -rf __pycache__ && \
find . -name "*.pyc" -delete && \
pip install -e .[test]
CMD /bin/bash CMD /bin/bash


@@ -62,6 +62,60 @@ pip install pytest
pytest pytest
``` ```
## Subpackages
## Testing the installation
All unit tests in baselines can be run using pytest runner:
```
pip install pytest
pytest
```
## Training models
Most of the algorithms in the baselines repo are used as follows:
```bash
python -m baselines.run --alg=<name of the algorithm> --env=<environment_id> [additional arguments]
```
### Example 1. PPO with MuJoCo Humanoid
For instance, to train a fully-connected network controlling the MuJoCo humanoid with a2c for 20M timesteps:
```bash
python -m baselines.run --alg=a2c --env=Humanoid-v2 --network=mlp --num_timesteps=2e7
```
Note that for MuJoCo environments the fully-connected network is the default, so we can omit `--network=mlp`.
The hyperparameters of both the network and the learning algorithm can be controlled via the command line, for instance:
```bash
python -m baselines.run --alg=a2c --env=Humanoid-v2 --network=mlp --num_timesteps=2e7 --ent_coef=0.1 --num_hidden=32 --num_layers=3 --value_network=copy
```
will set the entropy coefficient to 0.1, construct a fully-connected network with 3 layers of 32 hidden units each, and create a separate network for value function estimation (so that its parameters are not shared with the policy network, but the structure is the same).
See the docstrings in [common/models.py](common/models.py) for a description of the network parameters for each type of model, and
the docstring of [baselines/ppo2/ppo2.py/learn()](ppo2/ppo2.py) for a description of the ppo2 hyperparameters.
### Example 2. DQN on Atari
DQN with Atari is at this point a classic benchmark. To run the baselines implementation of DQN on Atari Pong:
```
python -m baselines.run --alg=deepq --env=PongNoFrameskip-v4 --num_timesteps=1e6
```
## Saving, loading and visualizing models
The algorithms' serialization API is not properly unified yet; however, there is a simple method to save and restore trained models.
The `--load_path` and `--save_path` command-line options load the tensorflow state from a given path before training and save it after training, respectively.
Suppose you would like to train ppo2 on Atari Pong, save the model, and then later visualize what it has learned.
```bash
python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num-timesteps=2e7 --save_path=~/models/pong_20M_ppo2
```
This should get the mean reward per episode to about 5k. To load and visualize the model, we do the following: load the model, train it for 0 steps, and then visualize:
```bash
python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num-timesteps=0 --load_path=~/models/pong_20M_ppo2 --play
```
*NOTE:* At the moment MuJoCo training uses the VecNormalize wrapper for the environment, which is not saved correctly; as a result, loading models trained on MuJoCo will not work well if the environment is recreated. If necessary, you can work around that by replacing RunningMeanStd with TfRunningMeanStd in [baselines/common/vec_env/vec_normalize.py](baselines/common/vec_env/vec_normalize.py#L12). That way, the mean and std of the environment-normalizing wrapper are saved in tensorflow variables and included in the model file; however, training is slower that way, which is why it is not enabled by default.
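A minimal sketch of that workaround, assuming `TfRunningMeanStd` is importable from `baselines.common.running_mean_std` and accepts `shape`/`scope` keyword arguments (both are assumptions for illustration; verify against your checkout, this is not the shipped default):
```python
# Hypothetical VecNormalize variant whose statistics live in tf variables,
# so save_variables/load_variables pick them up together with the policy.
from baselines.common.vec_env import VecEnvWrapper
from baselines.common.running_mean_std import TfRunningMeanStd  # assumed location

class TfVecNormalize(VecEnvWrapper):
    def __init__(self, venv, ob=True, ret=True):
        VecEnvWrapper.__init__(self, venv)
        # TF-variable-backed running statistics get saved with the model,
        # at the cost of slower training (hence not the default).
        self.ob_rms = TfRunningMeanStd(shape=self.observation_space.shape, scope='ob_rms') if ob else None
        self.ret_rms = TfRunningMeanStd(shape=(), scope='ret_rms') if ret else None

    # Observation/return normalization would mirror vec_normalize.py; plain
    # pass-throughs are kept here only so the sketch is a complete wrapper.
    def step_wait(self):
        return self.venv.step_wait()

    def reset(self):
        return self.venv.reset()
```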
## Subpackages ## Subpackages
- [A2C](baselines/a2c) - [A2C](baselines/a2c)
@@ -85,3 +139,4 @@ To cite this repository in publications:
journal = {GitHub repository}, journal = {GitHub repository},
howpublished = {\url{https://github.com/openai/baselines}}, howpublished = {\url{https://github.com/openai/baselines}},
} }


@@ -1,42 +1,48 @@
import os.path as osp
import time import time
import joblib import functools
import numpy as np
import tensorflow as tf import tensorflow as tf
from baselines import logger from baselines import logger
from baselines.common import set_global_seeds, explained_variance from baselines.common import set_global_seeds, explained_variance
from baselines.common.runners import AbstractEnvRunner
from baselines.common import tf_util from baselines.common import tf_util
from baselines.common.policies import build_policy
from baselines.a2c.utils import discount_with_dones
from baselines.a2c.utils import Scheduler, make_path, find_trainable_variables from baselines.a2c.utils import Scheduler, find_trainable_variables
from baselines.a2c.utils import cat_entropy, mse from baselines.a2c.runner import Runner
from tensorflow import losses
class Model(object): class Model(object):
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, def __init__(self, policy, env, nsteps,
ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'): alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'):
sess = tf_util.make_session() sess = tf_util.get_session()
nenvs = env.num_envs
nbatch = nenvs*nsteps nbatch = nenvs*nsteps
A = tf.placeholder(tf.int32, [nbatch])
with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE):
step_model = policy(nenvs, 1, sess)
train_model = policy(nbatch, nsteps, sess)
A = tf.placeholder(train_model.action.dtype, train_model.action.shape)
ADV = tf.placeholder(tf.float32, [nbatch]) ADV = tf.placeholder(tf.float32, [nbatch])
R = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch])
LR = tf.placeholder(tf.float32, []) LR = tf.placeholder(tf.float32, [])
step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) neglogpac = train_model.pd.neglogp(A)
train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True) entropy = tf.reduce_mean(train_model.pd.entropy())
neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
pg_loss = tf.reduce_mean(ADV * neglogpac) pg_loss = tf.reduce_mean(ADV * neglogpac)
vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R)
entropy = tf.reduce_mean(cat_entropy(train_model.pi))
loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef
params = find_trainable_variables("model") params = find_trainable_variables("a2c_model")
grads = tf.gradients(loss, params) grads = tf.gradients(loss, params)
if max_grad_norm is not None: if max_grad_norm is not None:
grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
@@ -50,6 +56,7 @@ class Model(object):
advs = rewards - values advs = rewards - values
for step in range(len(obs)): for step in range(len(obs)):
cur_lr = lr.value() cur_lr = lr.value()
td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr} td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr}
if states is not None: if states is not None:
td_map[train_model.S] = states td_map[train_model.S] = states
@@ -60,17 +67,6 @@ class Model(object):
) )
return policy_loss, value_loss, policy_entropy return policy_loss, value_loss, policy_entropy
def save(save_path):
ps = sess.run(params)
make_path(osp.dirname(save_path))
joblib.dump(ps, save_path)
def load(load_path):
loaded_params = joblib.load(load_path)
restores = []
for p, loaded_p in zip(params, loaded_params):
restores.append(p.assign(loaded_p))
sess.run(restores)
self.train = train self.train = train
self.train_model = train_model self.train_model = train_model
@@ -78,66 +74,87 @@ class Model(object):
self.step = step_model.step self.step = step_model.step
self.value = step_model.value self.value = step_model.value
self.initial_state = step_model.initial_state self.initial_state = step_model.initial_state
self.save = save self.save = functools.partial(tf_util.save_variables, sess=sess)
self.load = load self.load = functools.partial(tf_util.load_variables, sess=sess)
tf.global_variables_initializer().run(session=sess) tf.global_variables_initializer().run(session=sess)
class Runner(AbstractEnvRunner):
def __init__(self, env, model, nsteps=5, gamma=0.99): def learn(
super().__init__(env=env, model=model, nsteps=nsteps) network,
self.gamma = gamma env,
seed=None,
nsteps=5,
total_timesteps=int(80e6),
vf_coef=0.5,
ent_coef=0.01,
max_grad_norm=0.5,
lr=7e-4,
lrschedule='linear',
epsilon=1e-5,
alpha=0.99,
gamma=0.99,
log_interval=100,
load_path=None,
**network_kwargs):
'''
Main entrypoint for the A2C algorithm. Trains a policy with the given network architecture on the given environment using the a2c algorithm.
Parameters:
-----------
network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
See baselines.common/policies.py/lstm for more details on using recurrent nets in policies
env: RL environment. Should implement interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py)
seed: seed to make the random number sequence in the algorithm reproducible. Defaults to None, which means the seed is taken from the system noise generator (not reproducible)
nsteps: int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
nenv is number of environment copies simulated in parallel)
total_timesteps: int, total number of timesteps to train on (default: 80M)
vf_coef: float, coefficient in front of value function loss in the total loss function (default: 0.5)
ent_coef: float, coefficient in front of the policy entropy in the total loss function (default: 0.01)
max_grad_norm: float, gradient is clipped to have global L2 norm no more than this value (default: 0.5)
lr: float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4)
lrschedule: schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and
returns fraction of the learning rate (specified as lr) as output
epsilon: float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5)
alpha: float, RMSProp decay parameter (default: 0.99)
gamma: float, reward discounting parameter (default: 0.99)
log_interval: int, specifies how frequently the logs are printed out (default: 100)
**network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
For instance, 'mlp' network architecture has arguments num_hidden and num_layers.
'''
def run(self):
mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
mb_states = self.states
for n in range(self.nsteps):
actions, values, states, _ = self.model.step(self.obs, self.states, self.dones)
mb_obs.append(np.copy(self.obs))
mb_actions.append(actions)
mb_values.append(values)
mb_dones.append(self.dones)
obs, rewards, dones, _ = self.env.step(actions)
self.states = states
self.dones = dones
for n, done in enumerate(dones):
if done:
self.obs[n] = self.obs[n]*0
self.obs = obs
mb_rewards.append(rewards)
mb_dones.append(self.dones)
#batch of steps to batch of rollouts
mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
mb_masks = mb_dones[:, :-1]
mb_dones = mb_dones[:, 1:]
last_values = self.model.value(self.obs, self.states, self.dones).tolist()
#discount/bootstrap off value fn
for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
rewards = rewards.tolist()
dones = dones.tolist()
if dones[-1] == 0:
rewards = discount_with_dones(rewards+[value], dones+[0], self.gamma)[:-1]
else:
rewards = discount_with_dones(rewards, dones, self.gamma)
mb_rewards[n] = rewards
mb_rewards = mb_rewards.flatten()
mb_actions = mb_actions.flatten()
mb_values = mb_values.flatten()
mb_masks = mb_masks.flatten()
return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
def learn(policy, env, seed, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100):
set_global_seeds(seed) set_global_seeds(seed)
nenvs = env.num_envs nenvs = env.num_envs
ob_space = env.observation_space policy = build_policy(env, network, **network_kwargs)
ac_space = env.action_space
model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, model = Model(policy=policy, env=env, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule) max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule)
if load_path is not None:
model.load(load_path)
runner = Runner(env, model, nsteps=nsteps, gamma=gamma) runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
nbatch = nenvs*nsteps nbatch = nenvs*nsteps
@@ -158,3 +175,4 @@ def learn(policy, env, seed, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5, e
logger.dump_tabular() logger.dump_tabular()
env.close() env.close()
return model return model
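To make the refactored `learn` entry point documented above concrete, here is a minimal usage sketch; the gym/DummyVecEnv setup and the hyperparameter values are assumptions for illustration (normally `baselines.run` builds the environment):
```python
# Minimal usage sketch for the new a2c.learn API documented above.
import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.a2c.a2c import learn

env = DummyVecEnv([lambda: gym.make('CartPole-v0')])  # learn() expects a VecEnv-like object

model = learn(
    network='mlp',                 # standard architecture name; a custom callable also works per the docstring
    env=env,
    seed=0,
    nsteps=5,
    total_timesteps=20000,
    num_hidden=32, num_layers=2,   # **network_kwargs forwarded to the mlp builder
)

obs = env.reset()
actions, values, state, _ = model.step(obs)  # the returned Model exposes step/value/save/load
env.close()
```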


@@ -1,146 +0,0 @@
import numpy as np
import tensorflow as tf
from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm
from baselines.common.distributions import make_pdtype
from baselines.common.input import observation_input
def nature_cnn(unscaled_images, **conv_kwargs):
"""
CNN from Nature paper.
"""
scaled_images = tf.cast(unscaled_images, tf.float32) / 255.
activ = tf.nn.relu
h = activ(conv(scaled_images, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2),
**conv_kwargs))
h2 = activ(conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), **conv_kwargs))
h3 = activ(conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), **conv_kwargs))
h3 = conv_to_fc(h3)
return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2)))
class LnLstmPolicy(object):
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
nenv = nbatch // nsteps
X, processed_x = observation_input(ob_space, nbatch)
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
self.pdtype = make_pdtype(ac_space)
with tf.variable_scope("model", reuse=reuse):
h = nature_cnn(processed_x)
xs = batch_to_seq(h, nenv, nsteps)
ms = batch_to_seq(M, nenv, nsteps)
h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
h5 = seq_to_batch(h5)
vf = fc(h5, 'v', 1)
self.pd, self.pi = self.pdtype.pdfromlatent(h5)
v0 = vf[:, 0]
a0 = self.pd.sample()
neglogp0 = self.pd.neglogp(a0)
self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
def step(ob, state, mask):
return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})
def value(ob, state, mask):
return sess.run(v0, {X:ob, S:state, M:mask})
self.X = X
self.M = M
self.S = S
self.vf = vf
self.step = step
self.value = value
class LstmPolicy(object):
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
nenv = nbatch // nsteps
self.pdtype = make_pdtype(ac_space)
X, processed_x = observation_input(ob_space, nbatch)
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
with tf.variable_scope("model", reuse=reuse):
h = nature_cnn(X)
xs = batch_to_seq(h, nenv, nsteps)
ms = batch_to_seq(M, nenv, nsteps)
h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
h5 = seq_to_batch(h5)
vf = fc(h5, 'v', 1)
self.pd, self.pi = self.pdtype.pdfromlatent(h5)
v0 = vf[:, 0]
a0 = self.pd.sample()
neglogp0 = self.pd.neglogp(a0)
self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
def step(ob, state, mask):
return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})
def value(ob, state, mask):
return sess.run(v0, {X:ob, S:state, M:mask})
self.X = X
self.M = M
self.S = S
self.vf = vf
self.step = step
self.value = value
class CnnPolicy(object):
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613
self.pdtype = make_pdtype(ac_space)
X, processed_x = observation_input(ob_space, nbatch)
with tf.variable_scope("model", reuse=reuse):
h = nature_cnn(processed_x, **conv_kwargs)
vf = fc(h, 'v', 1)[:,0]
self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)
a0 = self.pd.sample()
neglogp0 = self.pd.neglogp(a0)
self.initial_state = None
def step(ob, *_args, **_kwargs):
a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
return a, v, self.initial_state, neglogp
def value(ob, *_args, **_kwargs):
return sess.run(vf, {X:ob})
self.X = X
self.vf = vf
self.step = step
self.value = value
class MlpPolicy(object):
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
self.pdtype = make_pdtype(ac_space)
with tf.variable_scope("model", reuse=reuse):
X, processed_x = observation_input(ob_space, nbatch)
activ = tf.tanh
processed_x = tf.layers.flatten(processed_x)
pi_h1 = activ(fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
vf_h1 = activ(fc(processed_x, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
vf = fc(vf_h2, 'vf', 1)[:,0]
self.pd, self.pi = self.pdtype.pdfromlatent(pi_h2, init_scale=0.01)
a0 = self.pd.sample()
neglogp0 = self.pd.neglogp(a0)
self.initial_state = None
def step(ob, *_args, **_kwargs):
a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
return a, v, self.initial_state, neglogp
def value(ob, *_args, **_kwargs):
return sess.run(vf, {X:ob})
self.X = X
self.vf = vf
self.step = step
self.value = value


@@ -1,30 +0,0 @@
#!/usr/bin/env python3
from baselines import logger
from baselines.common.cmd_util import make_atari_env, atari_arg_parser
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
from baselines.a2c.a2c import learn
from baselines.ppo2.policies import CnnPolicy, LstmPolicy, LnLstmPolicy
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env):
if policy == 'cnn':
policy_fn = CnnPolicy
elif policy == 'lstm':
policy_fn = LstmPolicy
elif policy == 'lnlstm':
policy_fn = LnLstmPolicy
env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule)
env.close()
def main():
parser = atari_arg_parser()
parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm'], default='cnn')
parser.add_argument('--lrschedule', help='Learning rate schedule', choices=['constant', 'linear'], default='constant')
args = parser.parse_args()
logger.configure()
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed,
policy=args.policy, lrschedule=args.lrschedule, num_env=16)
if __name__ == '__main__':
main()

baselines/a2c/runner.py

@@ -0,0 +1,60 @@
import numpy as np
from baselines.a2c.utils import discount_with_dones
from baselines.common.runners import AbstractEnvRunner
class Runner(AbstractEnvRunner):
def __init__(self, env, model, nsteps=5, gamma=0.99):
super().__init__(env=env, model=model, nsteps=nsteps)
self.gamma = gamma
self.batch_action_shape = [x if x is not None else -1 for x in model.train_model.action.shape.as_list()]
self.ob_dtype = model.train_model.X.dtype.as_numpy_dtype
def run(self):
mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
mb_states = self.states
for n in range(self.nsteps):
actions, values, states, _ = self.model.step(self.obs, S=self.states, M=self.dones)
mb_obs.append(np.copy(self.obs))
mb_actions.append(actions)
mb_values.append(values)
mb_dones.append(self.dones)
obs, rewards, dones, _ = self.env.step(actions)
self.states = states
self.dones = dones
for n, done in enumerate(dones):
if done:
self.obs[n] = self.obs[n]*0
self.obs = obs
mb_rewards.append(rewards)
mb_dones.append(self.dones)
#batch of steps to batch of rollouts
mb_obs = np.asarray(mb_obs, dtype=self.ob_dtype).swapaxes(1, 0).reshape(self.batch_ob_shape)
mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
mb_actions = np.asarray(mb_actions, dtype=self.model.train_model.action.dtype.name).swapaxes(1, 0)
mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
mb_masks = mb_dones[:, :-1]
mb_dones = mb_dones[:, 1:]
if self.gamma > 0.0:
#discount/bootstrap off value fn
last_values = self.model.value(self.obs, S=self.states, M=self.dones).tolist()
for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
rewards = rewards.tolist()
dones = dones.tolist()
if dones[-1] == 0:
rewards = discount_with_dones(rewards+[value], dones+[0], self.gamma)[:-1]
else:
rewards = discount_with_dones(rewards, dones, self.gamma)
mb_rewards[n] = rewards
mb_actions = mb_actions.reshape(self.batch_action_shape)
mb_rewards = mb_rewards.flatten()
mb_values = mb_values.flatten()
mb_masks = mb_masks.flatten()
return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
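The discount/bootstrap branch above is the subtle part of the runner, so here is a small self-contained illustration of what `discount_with_dones` computes (reimplemented here rather than imported, so the numbers can be checked by hand; the repo version lives in baselines/a2c/utils.py):
```python
# Stand-alone illustration of the discount/bootstrap step in Runner.run() above.
def discount_with_dones(rewards, dones, gamma):
    discounted, r = [], 0.0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        r = reward + gamma * r * (1. - done)  # the running return is reset at episode boundaries
        discounted.append(r)
    return discounted[::-1]

rewards = [1.0, 1.0, 1.0]
dones = [0, 0, 0]              # episode did not terminate within the rollout
last_value = 5.0               # model.value(...) of the final observation

# As in the runner: append the bootstrap value, discount, then drop it again.
returns = discount_with_dones(rewards + [last_value], dones + [0], gamma=0.9)[:-1]
print(returns)                 # approximately [6.355, 5.95, 5.5]

# If the rollout ended with done=1, no bootstrap value is appended:
print(discount_with_dones(rewards, [0, 0, 1], gamma=0.9))  # approximately [2.71, 1.9, 1.0]
```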


@@ -1,8 +1,6 @@
import os import os
import gym
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
from gym import spaces
from collections import deque from collections import deque
def sample(logits): def sample(logits):
@@ -10,18 +8,15 @@ def sample(logits):
return tf.argmax(logits - tf.log(-tf.log(noise)), 1) return tf.argmax(logits - tf.log(-tf.log(noise)), 1)
def cat_entropy(logits): def cat_entropy(logits):
a0 = logits - tf.reduce_max(logits, 1, keep_dims=True) a0 = logits - tf.reduce_max(logits, 1, keepdims=True)
ea0 = tf.exp(a0) ea0 = tf.exp(a0)
z0 = tf.reduce_sum(ea0, 1, keep_dims=True) z0 = tf.reduce_sum(ea0, 1, keepdims=True)
p0 = ea0 / z0 p0 = ea0 / z0
return tf.reduce_sum(p0 * (tf.log(z0) - a0), 1) return tf.reduce_sum(p0 * (tf.log(z0) - a0), 1)
def cat_entropy_softmax(p0): def cat_entropy_softmax(p0):
return - tf.reduce_sum(p0 * tf.log(p0 + 1e-6), axis = 1) return - tf.reduce_sum(p0 * tf.log(p0 + 1e-6), axis = 1)
def mse(pred, target):
return tf.square(pred-target)/2.
def ortho_init(scale=1.0): def ortho_init(scale=1.0):
def _ortho_init(shape, dtype, partition_info=None): def _ortho_init(shape, dtype, partition_info=None):
#lasagne ortho init for tf #lasagne ortho init for tf
@@ -58,7 +53,7 @@ def conv(x, scope, *, nf, rf, stride, pad='VALID', init_scale=1.0, data_format='
b = tf.get_variable("b", bias_var_shape, initializer=tf.constant_initializer(0.0)) b = tf.get_variable("b", bias_var_shape, initializer=tf.constant_initializer(0.0))
if not one_dim_bias and data_format == 'NHWC': if not one_dim_bias and data_format == 'NHWC':
b = tf.reshape(b, bshape) b = tf.reshape(b, bshape)
return b + tf.nn.conv2d(x, w, strides=strides, padding=pad, data_format=data_format) return tf.nn.conv2d(x, w, strides=strides, padding=pad, data_format=data_format) + b
def fc(x, scope, nh, *, init_scale=1.0, init_bias=0.0): def fc(x, scope, nh, *, init_scale=1.0, init_bias=0.0):
with tf.variable_scope(scope): with tf.variable_scope(scope):
@@ -85,7 +80,6 @@ def seq_to_batch(h, flat = False):
def lstm(xs, ms, s, scope, nh, init_scale=1.0): def lstm(xs, ms, s, scope, nh, init_scale=1.0):
nbatch, nin = [v.value for v in xs[0].get_shape()] nbatch, nin = [v.value for v in xs[0].get_shape()]
nsteps = len(xs)
with tf.variable_scope(scope): with tf.variable_scope(scope):
wx = tf.get_variable("wx", [nin, nh*4], initializer=ortho_init(init_scale)) wx = tf.get_variable("wx", [nin, nh*4], initializer=ortho_init(init_scale))
wh = tf.get_variable("wh", [nh, nh*4], initializer=ortho_init(init_scale)) wh = tf.get_variable("wh", [nh, nh*4], initializer=ortho_init(init_scale))
@@ -115,7 +109,6 @@ def _ln(x, g, b, e=1e-5, axes=[1]):
def lnlstm(xs, ms, s, scope, nh, init_scale=1.0): def lnlstm(xs, ms, s, scope, nh, init_scale=1.0):
nbatch, nin = [v.value for v in xs[0].get_shape()] nbatch, nin = [v.value for v in xs[0].get_shape()]
nsteps = len(xs)
with tf.variable_scope(scope): with tf.variable_scope(scope):
wx = tf.get_variable("wx", [nin, nh*4], initializer=ortho_init(init_scale)) wx = tf.get_variable("wx", [nin, nh*4], initializer=ortho_init(init_scale))
gx = tf.get_variable("gx", [nh*4], initializer=tf.constant_initializer(1.0)) gx = tf.get_variable("gx", [nh*4], initializer=tf.constant_initializer(1.0))
@@ -160,8 +153,7 @@ def discount_with_dones(rewards, dones, gamma):
return discounted[::-1] return discounted[::-1]
def find_trainable_variables(key): def find_trainable_variables(key):
with tf.variable_scope(key): return tf.trainable_variables(key)
return tf.trainable_variables()
def make_path(f): def make_path(f):
return os.makedirs(f, exist_ok=True) return os.makedirs(f, exist_ok=True)


@@ -1,20 +1,20 @@
import time import time
import joblib import functools
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
from baselines import logger from baselines import logger
from baselines.common import set_global_seeds from baselines.common import set_global_seeds
from baselines.common.runners import AbstractEnvRunner from baselines.common.policies import build_policy
from baselines.common.tf_util import get_session, save_variables
from baselines.a2c.utils import batch_to_seq, seq_to_batch from baselines.a2c.utils import batch_to_seq, seq_to_batch
from baselines.a2c.utils import Scheduler, make_path, find_trainable_variables
from baselines.a2c.utils import cat_entropy_softmax from baselines.a2c.utils import cat_entropy_softmax
from baselines.a2c.utils import Scheduler, find_trainable_variables
from baselines.a2c.utils import EpisodeStats from baselines.a2c.utils import EpisodeStats
from baselines.a2c.utils import get_by_index, check_shape, avg_norm, gradient_add, q_explained_variance from baselines.a2c.utils import get_by_index, check_shape, avg_norm, gradient_add, q_explained_variance
from baselines.acer.buffer import Buffer from baselines.acer.buffer import Buffer
from baselines.acer.runner import Runner
import os.path as osp
# remove last step # remove last step
def strip(var, nenvs, nsteps, flat = False): def strip(var, nenvs, nsteps, flat = False):
@@ -59,10 +59,8 @@ class Model(object):
ent_coef, q_coef, gamma, max_grad_norm, lr, ent_coef, q_coef, gamma, max_grad_norm, lr,
rprop_alpha, rprop_epsilon, total_timesteps, lrschedule, rprop_alpha, rprop_epsilon, total_timesteps, lrschedule,
c, trust_region, alpha, delta): c, trust_region, alpha, delta):
config = tf.ConfigProto(allow_soft_placement=True,
intra_op_parallelism_threads=num_procs, sess = get_session()
inter_op_parallelism_threads=num_procs)
sess = tf.Session(config=config)
nact = ac_space.n nact = ac_space.n
nbatch = nenvs * nsteps nbatch = nenvs * nsteps
@@ -73,10 +71,15 @@ class Model(object):
LR = tf.placeholder(tf.float32, []) LR = tf.placeholder(tf.float32, [])
eps = 1e-6 eps = 1e-6
step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) step_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs,) + ob_space.shape[:-1] + (ob_space.shape[-1] * nstack,))
train_model = policy(sess, ob_space, ac_space, nenvs, nsteps + 1, nstack, reuse=True) train_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs*(nsteps+1),) + ob_space.shape[:-1] + (ob_space.shape[-1] * nstack,))
with tf.variable_scope('acer_model', reuse=tf.AUTO_REUSE):
params = find_trainable_variables("model") step_model = policy(observ_placeholder=step_ob_placeholder, sess=sess)
train_model = policy(observ_placeholder=train_ob_placeholder, sess=sess)
params = find_trainable_variables("acer_model")
print("Params {}".format(len(params))) print("Params {}".format(len(params)))
for var in params: for var in params:
print(var) print(var)
@@ -90,14 +93,20 @@ class Model(object):
print(v.name) print(v.name)
return v return v
with tf.variable_scope("", custom_getter=custom_getter, reuse=True): with tf.variable_scope("acer_model", custom_getter=custom_getter, reuse=True):
polyak_model = policy(sess, ob_space, ac_space, nenvs, nsteps + 1, nstack, reuse=True) polyak_model = policy(observ_placeholder=train_ob_placeholder, sess=sess)
# Notation: (var) = batch variable, (var)s = seqeuence variable, (var)_i = variable index by action at step i # Notation: (var) = batch variable, (var)s = seqeuence variable, (var)_i = variable index by action at step i
v = tf.reduce_sum(train_model.pi * train_model.q, axis = -1) # shape is [nenvs * (nsteps + 1)]
# action probability distributions according to train_model, polyak_model and step_model
# policy.pi holds the probability distribution parameters; to obtain a distribution that sums to 1, take the softmax
train_model_p = tf.nn.softmax(train_model.pi)
polyak_model_p = tf.nn.softmax(polyak_model.pi)
step_model_p = tf.nn.softmax(step_model.pi)
v = tf.reduce_sum(train_model_p * train_model.q, axis = -1) # shape is [nenvs * (nsteps + 1)]
# strip off last step # strip off last step
f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [train_model.pi, polyak_model.pi, train_model.q]) f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [train_model_p, polyak_model_p, train_model.q])
# Get pi and q values for actions taken # Get pi and q values for actions taken
f_i = get_by_index(f, A) f_i = get_by_index(f, A)
q_i = get_by_index(q, A) q_i = get_by_index(q, A)
@@ -111,6 +120,7 @@ class Model(object):
# Calculate losses # Calculate losses
# Entropy # Entropy
# entropy = tf.reduce_mean(strip(train_model.pd.entropy(), nenvs, nsteps))
entropy = tf.reduce_mean(cat_entropy_softmax(f)) entropy = tf.reduce_mean(cat_entropy_softmax(f))
# Policy Gradient loss, with truncated importance sampling & bias correction # Policy Gradient loss, with truncated importance sampling & bias correction
@@ -192,80 +202,29 @@ class Model(object):
def train(obs, actions, rewards, dones, mus, states, masks, steps): def train(obs, actions, rewards, dones, mus, states, masks, steps):
cur_lr = lr.value_steps(steps) cur_lr = lr.value_steps(steps)
td_map = {train_model.X: obs, polyak_model.X: obs, A: actions, R: rewards, D: dones, MU: mus, LR: cur_lr} td_map = {train_model.X: obs, polyak_model.X: obs, A: actions, R: rewards, D: dones, MU: mus, LR: cur_lr}
if states != []: if states is not None:
td_map[train_model.S] = states td_map[train_model.S] = states
td_map[train_model.M] = masks td_map[train_model.M] = masks
td_map[polyak_model.S] = states td_map[polyak_model.S] = states
td_map[polyak_model.M] = masks td_map[polyak_model.M] = masks
return names_ops, sess.run(run_ops, td_map)[1:] # strip off _train return names_ops, sess.run(run_ops, td_map)[1:] # strip off _train
def save(save_path): def _step(observation, **kwargs):
ps = sess.run(params) return step_model._evaluate([step_model.action, step_model_p, step_model.state], observation, **kwargs)
make_path(osp.dirname(save_path))
joblib.dump(ps, save_path)
self.train = train self.train = train
self.save = save self.save = functools.partial(save_variables, sess=sess, variables=params)
self.train_model = train_model self.train_model = train_model
self.step_model = step_model self.step_model = step_model
self.step = step_model.step self._step = _step
self.step = self.step_model.step
self.initial_state = step_model.initial_state self.initial_state = step_model.initial_state
tf.global_variables_initializer().run(session=sess) tf.global_variables_initializer().run(session=sess)
class Runner(AbstractEnvRunner):
def __init__(self, env, model, nsteps, nstack):
super().__init__(env=env, model=model, nsteps=nsteps)
self.nstack = nstack
nh, nw, nc = env.observation_space.shape
self.nc = nc # nc = 1 for atari, but just in case
self.nenv = nenv = env.num_envs
self.nact = env.action_space.n
self.nbatch = nenv * nsteps
self.batch_ob_shape = (nenv*(nsteps+1), nh, nw, nc*nstack)
self.obs = np.zeros((nenv, nh, nw, nc * nstack), dtype=np.uint8)
obs = env.reset()
self.update_obs(obs)
def update_obs(self, obs, dones=None):
if dones is not None:
self.obs *= (1 - dones.astype(np.uint8))[:, None, None, None]
self.obs = np.roll(self.obs, shift=-self.nc, axis=3)
self.obs[:, :, :, -self.nc:] = obs[:, :, :, :]
def run(self):
enc_obs = np.split(self.obs, self.nstack, axis=3) # so now list of obs steps
mb_obs, mb_actions, mb_mus, mb_dones, mb_rewards = [], [], [], [], []
for _ in range(self.nsteps):
actions, mus, states = self.model.step(self.obs, state=self.states, mask=self.dones)
mb_obs.append(np.copy(self.obs))
mb_actions.append(actions)
mb_mus.append(mus)
mb_dones.append(self.dones)
obs, rewards, dones, _ = self.env.step(actions)
# states information for statefull models like LSTM
self.states = states
self.dones = dones
self.update_obs(obs, dones)
mb_rewards.append(rewards)
enc_obs.append(obs)
mb_obs.append(np.copy(self.obs))
mb_dones.append(self.dones)
enc_obs = np.asarray(enc_obs, dtype=np.uint8).swapaxes(1, 0)
mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0)
mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
mb_mus = np.asarray(mb_mus, dtype=np.float32).swapaxes(1, 0)
mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
mb_masks = mb_dones # Used for statefull models like LSTM's to mask state when done
mb_dones = mb_dones[:, 1:] # Used for calculating returns. The dones array is now aligned with rewards
# shapes are now [nenv, nsteps, []]
# When pulling from buffer, arrays will now be reshaped in place, preventing a deep copy.
return enc_obs, mb_obs, mb_actions, mb_rewards, mb_mus, mb_dones, mb_masks
class Acer(): class Acer():
def __init__(self, runner, model, buffer, log_interval): def __init__(self, runner, model, buffer, log_interval):
@@ -311,19 +270,84 @@ class Acer():
logger.dump_tabular() logger.dump_tabular()
def learn(policy, env, seed, nsteps=20, nstack=4, total_timesteps=int(80e6), q_coef=0.5, ent_coef=0.01, def learn(network, env, seed=None, nsteps=20, nstack=4, total_timesteps=int(80e6), q_coef=0.5, ent_coef=0.01,
max_grad_norm=10, lr=7e-4, lrschedule='linear', rprop_epsilon=1e-5, rprop_alpha=0.99, gamma=0.99, max_grad_norm=10, lr=7e-4, lrschedule='linear', rprop_epsilon=1e-5, rprop_alpha=0.99, gamma=0.99,
log_interval=100, buffer_size=50000, replay_ratio=4, replay_start=10000, c=10.0, log_interval=100, buffer_size=50000, replay_ratio=4, replay_start=10000, c=10.0,
trust_region=True, alpha=0.99, delta=1): trust_region=True, alpha=0.99, delta=1, load_path=None, **network_kwargs):
'''
Main entrypoint for the ACER (Actor-Critic with Experience Replay) algorithm (https://arxiv.org/pdf/1611.01224.pdf)
Trains an agent with the given network architecture on the given environment using ACER.
Parameters:
----------
network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
See baselines.common/policies.py/lstm for more details on using recurrent nets in policies
env: environment. Needs to be vectorized for parallel environment simulation.
The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.
nsteps: int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
nenv is number of environment copies simulated in parallel) (default: 20)
nstack: int, size of the frame stack, i.e. number of the frames passed to the step model. Frames are stacked along channel dimension
(last image dimension) (default: 4)
total_timesteps: int, number of timesteps (i.e. number of actions taken in the environment) (default: 80M)
q_coef: float, value function loss coefficient in the optimization objective (analog of vf_coef for other actor-critic methods)
ent_coef: float, policy entropy coefficient in the optimization objective (default: 0.01)
max_grad_norm: float, gradient norm clipping coefficient. If set to None, no clipping. (default: 10),
lr: float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4)
lrschedule: schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and
returns fraction of the learning rate (specified as lr) as output
rprop_epsilon: float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5)
rprop_alpha: float, RMSProp decay parameter (default: 0.99)
gamma: float, reward discounting factor (default: 0.99)
log_interval: int, number of updates between logging events (default: 100)
buffer_size: int, size of the replay buffer (default: 50k)
replay_ratio: int, how many (on average) batches of data to sample from the replay buffer for each batch taken from the environment (default: 4)
replay_start: int, the sampling from the replay buffer does not start until replay buffer has at least that many samples (default: 10k)
c: float, importance weight clipping factor (default: 10)
trust_region: bool, whether or not the algorithm estimates the gradient of the KL divergence between the old and updated policy and uses it to determine the step size (default: True)
delta: float, max KL divergence between the old policy and updated policy (default: 1)
alpha: float, momentum factor in the Polyak (exponential moving average) averaging of the model parameters (default: 0.99)
load_path: str, path to load the model from (default: None)
**network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
For instance, 'mlp' network architecture has arguments num_hidden and num_layers.
'''
print("Running Acer Simple") print("Running Acer Simple")
print(locals()) print(locals())
tf.reset_default_graph()
set_global_seeds(seed) set_global_seeds(seed)
policy = build_policy(env, network, estimate_q=True, **network_kwargs)
nenvs = env.num_envs nenvs = env.num_envs
ob_space = env.observation_space ob_space = env.observation_space
ac_space = env.action_space ac_space = env.action_space
num_procs = len(env.remotes) # HACK num_procs = len(env.remotes) if hasattr(env, 'remotes') else 1# HACK
model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, nstack=nstack, model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, nstack=nstack,
num_procs=num_procs, ent_coef=ent_coef, q_coef=q_coef, gamma=gamma, num_procs=num_procs, ent_coef=ent_coef, q_coef=q_coef, gamma=gamma,
max_grad_norm=max_grad_norm, lr=lr, rprop_alpha=rprop_alpha, rprop_epsilon=rprop_epsilon, max_grad_norm=max_grad_norm, lr=lr, rprop_alpha=rprop_alpha, rprop_epsilon=rprop_epsilon,
@@ -338,6 +362,7 @@ def learn(policy, env, seed, nsteps=20, nstack=4, total_timesteps=int(80e6), q_c
nbatch = nenvs*nsteps nbatch = nenvs*nsteps
acer = Acer(runner, model, buffer, log_interval) acer = Acer(runner, model, buffer, log_interval)
acer.tstart = time.time() acer.tstart = time.time()
for acer.steps in range(0, total_timesteps, nbatch): #nbatch samples, 1 on_policy call and multiple off-policy calls for acer.steps in range(0, total_timesteps, nbatch): #nbatch samples, 1 on_policy call and multiple off-policy calls
acer.call(on_policy=True) acer.call(on_policy=True)
if replay_ratio > 0 and buffer.has_atleast(replay_start): if replay_ratio > 0 and buffer.has_atleast(replay_start):
@@ -346,3 +371,4 @@ def learn(policy, env, seed, nsteps=20, nstack=4, total_timesteps=int(80e6), q_c
acer.call(on_policy=False) # no simulation steps in this acer.call(on_policy=False) # no simulation steps in this
env.close() env.close()
return model
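As with a2c, a quick usage sketch for the new ACER entry point documented above; the module path (`baselines.acer.acer_simple`, as in the removed run_atari script) and the environment setup are assumptions for illustration:
```python
# Minimal usage sketch for the refactored ACER learn API documented above.
# baselines.run (python -m baselines.run --alg=acer --env=PongNoFrameskip-v4)
# normally handles environment construction for you.
from baselines.common.cmd_util import make_atari_env
from baselines.acer.acer_simple import learn  # assumed module path

env = make_atari_env('PongNoFrameskip-v4', num_env=4, seed=0)  # ACER stacks frames itself via nstack

model = learn(
    network='cnn',            # standard architecture name from common/models.py
    env=env,
    seed=0,
    nsteps=20,
    nstack=4,                 # frames passed to the step model, stacked along the channel dimension
    total_timesteps=int(1e6),
    replay_ratio=4,           # off-policy batches per on-policy batch, on average
)
env.close()
```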


@@ -0,0 +1,4 @@
def atari():
return dict(
lrschedule='constant'
)


@@ -1,6 +1,6 @@
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
from baselines.ppo2.policies import nature_cnn from baselines.common.policies import nature_cnn
from baselines.a2c.utils import fc, batch_to_seq, seq_to_batch, lstm, sample from baselines.a2c.utils import fc, batch_to_seq, seq_to_batch, lstm, sample
@@ -18,11 +18,13 @@ class AcerCnnPolicy(object):
pi = tf.nn.softmax(pi_logits) pi = tf.nn.softmax(pi_logits)
q = fc(h, 'q', nact) q = fc(h, 'q', nact)
a = sample(pi_logits) # could change this to use self.pi instead a = sample(tf.nn.softmax(pi_logits)) # could change this to use self.pi instead
self.initial_state = [] # not stateful self.initial_state = [] # not stateful
self.X = X self.X = X
self.pi = pi # actual policy params now self.pi = pi # actual policy params now
self.pi_logits = pi_logits
self.q = q self.q = q
self.vf = q
def step(ob, *args, **kwargs): def step(ob, *args, **kwargs):
# returns actions, mus, states # returns actions, mus, states


@@ -1,30 +0,0 @@
#!/usr/bin/env python3
from baselines import logger
from baselines.acer.acer_simple import learn
from baselines.acer.policies import AcerCnnPolicy, AcerLstmPolicy
from baselines.common.cmd_util import make_atari_env, atari_arg_parser
def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu):
env = make_atari_env(env_id, num_cpu, seed)
if policy == 'cnn':
policy_fn = AcerCnnPolicy
elif policy == 'lstm':
policy_fn = AcerLstmPolicy
else:
print("Policy {} not implemented".format(policy))
return
learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule)
env.close()
def main():
parser = atari_arg_parser()
parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm'], default='cnn')
parser.add_argument('--lrschedule', help='Learning rate schedule', choices=['constant', 'linear'], default='constant')
parser.add_argument('--logdir', help ='Directory for logging')
args = parser.parse_args()
logger.configure(args.logdir)
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed,
policy=args.policy, lrschedule=args.lrschedule, num_cpu=16)
if __name__ == '__main__':
main()

baselines/acer/runner.py

@@ -0,0 +1,60 @@
import numpy as np
from baselines.common.runners import AbstractEnvRunner
class Runner(AbstractEnvRunner):
def __init__(self, env, model, nsteps, nstack):
super().__init__(env=env, model=model, nsteps=nsteps)
self.nstack = nstack
nh, nw, nc = env.observation_space.shape
self.nc = nc # nc = 1 for atari, but just in case
self.nact = env.action_space.n
nenv = self.nenv
self.nbatch = nenv * nsteps
self.batch_ob_shape = (nenv*(nsteps+1), nh, nw, nc*nstack)
self.obs = np.zeros((nenv, nh, nw, nc * nstack), dtype=np.uint8)
obs = env.reset()
self.update_obs(obs)
def update_obs(self, obs, dones=None):
#self.obs = obs
if dones is not None:
self.obs *= (1 - dones.astype(np.uint8))[:, None, None, None]
self.obs = np.roll(self.obs, shift=-self.nc, axis=3)
self.obs[:, :, :, -self.nc:] = obs[:, :, :, :]
def run(self):
enc_obs = np.split(self.obs, self.nstack, axis=3) # so now list of obs steps
mb_obs, mb_actions, mb_mus, mb_dones, mb_rewards = [], [], [], [], []
for _ in range(self.nsteps):
actions, mus, states = self.model._step(self.obs, S=self.states, M=self.dones)
mb_obs.append(np.copy(self.obs))
mb_actions.append(actions)
mb_mus.append(mus)
mb_dones.append(self.dones)
obs, rewards, dones, _ = self.env.step(actions)
# states information for statefull models like LSTM
self.states = states
self.dones = dones
self.update_obs(obs, dones)
mb_rewards.append(rewards)
enc_obs.append(obs)
mb_obs.append(np.copy(self.obs))
mb_dones.append(self.dones)
enc_obs = np.asarray(enc_obs, dtype=np.uint8).swapaxes(1, 0)
mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0)
mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
mb_mus = np.asarray(mb_mus, dtype=np.float32).swapaxes(1, 0)
mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
mb_masks = mb_dones # Used for statefull models like LSTM's to mask state when done
mb_dones = mb_dones[:, 1:] # Used for calculating returns. The dones array is now aligned with rewards
# shapes are now [nenv, nsteps, []]
# When pulling from buffer, arrays will now be reshaped in place, preventing a deep copy.
return enc_obs, mb_obs, mb_actions, mb_rewards, mb_mus, mb_dones, mb_masks

baselines/acktr/acktr.py

@@ -0,0 +1 @@
from baselines.acktr.acktr_disc import *


@@ -1,16 +1,17 @@
import os.path as osp import os.path as osp
import time import time
import joblib import functools
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
from baselines import logger from baselines import logger
from baselines.common import set_global_seeds, explained_variance from baselines.common import set_global_seeds, explained_variance
from baselines.common.policies import build_policy
from baselines.common.tf_util import get_session, save_variables, load_variables
from baselines.a2c.a2c import Runner from baselines.a2c.runner import Runner
from baselines.a2c.utils import discount_with_dones from baselines.a2c.utils import discount_with_dones
from baselines.a2c.utils import Scheduler, find_trainable_variables from baselines.a2c.utils import Scheduler, find_trainable_variables
from baselines.a2c.utils import cat_entropy, mse
from baselines.acktr import kfac from baselines.acktr import kfac
@@ -19,11 +20,8 @@ class Model(object):
def __init__(self, policy, ob_space, ac_space, nenvs,total_timesteps, nprocs=32, nsteps=20, def __init__(self, policy, ob_space, ac_space, nenvs,total_timesteps, nprocs=32, nsteps=20,
ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
kfac_clip=0.001, lrschedule='linear'): kfac_clip=0.001, lrschedule='linear'):
config = tf.ConfigProto(allow_soft_placement=True,
intra_op_parallelism_threads=nprocs, self.sess = sess = get_session()
inter_op_parallelism_threads=nprocs)
config.gpu_options.allow_growth = True
self.sess = sess = tf.Session(config=config)
nact = ac_space.n nact = ac_space.n
nbatch = nenvs * nsteps nbatch = nenvs * nsteps
A = tf.placeholder(tf.int32, [nbatch]) A = tf.placeholder(tf.int32, [nbatch])
@@ -32,27 +30,28 @@ class Model(object):
PG_LR = tf.placeholder(tf.float32, []) PG_LR = tf.placeholder(tf.float32, [])
VF_LR = tf.placeholder(tf.float32, []) VF_LR = tf.placeholder(tf.float32, [])
self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) with tf.variable_scope('acktr_model', reuse=tf.AUTO_REUSE):
self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True) self.model = step_model = policy(nenvs, 1, sess=sess)
self.model2 = train_model = policy(nenvs*nsteps, nsteps, sess=sess)
logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A) neglogpac = train_model.pd.neglogp(A)
self.logits = logits = train_model.pi self.logits = logits = train_model.pi
##training loss ##training loss
pg_loss = tf.reduce_mean(ADV*logpac) pg_loss = tf.reduce_mean(ADV*neglogpac)
entropy = tf.reduce_mean(cat_entropy(train_model.pi)) entropy = tf.reduce_mean(train_model.pd.entropy())
pg_loss = pg_loss - ent_coef * entropy pg_loss = pg_loss - ent_coef * entropy
vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) vf_loss = tf.losses.mean_squared_error(tf.squeeze(train_model.vf), R)
train_loss = pg_loss + vf_coef * vf_loss train_loss = pg_loss + vf_coef * vf_loss
##Fisher loss construction ##Fisher loss construction
self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac) self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(neglogpac)
sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf)) sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf))
self.vf_fisher = vf_fisher_loss = - vf_fisher_coef*tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2)) self.vf_fisher = vf_fisher_loss = - vf_fisher_coef*tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss
self.params=params = find_trainable_variables("model") self.params=params = find_trainable_variables("acktr_model")
self.grads_check = grads = tf.gradients(train_loss,params) self.grads_check = grads = tf.gradients(train_loss,params)
@@ -82,22 +81,10 @@ class Model(object):
) )
return policy_loss, value_loss, policy_entropy return policy_loss, value_loss, policy_entropy
def save(save_path):
ps = sess.run(params)
joblib.dump(ps, save_path)
def load(load_path):
loaded_params = joblib.load(load_path)
restores = []
for p, loaded_p in zip(params, loaded_params):
restores.append(p.assign(loaded_p))
sess.run(restores)
self.train = train self.train = train
self.save = save self.save = functools.partial(save_variables, sess=sess)
self.load = load self.load = functools.partial(load_variables, sess=sess)
self.train_model = train_model self.train_model = train_model
self.step_model = step_model self.step_model = step_model
self.step = step_model.step self.step = step_model.step
@@ -105,12 +92,17 @@ class Model(object):
self.initial_state = step_model.initial_state self.initial_state = step_model.initial_state
tf.global_variables_initializer().run(session=sess) tf.global_variables_initializer().run(session=sess)
def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20, def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20,
ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
kfac_clip=0.001, save_interval=None, lrschedule='linear'): kfac_clip=0.001, save_interval=None, lrschedule='linear', load_path=None, **network_kwargs):
tf.reset_default_graph()
set_global_seeds(seed) set_global_seeds(seed)
if network == 'cnn':
network_kwargs['one_dim_bias'] = True
policy = build_policy(env, network, **network_kwargs)
nenvs = env.num_envs nenvs = env.num_envs
ob_space = env.observation_space ob_space = env.observation_space
ac_space = env.action_space ac_space = env.action_space
@@ -124,6 +116,9 @@ def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval
fh.write(cloudpickle.dumps(make_model)) fh.write(cloudpickle.dumps(make_model))
model = make_model() model = make_model()
if load_path is not None:
model.load(load_path)
runner = Runner(env, model, nsteps=nsteps, gamma=gamma) runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
nbatch = nenvs*nsteps nbatch = nenvs*nsteps
tstart = time.time() tstart = time.time()
@@ -153,3 +148,4 @@ def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval
coord.request_stop() coord.request_stop()
coord.join(enqueue_threads) coord.join(enqueue_threads)
env.close() env.close()
return model


@@ -6,11 +6,11 @@ from baselines import logger
from baselines.acktr.acktr_disc import learn from baselines.acktr.acktr_disc import learn
from baselines.common.cmd_util import make_atari_env, atari_arg_parser from baselines.common.cmd_util import make_atari_env, atari_arg_parser
from baselines.common.vec_env.vec_frame_stack import VecFrameStack from baselines.common.vec_env.vec_frame_stack import VecFrameStack
from baselines.ppo2.policies import CnnPolicy from baselines.common.policies import cnn
def train(env_id, num_timesteps, seed, num_cpu): def train(env_id, num_timesteps, seed, num_cpu):
env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4) env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
policy_fn = partial(CnnPolicy, one_dim_bias=True) policy_fn = cnn(env=env, one_dim_bias=True)
learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu) learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu)
env.close() env.close()


@@ -59,7 +59,7 @@ register_benchmark({
register_benchmark({ register_benchmark({
'name': 'Atari10M', 'name': 'Atari10M',
'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 10M timesteps', 'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 10M timesteps',
'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atari7] 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 6, 'num_timesteps': int(10e6)} for _game in _atari7]
}) })
register_benchmark({ register_benchmark({
@@ -84,8 +84,9 @@ _mujocosmall = [
register_benchmark({ register_benchmark({
'name': 'Mujoco1M', 'name': 'Mujoco1M',
'description': 'Some small 2D MuJoCo tasks, run for 1M timesteps', 'description': 'Some small 2D MuJoCo tasks, run for 1M timesteps',
'tasks': [{'env_id': _envid, 'trials': 3, 'num_timesteps': int(1e6)} for _envid in _mujocosmall] 'tasks': [{'env_id': _envid, 'trials': 6, 'num_timesteps': int(1e6)} for _envid in _mujocosmall]
}) })
register_benchmark({ register_benchmark({
'name': 'MujocoWalkers', 'name': 'MujocoWalkers',
'description': 'MuJoCo forward walkers, run for 8M, humanoid 100M', 'description': 'MuJoCo forward walkers, run for 8M, humanoid 100M',


@@ -112,6 +112,8 @@ def load_results(dir):
with open(fname, 'rt') as fh: with open(fname, 'rt') as fh:
if fname.endswith('csv'): if fname.endswith('csv'):
firstline = fh.readline() firstline = fh.readline()
if not firstline:
continue
assert firstline[0] == '#' assert firstline[0] == '#'
header = json.loads(firstline[1:]) header = json.loads(firstline[1:])
df = pandas.read_csv(fh, index_col=None) df = pandas.read_csv(fh, index_col=None)


@@ -1,4 +1,6 @@
import numpy as np import numpy as np
import os
os.environ.setdefault('PATH', '')
from collections import deque from collections import deque
import gym import gym
from gym import spaces from gym import spaces
@@ -154,7 +156,7 @@ class FrameStack(gym.Wrapper):
self.k = k self.k = k
self.frames = deque([], maxlen=k) self.frames = deque([], maxlen=k)
shp = env.observation_space.shape shp = env.observation_space.shape
self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=np.uint8) self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=env.observation_space.dtype)
def reset(self): def reset(self):
ob = self.env.reset() ob = self.env.reset()
@@ -174,6 +176,7 @@ class FrameStack(gym.Wrapper):
class ScaledFloatFrame(gym.ObservationWrapper): class ScaledFloatFrame(gym.ObservationWrapper):
def __init__(self, env): def __init__(self, env):
gym.ObservationWrapper.__init__(self, env) gym.ObservationWrapper.__init__(self, env)
self.observation_space = gym.spaces.Box(low=0, high=1, shape=env.observation_space.shape, dtype=np.float32)
def observation(self, observation): def observation(self, observation):
# careful! This undoes the memory optimization, use # careful! This undoes the memory optimization, use


@@ -3,7 +3,11 @@ Helpers for scripts like run_atari.py.
""" """
import os import os
try:
from mpi4py import MPI from mpi4py import MPI
except ImportError:
MPI = None
import gym import gym
from gym.wrappers import FlattenDictWrapper from gym.wrappers import FlattenDictWrapper
from baselines import logger from baselines import logger
@@ -17,25 +21,32 @@ def make_atari_env(env_id, num_env, seed, wrapper_kwargs=None, start_index=0):
Create a wrapped, monitored SubprocVecEnv for Atari. Create a wrapped, monitored SubprocVecEnv for Atari.
""" """
if wrapper_kwargs is None: wrapper_kwargs = {} if wrapper_kwargs is None: wrapper_kwargs = {}
mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
def make_env(rank): # pylint: disable=C0111 def make_env(rank): # pylint: disable=C0111
def _thunk(): def _thunk():
env = make_atari(env_id) env = make_atari(env_id)
env.seed(seed + rank) env.seed(seed + 10000*mpi_rank + rank if seed is not None else None)
env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(mpi_rank) + '.' + str(rank)))
return wrap_deepmind(env, **wrapper_kwargs) return wrap_deepmind(env, **wrapper_kwargs)
return _thunk return _thunk
set_global_seeds(seed) set_global_seeds(seed)
return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)]) return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)])
def make_mujoco_env(env_id, seed): def make_mujoco_env(env_id, seed, reward_scale=1.0):
""" """
Create a wrapped, monitored gym.Env for MuJoCo. Create a wrapped, monitored gym.Env for MuJoCo.
""" """
rank = MPI.COMM_WORLD.Get_rank() rank = MPI.COMM_WORLD.Get_rank()
set_global_seeds(seed + 10000 * rank) myseed = seed + 1000 * rank if seed is not None else None
set_global_seeds(myseed)
env = gym.make(env_id) env = gym.make(env_id)
env = Monitor(env, os.path.join(logger.get_dir(), str(rank))) env = Monitor(env, os.path.join(logger.get_dir(), str(rank)), allow_early_resets=True)
env.seed(seed) env.seed(seed)
if reward_scale != 1.0:
from baselines.common.retro_wrappers import RewardScaler
env = RewardScaler(env, reward_scale)
return env return env
def make_robotics_env(env_id, seed, rank=0): def make_robotics_env(env_id, seed, rank=0):
@@ -62,20 +73,27 @@ def atari_arg_parser():
""" """
Create an argparse.ArgumentParser for run_atari.py. Create an argparse.ArgumentParser for run_atari.py.
""" """
parser = arg_parser() print('Obsolete - use common_arg_parser instead')
parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4') return common_arg_parser()
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
parser.add_argument('--num-timesteps', type=int, default=int(10e6))
return parser
def mujoco_arg_parser(): def mujoco_arg_parser():
print('Obsolete - use common_arg_parser instead')
return common_arg_parser()
def common_arg_parser():
""" """
Create an argparse.ArgumentParser for run_mujoco.py. Create an argparse.ArgumentParser for run_mujoco.py.
""" """
parser = arg_parser() parser = arg_parser()
parser.add_argument('--env', help='environment ID', type=str, default='Reacher-v2') parser.add_argument('--env', help='environment ID', type=str, default='Reacher-v2')
parser.add_argument('--seed', help='RNG seed', type=int, default=0) parser.add_argument('--seed', help='RNG seed', type=int, default=None)
parser.add_argument('--num-timesteps', type=int, default=int(1e6)) parser.add_argument('--alg', help='Algorithm', type=str, default='ppo2')
parser.add_argument('--num_timesteps', type=float, default=1e6),
parser.add_argument('--network', help='network type (mlp, cnn, lstm, cnn_lstm, conv_only)', default=None)
parser.add_argument('--gamestate', help='game state to load (so far only used in retro games)', default=None)
parser.add_argument('--num_env', help='Number of environment copies being run in parallel. When not specified, set to number of cpus for Atari, and to 1 for Mujoco', default=None, type=int)
parser.add_argument('--reward_scale', help='Reward scale factor. Default: 1.0', default=1.0, type=float)
parser.add_argument('--save_path', help='Path to save trained model to', default=None, type=str)
parser.add_argument('--play', default=False, action='store_true') parser.add_argument('--play', default=False, action='store_true')
return parser return parser
@@ -85,6 +103,24 @@ def robotics_arg_parser():
""" """
parser = arg_parser() parser = arg_parser()
parser.add_argument('--env', help='environment ID', type=str, default='FetchReach-v0') parser.add_argument('--env', help='environment ID', type=str, default='FetchReach-v0')
parser.add_argument('--seed', help='RNG seed', type=int, default=0) parser.add_argument('--seed', help='RNG seed', type=int, default=None)
parser.add_argument('--num-timesteps', type=int, default=int(1e6)) parser.add_argument('--num-timesteps', type=int, default=int(1e6))
return parser return parser
def parse_unknown_args(args):
"""
Parse arguments not consumed by arg parser into a dictionary
"""
retval = {}
for arg in args:
assert arg.startswith('--')
assert '=' in arg, 'cannot parse arg {}'.format(arg)
key = arg.split('=')[0][2:]
value = arg.split('=')[1]
retval[key] = value
return retval
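# Minimal usage sketch (hypothetical flag names, assuming parse_unknown_args as defined above).
# Values are kept as strings, so callers are expected to cast them to whatever types they need.
def _example_parse_unknown_args():
    extra = parse_unknown_args(['--lr=3e-4', '--nminibatches=4'])
    assert extra == {'lr': '3e-4', 'nminibatches': '4'}
    return extra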


@@ -85,7 +85,7 @@ class DiagGaussianPdType(PdType):
def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0): def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
mean = fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias) mean = fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias)
logstd = tf.get_variable(name='logstd', shape=[1, self.size], initializer=tf.zeros_initializer()) logstd = tf.get_variable(name='pi/logstd', shape=[1, self.size], initializer=tf.zeros_initializer())
pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
return self.pdfromflat(pdparam), mean return self.pdfromflat(pdparam), mean
@@ -143,26 +143,26 @@ class CategoricalPd(Pd):
# Note: we can't use sparse_softmax_cross_entropy_with_logits because # Note: we can't use sparse_softmax_cross_entropy_with_logits because
# the implementation does not allow second-order derivatives... # the implementation does not allow second-order derivatives...
one_hot_actions = tf.one_hot(x, self.logits.get_shape().as_list()[-1]) one_hot_actions = tf.one_hot(x, self.logits.get_shape().as_list()[-1])
return tf.nn.softmax_cross_entropy_with_logits( return tf.nn.softmax_cross_entropy_with_logits_v2(
logits=self.logits, logits=self.logits,
labels=one_hot_actions) labels=one_hot_actions)
def kl(self, other): def kl(self, other):
a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keep_dims=True) a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keepdims=True)
a1 = other.logits - tf.reduce_max(other.logits, axis=-1, keep_dims=True) a1 = other.logits - tf.reduce_max(other.logits, axis=-1, keepdims=True)
ea0 = tf.exp(a0) ea0 = tf.exp(a0)
ea1 = tf.exp(a1) ea1 = tf.exp(a1)
z0 = tf.reduce_sum(ea0, axis=-1, keep_dims=True) z0 = tf.reduce_sum(ea0, axis=-1, keepdims=True)
z1 = tf.reduce_sum(ea1, axis=-1, keep_dims=True) z1 = tf.reduce_sum(ea1, axis=-1, keepdims=True)
p0 = ea0 / z0 p0 = ea0 / z0
return tf.reduce_sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=-1) return tf.reduce_sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=-1)
def entropy(self): def entropy(self):
a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keep_dims=True) a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keepdims=True)
ea0 = tf.exp(a0) ea0 = tf.exp(a0)
z0 = tf.reduce_sum(ea0, axis=-1, keep_dims=True) z0 = tf.reduce_sum(ea0, axis=-1, keepdims=True)
p0 = ea0 / z0 p0 = ea0 / z0
return tf.reduce_sum(p0 * (tf.log(z0) - a0), axis=-1) return tf.reduce_sum(p0 * (tf.log(z0) - a0), axis=-1)
def sample(self): def sample(self):
u = tf.random_uniform(tf.shape(self.logits)) u = tf.random_uniform(tf.shape(self.logits), dtype=self.logits.dtype)
return tf.argmax(self.logits - tf.log(-tf.log(u)), axis=-1) return tf.argmax(self.logits - tf.log(-tf.log(u)), axis=-1)
@classmethod @classmethod
def fromflat(cls, flat): def fromflat(cls, flat):


@@ -1,30 +1,56 @@
import tensorflow as tf import tensorflow as tf
from gym.spaces import Discrete, Box from gym.spaces import Discrete, Box
def observation_placeholder(ob_space, batch_size=None, name='Ob'):
'''
Create a placeholder to feed observations into, with a shape appropriate to the observation space
Parameters:
----------
ob_space: gym.Space observation space
batch_size: int size of the batch to be fed into input. Can be left None in most cases.
name: str name of the placeholder
Returns:
-------
tensorflow placeholder tensor
'''
assert isinstance(ob_space, Discrete) or isinstance(ob_space, Box), \
'Can only deal with Discrete and Box observation spaces for now'
return tf.placeholder(shape=(batch_size,) + ob_space.shape, dtype=ob_space.dtype, name=name)
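# Minimal usage sketch (hypothetical 84x84x4 image observation space): the resulting placeholder
# has shape (None, 84, 84, 4) and dtype uint8, matching the space.
def _example_observation_placeholder():
    import numpy as np
    ob_space = Box(low=0, high=255, shape=(84, 84, 4), dtype=np.uint8)
    return observation_placeholder(ob_space, batch_size=None, name='Ob')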
def observation_input(ob_space, batch_size=None, name='Ob'): def observation_input(ob_space, batch_size=None, name='Ob'):
''' '''
Build observation input with encoding depending on the Create placeholder to feed observations into of the size appropriate to the observation space, and add input
observation space type encoder of the appropriate type.
Params: '''
ob_space: observation space (should be one of gym.spaces) placeholder = observation_placeholder(ob_space, batch_size, name)
batch_size: batch size for input (default is None, so that resulting input placeholder can take tensors with any batch size) return placeholder, encode_observation(ob_space, placeholder)
name: tensorflow variable name for input placeholder
returns: tuple (input_placeholder, processed_input_tensor) def encode_observation(ob_space, placeholder):
'''
Encode input in the way that is appropriate to the observation space
Parameters:
----------
ob_space: gym.Space observation space
placeholder: tf.placeholder observation input placeholder
''' '''
if isinstance(ob_space, Discrete): if isinstance(ob_space, Discrete):
input_x = tf.placeholder(shape=(batch_size,), dtype=tf.int32, name=name) return tf.to_float(tf.one_hot(placeholder, ob_space.n))
processed_x = tf.to_float(tf.one_hot(input_x, ob_space.n))
return input_x, processed_x
elif isinstance(ob_space, Box): elif isinstance(ob_space, Box):
input_shape = (batch_size,) + ob_space.shape return tf.to_float(placeholder)
input_x = tf.placeholder(shape=input_shape, dtype=ob_space.dtype, name=name)
processed_x = tf.to_float(input_x)
return input_x, processed_x
else: else:
raise NotImplementedError raise NotImplementedError


@@ -67,14 +67,21 @@ class EzPickle(object):
def set_global_seeds(i): def set_global_seeds(i):
try:
import MPI
rank = MPI.COMM_WORLD.Get_rank()
except ImportError:
rank = 0
myseed = i + 1000 * rank if i is not None else None
try: try:
import tensorflow as tf import tensorflow as tf
except ImportError: except ImportError:
pass pass
else: else:
tf.set_random_seed(i) tf.set_random_seed(myseed)
np.random.seed(i) np.random.seed(myseed)
random.seed(i) random.seed(myseed)
def pretty_eta(seconds_left): def pretty_eta(seconds_left):

baselines/common/models.py (new file)

@@ -0,0 +1,177 @@
import numpy as np
import tensorflow as tf
from baselines.a2c import utils
from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch
from baselines.common.mpi_running_mean_std import RunningMeanStd
import tensorflow.contrib.layers as layers
def nature_cnn(unscaled_images, **conv_kwargs):
"""
CNN from Nature paper.
"""
scaled_images = tf.cast(unscaled_images, tf.float32) / 255.
activ = tf.nn.relu
h = activ(conv(scaled_images, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2),
**conv_kwargs))
h2 = activ(conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), **conv_kwargs))
h3 = activ(conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), **conv_kwargs))
h3 = conv_to_fc(h3)
return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2)))
def mlp(num_layers=2, num_hidden=64, activation=tf.tanh):
"""
Simple fully connected layer policy. Separate stacks of fully-connected layers are used for policy and value function estimation.
More customized fully-connected policies can be obtained by using the PolicyWithValue class directly.
Parameters:
----------
num_layers: int number of fully-connected layers (default: 2)
num_hidden: int size of fully-connected layers (default: 64)
activation: activation function (default: tf.tanh)
Returns:
-------
function that builds fully connected network with a given input placeholder
"""
def network_fn(X):
h = tf.layers.flatten(X)
for i in range(num_layers):
h = activation(fc(h, 'mlp_fc{}'.format(i), nh=num_hidden, init_scale=np.sqrt(2)))
return h, None
return network_fn
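# Minimal usage sketch (hypothetical 4-dimensional observation input): the builder returns a
# network_fn; calling it on a placeholder yields the latent tensor and None (no recurrent state).
def _example_mlp_builder():
    network_fn = mlp(num_layers=2, num_hidden=64)
    X = tf.placeholder(tf.float32, [None, 4])
    latent, recurrent_tensors = network_fn(X)  # latent has shape (None, 64)
    return latent, recurrent_tensors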
def cnn(**conv_kwargs):
def network_fn(X):
return nature_cnn(X, **conv_kwargs), None
return network_fn
def cnn_small(**conv_kwargs):
def network_fn(X):
h = tf.cast(X, tf.float32) / 255.
activ = tf.nn.relu
h = activ(conv(h, 'c1', nf=8, rf=8, stride=4, init_scale=np.sqrt(2), **conv_kwargs))
h = activ(conv(h, 'c2', nf=16, rf=4, stride=2, init_scale=np.sqrt(2), **conv_kwargs))
h = conv_to_fc(h)
h = activ(fc(h, 'fc1', nh=128, init_scale=np.sqrt(2)))
return h, None
return network_fn
def lstm(nlstm=128, layer_norm=False):
def network_fn(X, nenv=1):
nbatch = X.shape[0]
nsteps = nbatch // nenv
h = tf.layers.flatten(X)
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
S = tf.placeholder(tf.float32, [nenv, 2*nlstm]) #states
xs = batch_to_seq(h, nenv, nsteps)
ms = batch_to_seq(M, nenv, nsteps)
if layer_norm:
h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm)
else:
h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm)
h = seq_to_batch(h5)
initial_state = np.zeros(S.shape.as_list(), dtype=float)
return h, {'S':S, 'M':M, 'state':snew, 'initial_state':initial_state}
return network_fn
def cnn_lstm(nlstm=128, layer_norm=False, **conv_kwargs):
def network_fn(X, nenv=1):
nbatch = X.shape[0]
nsteps = nbatch // nenv
h = nature_cnn(X, **conv_kwargs)
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
S = tf.placeholder(tf.float32, [nenv, 2*nlstm]) #states
xs = batch_to_seq(h, nenv, nsteps)
ms = batch_to_seq(M, nenv, nsteps)
if layer_norm:
h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm)
else:
h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm)
h = seq_to_batch(h5)
initial_state = np.zeros(S.shape.as_list(), dtype=float)
return h, {'S':S, 'M':M, 'state':snew, 'initial_state':initial_state}
return network_fn
def cnn_lnlstm(nlstm=128, **conv_kwargs):
return cnn_lstm(nlstm, layer_norm=True, **conv_kwargs)
def conv_only(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], **conv_kwargs):
'''
convolutions-only net
Parameters:
----------
convs: list of triples (filter_number, filter_size, stride) specifying parameters for each layer.
Returns:
function that takes tensorflow tensor as input and returns the output of the last convolutional layer
'''
def network_fn(X):
out = tf.cast(X, tf.float32) / 255.
with tf.variable_scope("convnet"):
for num_outputs, kernel_size, stride in convs:
out = layers.convolution2d(out,
num_outputs=num_outputs,
kernel_size=kernel_size,
stride=stride,
activation_fn=tf.nn.relu,
**conv_kwargs)
return out, None
return network_fn
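# Minimal usage sketch (hypothetical 84x84x4 uint8 image input): the output keeps its spatial
# dimensions, so downstream code typically flattens it before any fully-connected layers.
def _example_conv_only():
    network_fn = conv_only(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)])
    X = tf.placeholder(tf.uint8, [None, 84, 84, 4])
    out, _ = network_fn(X)  # (None, H, W, 64) feature map from the last convolutional layer
    return out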
def _normalize_clip_observation(x, clip_range=[-5.0, 5.0]):
rms = RunningMeanStd(shape=x.shape[1:])
norm_x = tf.clip_by_value((x - rms.mean) / rms.std, min(clip_range), max(clip_range))
return norm_x, rms
def get_network_builder(name):
# TODO: replace with reflection?
if name == 'cnn':
return cnn
elif name == 'cnn_small':
return cnn_small
elif name == 'conv_only':
return conv_only
elif name == 'mlp':
return mlp
elif name == 'lstm':
return lstm
elif name == 'cnn_lstm':
return cnn_lstm
elif name == 'cnn_lnlstm':
return cnn_lnlstm
else:
raise ValueError('Unknown network type: {}'.format(name))


@@ -0,0 +1,31 @@
import numpy as np
import tensorflow as tf
from mpi4py import MPI
class MpiAdamOptimizer(tf.train.AdamOptimizer):
"""Adam optimizer that averages gradients across mpi processes."""
def __init__(self, comm, **kwargs):
self.comm = comm
tf.train.AdamOptimizer.__init__(self, **kwargs)
def compute_gradients(self, loss, var_list, **kwargs):
grads_and_vars = tf.train.AdamOptimizer.compute_gradients(self, loss, var_list, **kwargs)
grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None]
flat_grad = tf.concat([tf.reshape(g, (-1,)) for g, v in grads_and_vars], axis=0)
shapes = [v.shape.as_list() for g, v in grads_and_vars]
sizes = [int(np.prod(s)) for s in shapes]
num_tasks = self.comm.Get_size()
buf = np.zeros(sum(sizes), np.float32)
def _collect_grads(flat_grad):
self.comm.Allreduce(flat_grad, buf, op=MPI.SUM)
np.divide(buf, float(num_tasks), out=buf)
return buf
avg_flat_grad = tf.py_func(_collect_grads, [flat_grad], tf.float32)
avg_flat_grad.set_shape(flat_grad.shape)
avg_grads = tf.split(avg_flat_grad, sizes, axis=0)
avg_grads_and_vars = [(tf.reshape(g, v.shape), v)
for g, (_, v) in zip(avg_grads, grads_and_vars)]
return avg_grads_and_vars
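# Minimal usage sketch (hypothetical loss and variable list, assuming the script is launched
# under MPI, e.g. `mpirun -np 2 python ...`): every rank computes local gradients, the optimizer
# averages them across ranks, and each rank applies the same averaged update.
def _example_mpi_adam_update(loss, var_list):
    opt = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=3e-4, epsilon=1e-5)
    grads_and_vars = opt.compute_gradients(loss, var_list)
    return opt.apply_gradients(grads_and_vars)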


@@ -0,0 +1,101 @@
from collections import defaultdict
from mpi4py import MPI
import os, numpy as np
import platform
import shutil
import subprocess
def sync_from_root(sess, variables, comm=None):
"""
Send the root node's parameters to every worker.
Arguments:
sess: the TensorFlow session.
variables: all parameter variables including optimizer's
"""
if comm is None: comm = MPI.COMM_WORLD
rank = comm.Get_rank()
for var in variables:
if rank == 0:
comm.Bcast(sess.run(var))
else:
import tensorflow as tf
returned_var = np.empty(var.shape, dtype='float32')
comm.Bcast(returned_var)
sess.run(tf.assign(var, returned_var))
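# Minimal usage sketch (assuming the variables have already been created in `sess`): syncing
# tf.global_variables() also covers optimizer statistics, so all ranks start from identical state.
def _example_sync_from_root(sess):
    import tensorflow as tf
    sync_from_root(sess, tf.global_variables(), comm=MPI.COMM_WORLD)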
def gpu_count():
"""
Count the GPUs on this machine.
"""
if shutil.which('nvidia-smi') is None:
return 0
output = subprocess.check_output(['nvidia-smi', '--query-gpu=gpu_name', '--format=csv'])
return max(0, len(output.split(b'\n')) - 2)
def setup_mpi_gpus():
"""
Set CUDA_VISIBLE_DEVICES using MPI.
"""
num_gpus = gpu_count()
if num_gpus == 0:
return
local_rank, _ = get_local_rank_size(MPI.COMM_WORLD)
os.environ['CUDA_VISIBLE_DEVICES'] = str(local_rank % num_gpus)
def get_local_rank_size(comm):
"""
Returns the rank of each process on its machine
The processes on a given machine will be assigned ranks
0, 1, 2, ..., N-1,
where N is the number of processes on this machine.
Useful if you want to assign one gpu per machine
"""
this_node = platform.node()
ranks_nodes = comm.allgather((comm.Get_rank(), this_node))
node2rankssofar = defaultdict(int)
local_rank = None
for (rank, node) in ranks_nodes:
if rank == comm.Get_rank():
local_rank = node2rankssofar[node]
node2rankssofar[node] += 1
assert local_rank is not None
return local_rank, node2rankssofar[this_node]
def share_file(comm, path):
"""
Copies the file from rank 0 to all other ranks
Puts it in the same place on all machines
"""
localrank, _ = get_local_rank_size(comm)
if comm.Get_rank() == 0:
with open(path, 'rb') as fh:
data = fh.read()
comm.bcast(data)
else:
data = comm.bcast(None)
if localrank == 0:
os.makedirs(os.path.dirname(path), exist_ok=True)
with open(path, 'wb') as fh:
fh.write(data)
comm.Barrier()
def dict_gather(comm, d, op='mean', assert_all_have_data=True):
if comm is None: return d
alldicts = comm.allgather(d)
size = comm.size
k2li = defaultdict(list)
for d in alldicts:
for (k,v) in d.items():
k2li[k].append(v)
result = {}
for (k,li) in k2li.items():
if assert_all_have_data:
assert len(li)==size, "only %i out of %i MPI workers have sent '%s'" % (len(li), size, k)
if op=='mean':
result[k] = np.mean(li, axis=0)
elif op=='sum':
result[k] = np.sum(li, axis=0)
else:
assert 0, op
return result
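# Minimal usage sketch (hypothetical per-rank statistics): dict_gather is a collective call,
# so every rank in `comm` must reach it with its own dictionary of values.
def _example_dict_gather(comm, episode_reward, episode_length):
    stats = {'eprewmean': episode_reward, 'eplenmean': episode_length}
    return dict_gather(comm, stats, op='mean')  # each value averaged over all ranks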


@@ -0,0 +1,179 @@
import tensorflow as tf
from baselines.common import tf_util
from baselines.a2c.utils import fc
from baselines.common.distributions import make_pdtype
from baselines.common.input import observation_placeholder, encode_observation
from baselines.common.tf_util import adjust_shape
from baselines.common.mpi_running_mean_std import RunningMeanStd
from baselines.common.models import get_network_builder
import gym
class PolicyWithValue(object):
"""
Encapsulates fields and methods for RL policy and value function estimation with shared parameters
"""
def __init__(self, env, observations, latent, estimate_q=False, vf_latent=None, sess=None, **tensors):
"""
Parameters:
----------
env RL environment
observations tensorflow placeholder in which the observations will be fed
latent latent state from which policy distribution parameters should be inferred
vf_latent latent state from which value function should be inferred (if None, then latent is used)
sess tensorflow session to run calculations in (if None, default session is used)
**tensors tensorflow tensors for additional attributes such as state or mask
"""
self.X = observations
self.state = tf.constant([])
self.initial_state = None
self.__dict__.update(tensors)
vf_latent = vf_latent if vf_latent is not None else latent
vf_latent = tf.layers.flatten(vf_latent)
latent = tf.layers.flatten(latent)
self.pdtype = make_pdtype(env.action_space)
self.pd, self.pi = self.pdtype.pdfromlatent(latent, init_scale=0.01)
self.action = self.pd.sample()
self.neglogp = self.pd.neglogp(self.action)
self.sess = sess
if estimate_q:
assert isinstance(env.action_space, gym.spaces.Discrete)
self.q = fc(vf_latent, 'q', env.action_space.n)
self.vf = self.q
else:
self.vf = fc(vf_latent, 'vf', 1)
self.vf = self.vf[:,0]
def _evaluate(self, variables, observation, **extra_feed):
sess = self.sess or tf.get_default_session()
feed_dict = {self.X: adjust_shape(self.X, observation)}
for inpt_name, data in extra_feed.items():
if inpt_name in self.__dict__.keys():
inpt = self.__dict__[inpt_name]
if isinstance(inpt, tf.Tensor) and inpt._op.type == 'Placeholder':
feed_dict[inpt] = adjust_shape(inpt, data)
return sess.run(variables, feed_dict)
def step(self, observation, **extra_feed):
"""
Compute next action(s) given the observation(s)
Parameters:
----------
observation observation data (either single or a batch)
**extra_feed additional data such as state or mask (names of the arguments should match the ones in constructor, see __init__)
Returns:
-------
(action, value estimate, next state, negative log likelihood of the action under current policy parameters) tuple
"""
a, v, state, neglogp = self._evaluate([self.action, self.vf, self.state, self.neglogp], observation, **extra_feed)
if state.size == 0:
state = None
return a, v, state, neglogp
def value(self, ob, *args, **kwargs):
"""
Compute value estimate(s) given the observation(s)
Parameters:
----------
observation observation data (either single or a batch)
**extra_feed additional data such as state or mask (names of the arguments should match the ones in constructor, see __init__)
Returns:
-------
value estimate
"""
return self._evaluate(self.vf, ob, *args, **kwargs)
def save(self, save_path):
tf_util.save_state(save_path, sess=self.sess)
def load(self, load_path):
tf_util.load_state(load_path, sess=self.sess)
def build_policy(env, policy_network, value_network=None, normalize_observations=False, estimate_q=False, **policy_kwargs):
if isinstance(policy_network, str):
network_type = policy_network
policy_network = get_network_builder(network_type)(**policy_kwargs)
def policy_fn(nbatch=None, nsteps=None, sess=None, observ_placeholder=None):
ob_space = env.observation_space
X = observ_placeholder if observ_placeholder is not None else observation_placeholder(ob_space, batch_size=nbatch)
extra_tensors = {}
if normalize_observations and X.dtype == tf.float32:
encoded_x, rms = _normalize_clip_observation(X)
extra_tensors['rms'] = rms
else:
encoded_x = X
encoded_x = encode_observation(ob_space, encoded_x)
with tf.variable_scope('pi', reuse=tf.AUTO_REUSE):
policy_latent, recurrent_tensors = policy_network(encoded_x)
if recurrent_tensors is not None:
# recurrent architecture, need a few more steps
nenv = nbatch // nsteps
assert nenv > 0, 'Bad input for recurrent policy: batch size {} smaller than nsteps {}'.format(nbatch, nsteps)
policy_latent, recurrent_tensors = policy_network(encoded_x, nenv)
extra_tensors.update(recurrent_tensors)
_v_net = value_network
if _v_net is None or _v_net == 'shared':
vf_latent = policy_latent
else:
if _v_net == 'copy':
_v_net = policy_network
else:
assert callable(_v_net)
with tf.variable_scope('vf', reuse=tf.AUTO_REUSE):
vf_latent, _ = _v_net(encoded_x)
policy = PolicyWithValue(
env=env,
observations=X,
latent=policy_latent,
vf_latent=vf_latent,
sess=sess,
estimate_q=estimate_q,
**extra_tensors
)
return policy
return policy_fn
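# Minimal usage sketch (hypothetical vectorized env `venv` with 4 parallel copies, an 'mlp'
# network and an existing session): algorithms call the returned policy_fn to build their
# step/train models, then use .step() to get actions, values and neglogp.
def _example_build_policy(venv, sess):
    policy_fn = build_policy(venv, 'mlp', value_network='copy')
    with tf.variable_scope('example_model', reuse=tf.AUTO_REUSE):
        step_policy = policy_fn(nbatch=4, nsteps=1, sess=sess)
    sess.run(tf.global_variables_initializer())
    actions, values, state, neglogp = step_policy.step(venv.reset())
    return actions, values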
def _normalize_clip_observation(x, clip_range=[-5.0, 5.0]):
rms = RunningMeanStd(shape=x.shape[1:])
norm_x = tf.clip_by_value((x - rms.mean) / rms.std, min(clip_range), max(clip_range))
return norm_x, rms


@@ -0,0 +1,293 @@
# flake8: noqa F403, F405
from .atari_wrappers import *
import numpy as np
import gym
class TimeLimit(gym.Wrapper):
def __init__(self, env, max_episode_steps=None):
super(TimeLimit, self).__init__(env)
self._max_episode_steps = max_episode_steps
self._elapsed_steps = 0
def step(self, ac):
observation, reward, done, info = self.env.step(ac)
self._elapsed_steps += 1
if self._elapsed_steps >= self._max_episode_steps:
done = True
info['TimeLimit.truncated'] = True
return observation, reward, done, info
def reset(self, **kwargs):
self._elapsed_steps = 0
return self.env.reset(**kwargs)
class StochasticFrameSkip(gym.Wrapper):
def __init__(self, env, n, stickprob):
gym.Wrapper.__init__(self, env)
self.n = n
self.stickprob = stickprob
self.curac = None
self.rng = np.random.RandomState()
self.supports_want_render = hasattr(env, "supports_want_render")
def reset(self, **kwargs):
self.curac = None
return self.env.reset(**kwargs)
def step(self, ac):
done = False
totrew = 0
for i in range(self.n):
# First step after reset, use action
if self.curac is None:
self.curac = ac
# First substep, delay with probability=stickprob
elif i==0:
if self.rng.rand() > self.stickprob:
self.curac = ac
# Second substep, new action definitely kicks in
elif i==1:
self.curac = ac
if self.supports_want_render and i<self.n-1:
ob, rew, done, info = self.env.step(self.curac, want_render=False)
else:
ob, rew, done, info = self.env.step(self.curac)
totrew += rew
if done: break
return ob, totrew, done, info
def seed(self, s):
self.rng.seed(s)
class PartialFrameStack(gym.Wrapper):
def __init__(self, env, k, channel=1):
"""
Stack one channel (channel keyword) from previous frames
"""
gym.Wrapper.__init__(self, env)
shp = env.observation_space.shape
self.channel = channel
self.observation_space = gym.spaces.Box(low=0, high=255,
shape=(shp[0], shp[1], shp[2] + k - 1),
dtype=env.observation_space.dtype)
self.k = k
self.frames = deque([], maxlen=k)
shp = env.observation_space.shape
def reset(self):
ob = self.env.reset()
assert ob.shape[2] > self.channel
for _ in range(self.k):
self.frames.append(ob)
return self._get_ob()
def step(self, ac):
ob, reward, done, info = self.env.step(ac)
self.frames.append(ob)
return self._get_ob(), reward, done, info
def _get_ob(self):
assert len(self.frames) == self.k
return np.concatenate([frame if i==self.k-1 else frame[:,:,self.channel:self.channel+1]
for (i, frame) in enumerate(self.frames)], axis=2)
class Downsample(gym.ObservationWrapper):
def __init__(self, env, ratio):
"""
Downsample images by a factor of ratio
"""
gym.ObservationWrapper.__init__(self, env)
(oldh, oldw, oldc) = env.observation_space.shape
newshape = (oldh//ratio, oldw//ratio, oldc)
self.observation_space = spaces.Box(low=0, high=255,
shape=newshape, dtype=np.uint8)
def observation(self, frame):
height, width, _ = self.observation_space.shape
frame = cv2.resize(frame, (width, height), interpolation=cv2.INTER_AREA)
if frame.ndim == 2:
frame = frame[:,:,None]
return frame
class Rgb2gray(gym.ObservationWrapper):
def __init__(self, env):
"""
Convert RGB images to grayscale
"""
gym.ObservationWrapper.__init__(self, env)
(oldh, oldw, _oldc) = env.observation_space.shape
self.observation_space = spaces.Box(low=0, high=255,
shape=(oldh, oldw, 1), dtype=np.uint8)
def observation(self, frame):
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
return frame[:,:,None]
class MovieRecord(gym.Wrapper):
def __init__(self, env, savedir, k):
gym.Wrapper.__init__(self, env)
self.savedir = savedir
self.k = k
self.epcount = 0
def reset(self):
if self.epcount % self.k == 0:
print('saving movie this episode', self.savedir)
self.env.unwrapped.movie_path = self.savedir
else:
print('not saving this episode')
self.env.unwrapped.movie_path = None
self.env.unwrapped.movie = None
self.epcount += 1
return self.env.reset()
class AppendTimeout(gym.Wrapper):
def __init__(self, env):
gym.Wrapper.__init__(self, env)
self.action_space = env.action_space
self.timeout_space = gym.spaces.Box(low=np.array([0.0]), high=np.array([1.0]), dtype=np.float32)
self.original_os = env.observation_space
if isinstance(self.original_os, gym.spaces.Dict):
import copy
ordered_dict = copy.deepcopy(self.original_os.spaces)
ordered_dict['value_estimation_timeout'] = self.timeout_space
self.observation_space = gym.spaces.Dict(ordered_dict)
self.dict_mode = True
else:
self.observation_space = gym.spaces.Dict({
'original': self.original_os,
'value_estimation_timeout': self.timeout_space
})
self.dict_mode = False
self.ac_count = None
while 1:
if not hasattr(env, "_max_episode_steps"): # Looking for TimeLimit wrapper that has this field
env = env.env
continue
break
self.timeout = env._max_episode_steps
def step(self, ac):
self.ac_count += 1
ob, rew, done, info = self.env.step(ac)
return self._process(ob), rew, done, info
def reset(self):
self.ac_count = 0
return self._process(self.env.reset())
def _process(self, ob):
fracmissing = 1 - self.ac_count / self.timeout
if self.dict_mode:
ob['value_estimation_timeout'] = fracmissing
return ob
else:
return { 'original': ob, 'value_estimation_timeout': fracmissing }
class StartDoingRandomActionsWrapper(gym.Wrapper):
"""
Warning: can eat info dicts, not good if you depend on them
"""
def __init__(self, env, max_random_steps, on_startup=True, every_episode=False):
gym.Wrapper.__init__(self, env)
self.on_startup = on_startup
self.every_episode = every_episode
self.random_steps = max_random_steps
self.last_obs = None
if on_startup:
self.some_random_steps()
def some_random_steps(self):
self.last_obs = self.env.reset()
n = np.random.randint(self.random_steps)
#print("running for random %i frames" % n)
for _ in range(n):
self.last_obs, _, done, _ = self.env.step(self.env.action_space.sample())
if done: self.last_obs = self.env.reset()
def reset(self):
return self.last_obs
def step(self, a):
self.last_obs, rew, done, info = self.env.step(a)
if done:
self.last_obs = self.env.reset()
if self.every_episode:
self.some_random_steps()
return self.last_obs, rew, done, info
def make_retro(*, game, state, max_episode_steps, **kwargs):
import retro
env = retro.make(game, state, **kwargs)
env = StochasticFrameSkip(env, n=4, stickprob=0.25)
if max_episode_steps is not None:
env = TimeLimit(env, max_episode_steps=max_episode_steps)
return env
def wrap_deepmind_retro(env, scale=True, frame_stack=4):
"""
Configure environment for retro games, using config similar to DeepMind-style Atari in wrap_deepmind
"""
env = WarpFrame(env)
env = ClipRewardEnv(env)
env = FrameStack(env, frame_stack)
if scale:
env = ScaledFloatFrame(env)
return env
class SonicDiscretizer(gym.ActionWrapper):
"""
Wrap a gym-retro environment and make it use discrete
actions for the Sonic game.
"""
def __init__(self, env):
super(SonicDiscretizer, self).__init__(env)
buttons = ["B", "A", "MODE", "START", "UP", "DOWN", "LEFT", "RIGHT", "C", "Y", "X", "Z"]
actions = [['LEFT'], ['RIGHT'], ['LEFT', 'DOWN'], ['RIGHT', 'DOWN'], ['DOWN'],
['DOWN', 'B'], ['B']]
self._actions = []
for action in actions:
arr = np.array([False] * 12)
for button in action:
arr[buttons.index(button)] = True
self._actions.append(arr)
self.action_space = gym.spaces.Discrete(len(self._actions))
def action(self, a): # pylint: disable=W0221
return self._actions[a].copy()
class RewardScaler(gym.RewardWrapper):
"""
Bring rewards to a reasonable scale for PPO.
This is incredibly important and affects performance
drastically.
"""
def __init__(self, env, scale=0.01):
super(RewardScaler, self).__init__(env)
self.scale = scale
def reward(self, reward):
return reward * self.scale
class AllowBacktracking(gym.Wrapper):
"""
Use deltas in max(X) as the reward, rather than deltas
in X. This way, agents are not discouraged too heavily
from exploring backwards if there is no way to advance
head-on in the level.
"""
def __init__(self, env):
super(AllowBacktracking, self).__init__(env)
self._cur_x = 0
self._max_x = 0
def reset(self, **kwargs): # pylint: disable=E0202
self._cur_x = 0
self._max_x = 0
return self.env.reset(**kwargs)
def step(self, action): # pylint: disable=E0202
obs, rew, done, info = self.env.step(action)
self._cur_x += rew
rew = max(0, self._cur_x - self._max_x)
self._max_x = max(self._max_x, self._cur_x)
return obs, rew, done, info
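# Minimal sketch of composing the wrappers above into a training env (hypothetical game and
# state names; requires gym-retro and the corresponding ROM to be installed):
def _example_retro_pipeline():
    env = make_retro(game='SonicTheHedgehog-Genesis', state='GreenHillZone.Act1', max_episode_steps=4500)
    env = wrap_deepmind_retro(env, scale=False, frame_stack=4)
    env = SonicDiscretizer(env)   # collapse the 12-button pad into a few useful combos
    env = RewardScaler(env, scale=0.01)
    env = AllowBacktracking(env)  # reward progress in max(x) rather than raw x deltas
    return env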


@@ -5,7 +5,7 @@ class AbstractEnvRunner(ABC):
def __init__(self, *, env, model, nsteps): def __init__(self, *, env, model, nsteps):
self.env = env self.env = env
self.model = model self.model = model
nenv = env.num_envs self.nenv = nenv = env.num_envs if hasattr(env, 'num_envs') else 1
self.batch_ob_shape = (nenv*nsteps,) + env.observation_space.shape self.batch_ob_shape = (nenv*nsteps,) + env.observation_space.shape
self.obs = np.zeros((nenv,) + env.observation_space.shape, dtype=env.observation_space.dtype.name) self.obs = np.zeros((nenv,) + env.observation_space.shape, dtype=env.observation_space.dtype.name)
self.obs[:] = env.reset() self.obs[:] = env.reset()
@@ -16,3 +16,4 @@ class AbstractEnvRunner(ABC):
@abstractmethod @abstractmethod
def run(self): def run(self):
raise NotImplementedError raise NotImplementedError


@@ -1,4 +1,7 @@
import tensorflow as tf
import numpy as np import numpy as np
from baselines.common.tf_util import get_session
class RunningMeanStd(object): class RunningMeanStd(object):
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
def __init__(self, epsilon=1e-4, shape=()): def __init__(self, epsilon=1e-4, shape=()):
@@ -13,20 +16,71 @@ class RunningMeanStd(object):
self.update_from_moments(batch_mean, batch_var, batch_count) self.update_from_moments(batch_mean, batch_var, batch_count)
def update_from_moments(self, batch_mean, batch_var, batch_count): def update_from_moments(self, batch_mean, batch_var, batch_count):
delta = batch_mean - self.mean self.mean, self.var, self.count = update_mean_var_count_from_moments(
tot_count = self.count + batch_count self.mean, self.var, self.count, batch_mean, batch_var, batch_count)
new_mean = self.mean + delta * batch_count / tot_count def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var, batch_count):
m_a = self.var * (self.count) delta = batch_mean - mean
m_b = batch_var * (batch_count) tot_count = count + batch_count
M2 = m_a + m_b + np.square(delta) * self.count * batch_count / (self.count + batch_count)
new_var = M2 / (self.count + batch_count) new_mean = mean + delta * batch_count / tot_count
m_a = var * count
m_b = batch_var * batch_count
M2 = m_a + m_b + np.square(delta) * count * batch_count / (count + batch_count)
new_var = M2 / (count + batch_count)
new_count = batch_count + count
return new_mean, new_var, new_count
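# Minimal worked example (hypothetical batches): combining the moments of [1, 2] and [3, 4, 5]
# with the parallel update above must reproduce the moments of [1, 2, 3, 4, 5].
def _example_update_mean_var_count():
    x1, x2 = np.array([1.0, 2.0]), np.array([3.0, 4.0, 5.0])
    mean, var, count = update_mean_var_count_from_moments(
        x1.mean(), x1.var(), x1.size, x2.mean(), x2.var(), x2.size)
    x = np.concatenate([x1, x2])
    np.testing.assert_allclose([mean, var, count], [x.mean(), x.var(), x.size])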
class TfRunningMeanStd(object):
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
'''
TensorFlow variables-based implementation of computing running mean and std
Benefit of this implementation is that it can be saved / loaded together with the tensorflow model
'''
def __init__(self, epsilon=1e-4, shape=(), scope=''):
sess = get_session()
self._new_mean = tf.placeholder(shape=shape, dtype=tf.float64)
self._new_var = tf.placeholder(shape=shape, dtype=tf.float64)
self._new_count = tf.placeholder(shape=(), dtype=tf.float64)
with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
self._mean = tf.get_variable('mean', initializer=np.zeros(shape, 'float64'), dtype=tf.float64)
self._var = tf.get_variable('std', initializer=np.ones(shape, 'float64'), dtype=tf.float64)
self._count = tf.get_variable('count', initializer=np.full((), epsilon, 'float64'), dtype=tf.float64)
self.update_ops = tf.group([
self._var.assign(self._new_var),
self._mean.assign(self._new_mean),
self._count.assign(self._new_count)
])
sess.run(tf.variables_initializer([self._mean, self._var, self._count]))
self.sess = sess
self._set_mean_var_count()
def _set_mean_var_count(self):
self.mean, self.var, self.count = self.sess.run([self._mean, self._var, self._count])
def update(self, x):
batch_mean = np.mean(x, axis=0)
batch_var = np.var(x, axis=0)
batch_count = x.shape[0]
new_mean, new_var, new_count = update_mean_var_count_from_moments(self.mean, self.var, self.count, batch_mean, batch_var, batch_count)
self.sess.run(self.update_ops, feed_dict={
self._new_mean: new_mean,
self._new_var: new_var,
self._new_count: new_count
})
self._set_mean_var_count()
new_count = batch_count + self.count
self.mean = new_mean
self.var = new_var
self.count = new_count
def test_runningmeanstd(): def test_runningmeanstd():
for (x1, x2, x3) in [ for (x1, x2, x3) in [
@@ -43,4 +97,91 @@ def test_runningmeanstd():
rms.update(x3) rms.update(x3)
ms2 = [rms.mean, rms.var] ms2 = [rms.mean, rms.var]
assert np.allclose(ms1, ms2) np.testing.assert_allclose(ms1, ms2)
def test_tf_runningmeanstd():
for (x1, x2, x3) in [
(np.random.randn(3), np.random.randn(4), np.random.randn(5)),
(np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)),
]:
rms = TfRunningMeanStd(epsilon=0.0, shape=x1.shape[1:], scope='running_mean_std' + str(np.random.randint(0, 128)))
x = np.concatenate([x1, x2, x3], axis=0)
ms1 = [x.mean(axis=0), x.var(axis=0)]
rms.update(x1)
rms.update(x2)
rms.update(x3)
ms2 = [rms.mean, rms.var]
np.testing.assert_allclose(ms1, ms2)
def profile_tf_runningmeanstd():
import time
from baselines.common import tf_util
tf_util.get_session( config=tf.ConfigProto(
inter_op_parallelism_threads=1,
intra_op_parallelism_threads=1,
allow_soft_placement=True
))
x = np.random.random((376,))
n_trials = 10000
rms = RunningMeanStd()
tfrms = TfRunningMeanStd()
tic1 = time.time()
for _ in range(n_trials):
rms.update(x)
tic2 = time.time()
for _ in range(n_trials):
tfrms.update(x)
tic3 = time.time()
print('rms update time ({} trials): {} s'.format(n_trials, tic2 - tic1))
print('tfrms update time ({} trials): {} s'.format(n_trials, tic3 - tic2))
tic1 = time.time()
for _ in range(n_trials):
z1 = rms.mean
tic2 = time.time()
for _ in range(n_trials):
z2 = tfrms.mean
assert z1 == z2
tic3 = time.time()
print('rms get mean time ({} trials): {} s'.format(n_trials, tic2 - tic1))
print('tfrms get mean time ({} trials): {} s'.format(n_trials, tic3 - tic2))
'''
options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) #pylint: disable=E1101
run_metadata = tf.RunMetadata()
profile_opts = dict(options=options, run_metadata=run_metadata)
from tensorflow.python.client import timeline
fetched_timeline = timeline.Timeline(run_metadata.step_stats) #pylint: disable=E1101
chrome_trace = fetched_timeline.generate_chrome_trace_format()
outfile = '/tmp/timeline.json'
with open(outfile, 'wt') as f:
f.write(chrome_trace)
print(f'Successfully saved profile to {outfile}. Exiting.')
exit(0)
'''
if __name__ == '__main__':
profile_tf_runningmeanstd()


@@ -1,44 +0,0 @@
import pytest
import tensorflow as tf
import random
import numpy as np
from gym.spaces import np_random
from baselines.a2c import a2c
from baselines.ppo2 import ppo2
from baselines.common.identity_env import IdentityEnv
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.ppo2.policies import MlpPolicy
learn_func_list = [
lambda e: a2c.learn(policy=MlpPolicy, env=e, seed=0, total_timesteps=50000),
lambda e: ppo2.learn(policy=MlpPolicy, env=e, total_timesteps=50000, lr=1e-3, nsteps=128, ent_coef=0.01)
]
@pytest.mark.slow
@pytest.mark.parametrize("learn_func", learn_func_list)
def test_identity(learn_func):
'''
Test if the algorithm (with a given policy)
can learn an identity transformation (i.e. return observation as an action)
'''
np.random.seed(0)
np_random.seed(0)
random.seed(0)
env = DummyVecEnv([lambda: IdentityEnv(10)])
with tf.Graph().as_default(), tf.Session().as_default():
tf.set_random_seed(0)
model = learn_func(env)
N_TRIALS = 1000
sum_rew = 0
obs = env.reset()
for i in range(N_TRIALS):
obs, rew, done, _ = env.step(model.step(obs)[0])
sum_rew += rew
assert sum_rew > 0.9 * N_TRIALS


@@ -0,0 +1,44 @@
import numpy as np
from gym import Env
from gym.spaces import Discrete
class FixedSequenceEnv(Env):
def __init__(
self,
n_actions=10,
seed=0,
episode_len=100
):
self.np_random = np.random.RandomState()
self.np_random.seed(seed)
self.sequence = [self.np_random.randint(0, n_actions-1) for _ in range(episode_len)]
self.action_space = Discrete(n_actions)
self.observation_space = Discrete(1)
self.episode_len = episode_len
self.time = 0
self.reset()
def reset(self):
self.time = 0
return 0
def step(self, actions):
rew = self._get_reward(actions)
self._choose_next_state()
done = False
if self.episode_len and self.time >= self.episode_len:
rew = 0
done = True
return 0, rew, done, {}
def _choose_next_state(self):
self.time += 1
def _get_reward(self, actions):
return 1 if actions == self.sequence[self.time] else 0


@@ -0,0 +1,70 @@
import numpy as np
from abc import abstractmethod
from gym import Env
from gym.spaces import Discrete, Box
class IdentityEnv(Env):
def __init__(
self,
episode_len=None
):
self.episode_len = episode_len
self.time = 0
self.reset()
def reset(self):
self._choose_next_state()
self.time = 0
self.observation_space = self.action_space
return self.state
def step(self, actions):
rew = self._get_reward(actions)
self._choose_next_state()
done = False
if self.episode_len and self.time >= self.episode_len:
rew = 0
done = True
return self.state, rew, done, {}
def _choose_next_state(self):
self.state = self.action_space.sample()
self.time += 1
@abstractmethod
def _get_reward(self, actions):
raise NotImplementedError
class DiscreteIdentityEnv(IdentityEnv):
def __init__(
self,
dim,
episode_len=None,
):
self.action_space = Discrete(dim)
super().__init__(episode_len=episode_len)
def _get_reward(self, actions):
return 1 if self.state == actions else 0
class BoxIdentityEnv(IdentityEnv):
def __init__(
self,
shape,
episode_len=None,
):
self.action_space = Box(low=-1.0, high=1.0, shape=shape)
super().__init__(episode_len=episode_len)
def _get_reward(self, actions):
diff = actions - self.state
diff = diff[:]
return -0.5 * np.dot(diff, diff)


@@ -0,0 +1,70 @@
import os.path as osp
import numpy as np
import tempfile
import filelock
from gym import Env
from gym.spaces import Discrete, Box
class MnistEnv(Env):
def __init__(
self,
seed=0,
episode_len=None,
no_images=None
):
from tensorflow.examples.tutorials.mnist import input_data
# we could use temporary directory for this with a context manager and
TemporaryDirectory, but then each test that uses mnist would re-download the data
# this way the data is not cleaned up, but we only download it once per machine
mnist_path = osp.join(tempfile.gettempdir(), 'MNIST_data')
with filelock.FileLock(mnist_path + '.lock'):
self.mnist = input_data.read_data_sets(mnist_path)
self.np_random = np.random.RandomState()
self.np_random.seed(seed)
self.observation_space = Box(low=0.0, high=1.0, shape=(28,28,1))
self.action_space = Discrete(10)
self.episode_len = episode_len
self.time = 0
self.no_images = no_images
self.train_mode()
self.reset()
def reset(self):
self._choose_next_state()
self.time = 0
return self.state[0]
def step(self, actions):
rew = self._get_reward(actions)
self._choose_next_state()
done = False
if self.episode_len and self.time >= self.episode_len:
rew = 0
done = True
return self.state[0], rew, done, {}
def train_mode(self):
self.dataset = self.mnist.train
def test_mode(self):
self.dataset = self.mnist.test
def _choose_next_state(self):
max_index = (self.no_images if self.no_images is not None else self.dataset.num_examples) - 1
index = self.np_random.randint(0, max_index)
image = self.dataset.images[index].reshape(28,28,1)*255
label = self.dataset.labels[index]
self.state = (image, label)
self.time += 1
def _get_reward(self, actions):
return 1 if self.state[1] == actions else 0


@@ -0,0 +1,40 @@
import pytest
import gym
from baselines.run import get_learn_function
from baselines.common.tests.util import reward_per_episode_test
common_kwargs = dict(
total_timesteps=30000,
network='mlp',
gamma=1.0,
seed=0,
)
learn_kwargs = {
'a2c' : dict(nsteps=32, value_network='copy', lr=0.05),
'acktr': dict(nsteps=32, value_network='copy'),
'deepq': {},
'ppo2': dict(value_network='copy'),
'trpo_mpi': {}
}
@pytest.mark.slow
@pytest.mark.parametrize("alg", learn_kwargs.keys())
def test_cartpole(alg):
'''
Test if the algorithm (with an mlp policy)
can learn to balance the cartpole
'''
kwargs = common_kwargs.copy()
kwargs.update(learn_kwargs[alg])
learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs)
def env_fn():
env = gym.make('CartPole-v0')
env.seed(0)
return env
reward_per_episode_test(env_fn, learn_fn, 100)


@@ -0,0 +1,51 @@
import pytest
from baselines.common.tests.envs.fixed_sequence_env import FixedSequenceEnv
from baselines.common.tests.util import simple_test
from baselines.run import get_learn_function
common_kwargs = dict(
seed=0,
total_timesteps=50000,
)
learn_kwargs = {
'a2c': {},
'ppo2': dict(nsteps=10, ent_coef=0.0, nminibatches=1),
# TODO enable sequential models for trpo_mpi (proper handling of nbatch and nsteps)
# github issue: https://github.com/openai/baselines/issues/188
# 'trpo_mpi': lambda e, p: trpo_mpi.learn(policy_fn=p(env=e), env=e, max_timesteps=30000, timesteps_per_batch=100, cg_iters=10, gamma=0.9, lam=1.0, max_kl=0.001)
}
alg_list = learn_kwargs.keys()
rnn_list = ['lstm']
@pytest.mark.slow
@pytest.mark.parametrize("alg", alg_list)
@pytest.mark.parametrize("rnn", rnn_list)
def test_fixed_sequence(alg, rnn):
'''
Test if the algorithm (with a recurrent policy)
can learn to reproduce a fixed sequence of actions
'''
kwargs = learn_kwargs[alg]
kwargs.update(common_kwargs)
episode_len = 5
env_fn = lambda: FixedSequenceEnv(10, episode_len=episode_len)
learn = lambda e: get_learn_function(alg)(
env=e,
network=rnn,
**kwargs
)
simple_test(env_fn, learn, 0.7)
if __name__ == '__main__':
test_fixed_sequence('ppo2', 'lstm')


@@ -0,0 +1,55 @@
import pytest
from baselines.common.tests.envs.identity_env import DiscreteIdentityEnv, BoxIdentityEnv
from baselines.run import get_learn_function
from baselines.common.tests.util import simple_test
common_kwargs = dict(
total_timesteps=30000,
network='mlp',
gamma=0.9,
seed=0,
)
learn_kwargs = {
'a2c' : {},
'acktr': {},
'deepq': {},
'ppo2': dict(lr=1e-3, nsteps=64, ent_coef=0.0),
'trpo_mpi': dict(timesteps_per_batch=100, cg_iters=10, gamma=0.9, lam=1.0, max_kl=0.01)
}
@pytest.mark.slow
@pytest.mark.parametrize("alg", learn_kwargs.keys())
def test_discrete_identity(alg):
'''
Test if the algorithm (with an mlp policy)
can learn an identity transformation (i.e. return observation as an action)
'''
kwargs = learn_kwargs[alg]
kwargs.update(common_kwargs)
learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs)
env_fn = lambda: DiscreteIdentityEnv(10, episode_len=100)
simple_test(env_fn, learn_fn, 0.9)
@pytest.mark.slow
@pytest.mark.parametrize("alg", ['a2c', 'ppo2', 'trpo_mpi'])
def test_continuous_identity(alg):
'''
Test if the algorithm (with an mlp policy)
can learn an identity transformation (i.e. return observation as an action)
to a required precision
'''
kwargs = learn_kwargs[alg]
kwargs.update(common_kwargs)
learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs)
env_fn = lambda: BoxIdentityEnv((1,), episode_len=100)
simple_test(env_fn, learn_fn, -0.1)
if __name__ == '__main__':
test_continuous_identity('a2c')


@@ -0,0 +1,50 @@
import pytest
# from baselines.acer import acer_simple as acer
from baselines.common.tests.envs.mnist_env import MnistEnv
from baselines.common.tests.util import simple_test
from baselines.run import get_learn_function
# TODO investigate a2c and ppo2 failures - is it due to bad hyperparameters for this problem?
# GitHub issue https://github.com/openai/baselines/issues/189
common_kwargs = {
'seed': 0,
'network':'cnn',
'gamma':0.9,
'pad':'SAME'
}
learn_args = {
'a2c': dict(total_timesteps=50000),
# TODO need to resolve inference (step) API differences for acer; also slow
# 'acer': dict(seed=0, total_timesteps=1000),
'deepq': dict(total_timesteps=5000),
'acktr': dict(total_timesteps=30000),
'ppo2': dict(total_timesteps=50000, lr=1e-3, nsteps=128, ent_coef=0.0),
'trpo_mpi': dict(total_timesteps=80000, timesteps_per_batch=100, cg_iters=10, lam=1.0, max_kl=0.001)
}
#tests pass, but are too slow on travis. Same algorithms are covered
# by other tests with less compute-hungry nn's and by benchmarks
@pytest.mark.skip
@pytest.mark.slow
@pytest.mark.parametrize("alg", learn_args.keys())
def test_mnist(alg):
'''
Test if the algorithm can learn to classify MNIST digits.
Uses CNN policy.
'''
learn_kwargs = learn_args[alg]
learn_kwargs.update(common_kwargs)
learn = get_learn_function(alg)
learn_fn = lambda e: learn(env=e, **learn_kwargs)
env_fn = lambda: MnistEnv(seed=0, episode_len=100)
simple_test(env_fn, learn_fn, 0.6)
if __name__ == '__main__':
test_mnist('deepq')


@@ -0,0 +1,97 @@
import os
import tempfile
import pytest
import tensorflow as tf
import numpy as np
from baselines.common.tests.envs.mnist_env import MnistEnv
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.run import get_learn_function
from baselines.common.tf_util import make_session, get_session
from functools import partial
learn_kwargs = {
'deepq': {},
'a2c': {},
'acktr': {},
'ppo2': {'nminibatches': 1, 'nsteps': 10},
'trpo_mpi': {},
}
network_kwargs = {
'mlp': {},
'cnn': {'pad': 'SAME'},
'lstm': {},
'cnn_lnlstm': {'pad': 'SAME'}
}
@pytest.mark.parametrize("learn_fn", learn_kwargs.keys())
@pytest.mark.parametrize("network_fn", network_kwargs.keys())
def test_serialization(learn_fn, network_fn):
'''
Test if the trained model can be serialized
'''
if network_fn.endswith('lstm') and learn_fn in ['acktr', 'trpo_mpi', 'deepq']:
# TODO make acktr work with recurrent policies
# and test
# github issue: https://github.com/openai/baselines/issues/194
return
env = DummyVecEnv([lambda: MnistEnv(10, episode_len=100)])
ob = env.reset().copy()
learn = get_learn_function(learn_fn)
kwargs = {}
kwargs.update(network_kwargs[network_fn])
kwargs.update(learn_kwargs[learn_fn])
learn = partial(learn, env=env, network=network_fn, seed=0, **kwargs)
with tempfile.TemporaryDirectory() as td:
model_path = os.path.join(td, 'serialization_test_model')
with tf.Graph().as_default(), make_session().as_default():
model = learn(total_timesteps=100)
model.save(model_path)
mean1, std1 = _get_action_stats(model, ob)
variables_dict1 = _serialize_variables()
with tf.Graph().as_default(), make_session().as_default():
model = learn(total_timesteps=0, load_path=model_path)
mean2, std2 = _get_action_stats(model, ob)
variables_dict2 = _serialize_variables()
for k, v in variables_dict1.items():
np.testing.assert_allclose(v, variables_dict2[k], atol=0.01,
err_msg='saved and loaded variable {} value mismatch'.format(k))
np.testing.assert_allclose(mean1, mean2, atol=0.5)
np.testing.assert_allclose(std1, std2, atol=0.5)
def _serialize_variables():
sess = get_session()
variables = tf.trainable_variables()
values = sess.run(variables)
return {var.name: value for var, value in zip(variables, values)}
def _get_action_stats(model, ob):
ntrials = 1000
if model.initial_state is None or model.initial_state == []:
actions = np.array([model.step(ob)[0] for _ in range(ntrials)])
else:
actions = np.array([model.step(ob, S=model.initial_state, M=[False])[0] for _ in range(ntrials)])
mean = np.mean(actions, axis=0)
std = np.std(actions, axis=0)
return mean, std


@@ -0,0 +1,91 @@
import tensorflow as tf
import numpy as np
from gym.spaces import np_random
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
N_TRIALS = 10000
N_EPISODES = 100
def simple_test(env_fn, learn_fn, min_reward_fraction, n_trials=N_TRIALS):
np.random.seed(0)
np_random.seed(0)
env = DummyVecEnv([env_fn])
with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default():
tf.set_random_seed(0)
model = learn_fn(env)
sum_rew = 0
done = True
for i in range(n_trials):
if done:
obs = env.reset()
state = model.initial_state
if state is not None:
a, v, state, _ = model.step(obs, S=state, M=[False])
else:
a, v, _, _ = model.step(obs)
obs, rew, done, _ = env.step(a)
sum_rew += float(rew)
print("Reward in {} trials is {}".format(n_trials, sum_rew))
assert sum_rew > min_reward_fraction * n_trials, \
'sum of rewards {} is less than {} of the total number of trials {}'.format(sum_rew, min_reward_fraction, n_trials)
def reward_per_episode_test(env_fn, learn_fn, min_avg_reward, n_trials=N_EPISODES):
env = DummyVecEnv([env_fn])
with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default():
model = learn_fn(env)
observations, actions, rewards = rollout(env, model, n_trials)
rewards = [sum(r) for r in rewards]
avg_rew = sum(rewards) / n_trials
print("Average reward in {} episodes is {}".format(n_trials, avg_rew))
assert avg_rew > min_avg_reward, \
'average reward in {} episodes ({}) is less than {}'.format(n_trials, avg_rew, min_avg_reward)
def rollout(env, model, n_trials):
rewards = []
actions = []
observations = []
for i in range(n_trials):
obs = env.reset()
state = model.initial_state
episode_rew = []
episode_actions = []
episode_obs = []
while True:
if state is not None:
a, v, state, _ = model.step(obs, S=state, M=[False])
else:
a, v, _, _ = model.step(obs)
obs, rew, done, _ = env.step(a)
episode_rew.append(rew)
episode_actions.append(a)
episode_obs.append(obs)
if done:
break
rewards.append(episode_rew)
actions.append(episode_actions)
observations.append(episode_obs)
return observations, actions, rewards
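A minimal usage sketch (not part of the diff) of how simple_test is wired to a learn function, mirroring the MNIST test earlier in this changeset; the hyperparameters are copied from that test and are illustrative only:
from baselines.common.tests.util import simple_test
from baselines.common.tests.envs.mnist_env import MnistEnv
from baselines.run import get_learn_function

learn = get_learn_function('ppo2')
learn_fn = lambda e: learn(env=e, network='cnn', pad='SAME', seed=0, gamma=0.9,
                           total_timesteps=50000, lr=1e-3, nsteps=128, ent_coef=0.0)
env_fn = lambda: MnistEnv(seed=0, episode_len=100)
simple_test(env_fn, learn_fn, 0.6)  # require mean per-step reward above 0.6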


@@ -1,3 +1,4 @@
+import joblib
import numpy as np
import tensorflow as tf  # pylint: ignore-module
import copy
@@ -48,17 +49,28 @@ def huber_loss(x, delta=1.0):
# Global session
# ================================================================
-def make_session(num_cpu=None, make_default=False, graph=None):
+def get_session(config=None):
+"""Get default session or create one with a given config"""
+sess = tf.get_default_session()
+if sess is None:
+sess = make_session(config=config, make_default=True)
+return sess
+def make_session(config=None, num_cpu=None, make_default=False, graph=None):
"""Returns a session that will use <num_cpu> CPU's only"""
if num_cpu is None:
num_cpu = int(os.getenv('RCALL_NUM_CPU', multiprocessing.cpu_count()))
-tf_config = tf.ConfigProto(
+if config is None:
+config = tf.ConfigProto(
+allow_soft_placement=True,
inter_op_parallelism_threads=num_cpu,
intra_op_parallelism_threads=num_cpu)
+config.gpu_options.allow_growth = True
if make_default:
-return tf.InteractiveSession(config=tf_config, graph=graph)
+return tf.InteractiveSession(config=config, graph=graph)
else:
-return tf.Session(config=tf_config, graph=graph)
+return tf.Session(config=config, graph=graph)
def single_threaded_session():
"""Returns a session which will only use a single CPU"""
@@ -76,7 +88,7 @@ ALREADY_INITIALIZED = set()
def initialize():
"""Initialize all the uninitialized variables in the global scope."""
new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED
-tf.get_default_session().run(tf.variables_initializer(new_variables))
+get_session().run(tf.variables_initializer(new_variables))
ALREADY_INITIALIZED.update(new_variables)
# ================================================================
@@ -85,7 +97,7 @@ def initialize():
def normc_initializer(std=1.0, axis=0):
def _initializer(shape, dtype=None, partition_info=None): # pylint: disable=W0613
-out = np.random.randn(*shape).astype(np.float32)
+out = np.random.randn(*shape).astype(dtype.as_numpy_dtype)
out *= std / np.sqrt(np.square(out).sum(axis=axis, keepdims=True))
return tf.constant(out)
return _initializer
@@ -179,7 +191,7 @@ class _Function(object):
if hasattr(inpt, 'make_feed_dict'):
feed_dict.update(inpt.make_feed_dict(value))
else:
-feed_dict[inpt] = value
+feed_dict[inpt] = adjust_shape(inpt, value)
def __call__(self, *args):
assert len(args) <= len(self.inputs), "Too many arguments provided"
@@ -189,8 +201,8 @@ class _Function(object):
self._feed_input(feed_dict, inpt, value)
# Update feed dict with givens.
for inpt in self.givens:
-feed_dict[inpt] = feed_dict.get(inpt, self.givens[inpt])
+feed_dict[inpt] = adjust_shape(inpt, feed_dict.get(inpt, self.givens[inpt]))
-results = tf.get_default_session().run(self.outputs_update, feed_dict=feed_dict)[:-1]
+results = get_session().run(self.outputs_update, feed_dict=feed_dict)[:-1]
return results
# ================================================================
@@ -243,14 +255,23 @@ class GetFlat(object):
def __call__(self):
return tf.get_default_session().run(self.op)
+def flattenallbut0(x):
+return tf.reshape(x, [-1, intprod(x.get_shape().as_list()[1:])])
+# ================================================================
+# TF placeholders management
+# ================================================================
_PLACEHOLDER_CACHE = {}  # name -> (placeholder, dtype, shape)
def get_placeholder(name, dtype, shape):
if name in _PLACEHOLDER_CACHE:
out, dtype1, shape1 = _PLACEHOLDER_CACHE[name]
-assert dtype1 == dtype and shape1 == shape
+if out.graph == tf.get_default_graph():
+assert dtype1 == dtype and shape1 == shape, \
+'Placeholder with name {} has already been registered and has shape {}, different from requested {}'.format(name, shape1, shape)
return out
+else:
out = tf.placeholder(dtype=dtype, shape=shape, name=name)
_PLACEHOLDER_CACHE[name] = (out, dtype, shape)
return out
@@ -258,8 +279,6 @@ def get_placeholder(name, dtype, shape):
def get_placeholder_cached(name):
return _PLACEHOLDER_CACHE[name][0]
-def flattenallbut0(x):
-return tf.reshape(x, [-1, intprod(x.get_shape().as_list()[1:])])
# ================================================================
@@ -292,13 +311,95 @@ def get_available_gpus():
# Saving variables
# ================================================================
-def load_state(fname):
+def load_state(fname, sess=None):
+sess = sess or get_session()
saver = tf.train.Saver()
saver.restore(tf.get_default_session(), fname)
-def save_state(fname):
+def save_state(fname, sess=None):
+sess = sess or get_session()
os.makedirs(os.path.dirname(fname), exist_ok=True)
saver = tf.train.Saver()
saver.save(tf.get_default_session(), fname)
# The methods above and below are clearly doing the same thing, and in a rather similar way
# TODO: ensure there is no subtle differences and remove one
def save_variables(save_path, variables=None, sess=None):
sess = sess or get_session()
variables = variables or tf.trainable_variables()
ps = sess.run(variables)
save_dict = {v.name: value for v, value in zip(variables, ps)}
os.makedirs(os.path.dirname(save_path), exist_ok=True)
joblib.dump(save_dict, save_path)
def load_variables(load_path, variables=None, sess=None):
sess = sess or get_session()
variables = variables or tf.trainable_variables()
loaded_params = joblib.load(os.path.expanduser(load_path))
restores = []
for v in variables:
restores.append(v.assign(loaded_params[v.name]))
sess.run(restores)
# ================================================================
# Shape adjustment for feeding into tf placeholders
# ================================================================
def adjust_shape(placeholder, data):
'''
adjust shape of the data to the shape of the placeholder if possible.
If shape is incompatible, AssertionError is thrown
Parameters:
placeholder tensorflow input placeholder
data input data to be (potentially) reshaped to be fed into placeholder
Returns:
reshaped data
'''
if not isinstance(data, np.ndarray) and not isinstance(data, list):
return data
if isinstance(data, list):
data = np.array(data)
placeholder_shape = [x or -1 for x in placeholder.shape.as_list()]
assert _check_shape(placeholder_shape, data.shape), \
'Shape of data {} is not compatible with shape of the placeholder {}'.format(data.shape, placeholder_shape)
return np.reshape(data, placeholder_shape)
def _check_shape(placeholder_shape, data_shape):
''' check if two shapes are compatible (i.e. differ only by dimensions of size 1, or by the batch dimension)'''
return True
squeezed_placeholder_shape = _squeeze_shape(placeholder_shape)
squeezed_data_shape = _squeeze_shape(data_shape)
for i, s_data in enumerate(squeezed_data_shape):
s_placeholder = squeezed_placeholder_shape[i]
if s_placeholder != -1 and s_data != s_placeholder:
return False
return True
def _squeeze_shape(shape):
return [x for x in shape if x != 1]
# Tensorboard interfacing
# ================================================================
def launch_tensorboard_in_background(log_dir):
from tensorboard import main as tb
import threading
tf.flags.FLAGS.logdir = log_dir
t = threading.Thread(target=tb.main, args=([]))
t.start()
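A brief sketch (not part of the diff) of the helpers added above; the checkpoint path is a made-up example:
import numpy as np
import tensorflow as tf
from baselines.common.tf_util import adjust_shape, save_variables, load_variables, get_session

ph = tf.placeholder(tf.float32, [None, 84, 84, 4])
single_obs = np.zeros((84, 84, 4), dtype=np.float32)
batch = adjust_shape(ph, single_obs)    # reshaped to (1, 84, 84, 4) to match the placeholder
w = tf.get_variable('w', shape=[2, 2])  # some trainable variable to save/restore
get_session().run(tf.global_variables_initializer())
save_variables('/tmp/tf_util_demo')     # dumps {variable name: value} with joblib
load_variables('/tmp/tf_util_demo')     # restores the same variables by name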


@@ -30,15 +30,30 @@ class DummyVecEnv(VecEnv):
self.actions = None
def step_async(self, actions):
+listify = True
+try:
+if len(actions) == self.num_envs:
+listify = False
+except TypeError:
+pass
+if not listify:
self.actions = actions
+else:
+assert self.num_envs == 1, "actions {} is either not a list or has a wrong size - cannot match to {} environments".format(actions, self.num_envs)
+self.actions = [actions]
def step_wait(self):
for e in range(self.num_envs):
-obs, self.buf_rews[e], self.buf_dones[e], self.buf_infos[e] = self.envs[e].step(self.actions[e])
+action = self.actions[e]
+if isinstance(self.envs[e].action_space, spaces.Discrete):
+action = int(action)
+obs, self.buf_rews[e], self.buf_dones[e], self.buf_infos[e] = self.envs[e].step(action)
if self.buf_dones[e]:
obs = self.envs[e].reset()
self._save_obs(e, obs)
-return (self._obs_from_buf(), np.copy(self.buf_rews), np.copy(self.buf_dones),
+return (np.copy(self._obs_from_buf()), np.copy(self.buf_rews), np.copy(self.buf_dones),
self.buf_infos.copy())
def reset(self):
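A short sketch (not part of the diff) of the behaviour added above: with a single wrapped environment, step() now accepts a bare action as well as the usual length-num_envs list.
import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

env = DummyVecEnv([lambda: gym.make('CartPole-v0')])
obs = env.reset()
obs, rew, done, info = env.step([0])  # vectorized call, one action per env
obs, rew, done, info = env.step(0)    # also accepted now: a scalar action is wrapped into a list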


@@ -7,6 +7,7 @@ from baselines.common.tile_images import tile_images
def worker(remote, parent_remote, env_fn_wrapper):
parent_remote.close()
env = env_fn_wrapper.x()
+try:
while True:
cmd, data = remote.recv()
if cmd == 'step':
@@ -26,7 +27,10 @@ def worker(remote, parent_remote, env_fn_wrapper):
remote.send((env.observation_space, env.action_space))
else:
raise NotImplementedError
+except KeyboardInterrupt:
+print('SubprocVecEnv worker: got KeyboardInterrupt')
+finally:
+env.close()
class SubprocVecEnv(VecEnv):
def __init__(self, env_fns, spaces=None):
def __init__(self, env_fns, spaces=None): def __init__(self, env_fns, spaces=None):


@@ -10,6 +10,8 @@ class VecNormalize(VecEnvWrapper):
VecEnvWrapper.__init__(self, venv)
self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
self.ret_rms = RunningMeanStd(shape=()) if ret else None
+#self.ob_rms = TfRunningMeanStd(shape=self.observation_space.shape, scope='observation_running_mean_std') if ob else None
+#self.ret_rms = TfRunningMeanStd(shape=(), scope='return_running_mean_std') if ret else None
self.clipob = clipob
self.cliprew = cliprew
self.ret = np.zeros(self.num_envs)


@@ -26,9 +26,9 @@ def reduce_std(x, axis=None, keepdims=False):
return tf.sqrt(reduce_var(x, axis=axis, keepdims=keepdims))
def reduce_var(x, axis=None, keepdims=False):
-m = tf.reduce_mean(x, axis=axis, keep_dims=True)
+m = tf.reduce_mean(x, axis=axis, keepdims=True)
devs_squared = tf.square(x - m)
-return tf.reduce_mean(devs_squared, axis=axis, keep_dims=keepdims)
+return tf.reduce_mean(devs_squared, axis=axis, keepdims=keepdims)
def get_target_updates(vars, target_vars, tau):
logger.info('setting up target updates ...')


@@ -1,6 +1,6 @@
from baselines.deepq import models  # noqa
from baselines.deepq.build_graph import build_act, build_train  # noqa
-from baselines.deepq.simple import learn, load  # noqa
+from baselines.deepq.deepq import learn, load_act  # noqa
from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer  # noqa
def wrap_atari_dqn(env):


@@ -10,20 +10,24 @@ import baselines.common.tf_util as U
from baselines.common.tf_util import load_state, save_state
from baselines import logger
from baselines.common.schedules import LinearSchedule
-from baselines.common.input import observation_input
+from baselines.common import set_global_seeds
from baselines import deepq
from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer
from baselines.deepq.utils import ObservationInput
+from baselines.common.tf_util import get_session
+from baselines.deepq.models import build_q_func
class ActWrapper(object):
def __init__(self, act, act_params):
self._act = act
self._act_params = act_params
+self.initial_state = None
@staticmethod
-def load(path):
+def load_act(self, path):
with open(path, "rb") as f:
model_data, act_params = cloudpickle.load(f)
act = deepq.build_act(**act_params)
@@ -42,7 +46,10 @@ class ActWrapper(object):
def __call__(self, *args, **kwargs):
return self._act(*args, **kwargs)
-def save(self, path=None):
+def step(self, observation, **kwargs):
+return self._act([observation], **kwargs), None, None, None
+def save_act(self, path=None):
"""Save model to a pickle located at `path`"""
if path is None:
path = os.path.join(logger.get_dir(), "model.pkl")
@@ -61,8 +68,11 @@ class ActWrapper(object):
with open(path, "wb") as f:
cloudpickle.dump((model_data, self._act_params), f)
+def save(self, path):
+save_state(path)
-def load(path):
+def load_act(path):
"""Load act function that was returned by learn function.
Parameters
@@ -76,13 +86,14 @@ def load(path):
function that takes a batch of observations
and returns actions.
"""
-return ActWrapper.load(path)
+return ActWrapper.load_act(path)
def learn(env,
-q_func,
+network,
+seed=None,
lr=5e-4,
-max_timesteps=100000,
+total_timesteps=100000,
buffer_size=50000,
exploration_fraction=0.1,
exploration_final_eps=0.02,
@@ -100,7 +111,10 @@ def learn(env,
prioritized_replay_beta_iters=None,
prioritized_replay_eps=1e-6,
param_noise=False,
-callback=None):
+callback=None,
+load_path=None,
+**network_kwargs
+):
"""Train a deepq model.
Parameters
@@ -119,7 +133,7 @@ def learn(env,
and returns a tensor of shape (batch_size, num_actions) with values of every action.
lr: float
learning rate for adam optimizer
-max_timesteps: int
+total_timesteps: int
number of env steps to optimizer for
buffer_size: int
size of the replay buffer
@@ -153,12 +167,16 @@ def learn(env,
initial value of beta for prioritized replay buffer
prioritized_replay_beta_iters: int
number of iterations over which beta will be annealed from initial value
-to 1.0. If set to None equals to max_timesteps.
+to 1.0. If set to None equals to total_timesteps.
prioritized_replay_eps: float
epsilon to add to the TD errors when updating priorities.
callback: (locals, globals) -> None
function called at every steps with state of the algorithm.
If callback returns true training stops.
+load_path: str
+path to load the model from. (default: None)
+**network_kwargs
+additional keyword arguments to pass to the network builder.
Returns
-------
@@ -168,8 +186,10 @@ def learn(env,
"""
# Create all the functions necessary to train the model
-sess = tf.Session()
-sess.__enter__()
+sess = get_session()
+set_global_seeds(seed)
+q_func = build_q_func(network, **network_kwargs)
# capture the shape outside the closure so that the env object is not serialized
# by cloudpickle when serializing make_obs_ph
@@ -199,7 +219,7 @@ def learn(env,
if prioritized_replay:
replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
if prioritized_replay_beta_iters is None:
-prioritized_replay_beta_iters = max_timesteps
+prioritized_replay_beta_iters = total_timesteps
beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
initial_p=prioritized_replay_beta0,
final_p=1.0)
@@ -207,7 +227,7 @@ def learn(env,
replay_buffer = ReplayBuffer(buffer_size)
beta_schedule = None
# Create the schedule for exploration starting from 1.
-exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
+exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps),
initial_p=1.0,
final_p=exploration_final_eps)
@@ -225,12 +245,17 @@ def learn(env,
model_file = os.path.join(td, "model")
model_saved = False
if tf.train.latest_checkpoint(td) is not None:
load_state(model_file)
logger.log('Loaded model from {}'.format(model_file))
model_saved = True
+elif load_path is not None:
+load_state(load_path)
+logger.log('Loaded model from {}'.format(load_path))
-for t in range(max_timesteps):
+for t in range(total_timesteps):
if callback is not None:
if callback(locals(), globals()):
break
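An illustrative call (not part of the diff) of the updated learn() signature, which takes a network name plus **network_kwargs instead of a q_func, and total_timesteps instead of max_timesteps; the environment and hyperparameters below are placeholders:
import gym
from baselines import deepq

env = gym.make('CartPole-v0')
act = deepq.learn(
    env,
    network='mlp',          # resolved internally via build_q_func
    seed=0,
    lr=1e-3,
    total_timesteps=10000,
    buffer_size=50000,
    exploration_fraction=0.1,
    exploration_final_eps=0.02,
    print_freq=10,
)
act.save_act('cartpole_model.pkl')  # hypothetical output path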


@@ -0,0 +1,21 @@
def atari():
return dict(
network='conv_only',
lr=1e-4,
buffer_size=10000,
exploration_fraction=0.1,
exploration_final_eps=0.01,
train_freq=4,
learning_starts=10000,
target_network_update_freq=1000,
gamma=0.99,
prioritized_replay=True,
prioritized_replay_alpha=0.6,
checkpoint_freq=10000,
checkpoint_path=None,
dueling=True
)
def retro():
return atari()


@@ -0,0 +1,34 @@
import argparse
import numpy as np
from baselines import deepq
from baselines.common import retro_wrappers
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--env', help='environment ID', default='SuperMarioBros-Nes')
parser.add_argument('--gamestate', help='game state to load', default='Level1-1')
parser.add_argument('--model', help='model pickle file from ActWrapper.save', default='model.pkl')
args = parser.parse_args()
env = retro_wrappers.make_retro(game=args.env, state=args.gamestate, max_episode_steps=None)
env = retro_wrappers.wrap_deepmind_retro(env)
act = deepq.load(args.model)
while True:
obs, done = env.reset(), False
episode_rew = 0
while not done:
env.render()
action = act(obs[None])[0]
env_action = np.zeros(env.action_space.n)
env_action[action] = 1
obs, rew, done, _ = env.step(env_action)
episode_rew += rew
print('Episode reward', episode_rew)
if __name__ == '__main__':
main()


@@ -0,0 +1,49 @@
import argparse
from baselines import deepq
from baselines.common import set_global_seeds
from baselines import bench
from baselines import logger
from baselines.common import retro_wrappers
import retro
def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--env', help='environment ID', default='SuperMarioBros-Nes')
parser.add_argument('--gamestate', help='game state to load', default='Level1-1')
parser.add_argument('--seed', help='seed', type=int, default=0)
parser.add_argument('--num-timesteps', type=int, default=int(10e6))
args = parser.parse_args()
logger.configure()
set_global_seeds(args.seed)
env = retro_wrappers.make_retro(game=args.env, state=args.gamestate, max_episode_steps=10000, use_restricted_actions=retro.Actions.DISCRETE)
env.seed(args.seed)
env = bench.Monitor(env, logger.get_dir())
env = retro_wrappers.wrap_deepmind_retro(env)
model = deepq.models.cnn_to_mlp(
convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
hiddens=[256],
dueling=True
)
act = deepq.learn(
env,
q_func=model,
lr=1e-4,
max_timesteps=args.num_timesteps,
buffer_size=10000,
exploration_fraction=0.1,
exploration_final_eps=0.01,
train_freq=4,
learning_starts=10000,
target_network_update_freq=1000,
gamma=0.99,
prioritized_replay=True
)
act.save()
env.close()
if __name__ == '__main__':
main()


@@ -89,3 +89,41 @@ def cnn_to_mlp(convs, hiddens, dueling=False, layer_norm=False):
return lambda *args, **kwargs: _cnn_to_mlp(convs, hiddens, dueling, layer_norm=layer_norm, *args, **kwargs)
def build_q_func(network, hiddens=[256], dueling=True, layer_norm=False, **network_kwargs):
if isinstance(network, str):
from baselines.common.models import get_network_builder
network = get_network_builder(network)(**network_kwargs)
def q_func_builder(input_placeholder, num_actions, scope, reuse=False):
with tf.variable_scope(scope, reuse=reuse):
latent, _ = network(input_placeholder)
latent = layers.flatten(latent)
with tf.variable_scope("action_value"):
action_out = latent
for hidden in hiddens:
action_out = layers.fully_connected(action_out, num_outputs=hidden, activation_fn=None)
if layer_norm:
action_out = layers.layer_norm(action_out, center=True, scale=True)
action_out = tf.nn.relu(action_out)
action_scores = layers.fully_connected(action_out, num_outputs=num_actions, activation_fn=None)
if dueling:
with tf.variable_scope("state_value"):
state_out = latent
for hidden in hiddens:
state_out = layers.fully_connected(state_out, num_outputs=hidden, activation_fn=None)
if layer_norm:
state_out = layers.layer_norm(state_out, center=True, scale=True)
state_out = tf.nn.relu(state_out)
state_score = layers.fully_connected(state_out, num_outputs=1, activation_fn=None)
action_scores_mean = tf.reduce_mean(action_scores, 1)
action_scores_centered = action_scores - tf.expand_dims(action_scores_mean, 1)
q_out = state_score + action_scores_centered
else:
q_out = action_scores
return q_out
return q_func_builder


@@ -1,43 +0,0 @@
import tensorflow as tf
import random
from baselines import deepq
from baselines.common.identity_env import IdentityEnv
def test_identity():
with tf.Graph().as_default():
env = IdentityEnv(10)
random.seed(0)
tf.set_random_seed(0)
param_noise = False
model = deepq.models.mlp([32])
act = deepq.learn(
env,
q_func=model,
lr=1e-3,
max_timesteps=10000,
buffer_size=50000,
exploration_fraction=0.1,
exploration_final_eps=0.02,
print_freq=10,
param_noise=param_noise,
)
tf.set_random_seed(0)
N_TRIALS = 1000
sum_rew = 0
obs = env.reset()
for i in range(N_TRIALS):
obs, rew, done, _ = env.step(act([obs]))
sum_rew += rew
assert sum_rew > 0.9 * N_TRIALS
if __name__ == '__main__':
test_identity()


@@ -1,4 +1,5 @@
from baselines.common.input import observation_input
+from baselines.common.tf_util import adjust_shape
import tensorflow as tf
@@ -36,7 +37,7 @@ class PlaceholderTfInput(TfInput):
return self._placeholder
def make_feed_dict(self, data):
-return {self._placeholder: data}
+return {self._placeholder: adjust_shape(self._placeholder, data)}
class Uint8Input(PlaceholderTfInput):


@@ -18,7 +18,7 @@ def train(env_id, num_timesteps, seed):
logger.configure()
else:
logger.configure(format_strs=[])
-workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
+workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() if seed is not None else None
set_global_seeds(workerseed)
env = make_atari(env_id)
def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613


@@ -0,0 +1,22 @@
def mujoco():
return dict(
nsteps=2048,
nminibatches=32,
lam=0.95,
gamma=0.99,
noptepochs=10,
log_interval=1,
ent_coef=0.0,
lr=lambda f: 3e-4 * f,
cliprange=0.2,
value_network='copy'
)
def atari():
return dict(
nsteps=128, nminibatches=4,
lam=0.95, gamma=0.99, noptepochs=4, log_interval=1,
ent_coef=.01,
lr=lambda f : f * 2.5e-4,
cliprange=lambda f : f * 0.1,
)


@@ -1,146 +0,0 @@
import numpy as np
import tensorflow as tf
from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm
from baselines.common.distributions import make_pdtype
from baselines.common.input import observation_input
def nature_cnn(unscaled_images, **conv_kwargs):
"""
CNN from Nature paper.
"""
scaled_images = tf.cast(unscaled_images, tf.float32) / 255.
activ = tf.nn.relu
h = activ(conv(scaled_images, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2),
**conv_kwargs))
h2 = activ(conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), **conv_kwargs))
h3 = activ(conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), **conv_kwargs))
h3 = conv_to_fc(h3)
return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2)))
class LnLstmPolicy(object):
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
nenv = nbatch // nsteps
X, processed_x = observation_input(ob_space, nbatch)
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
self.pdtype = make_pdtype(ac_space)
with tf.variable_scope("model", reuse=reuse):
h = nature_cnn(processed_x)
xs = batch_to_seq(h, nenv, nsteps)
ms = batch_to_seq(M, nenv, nsteps)
h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
h5 = seq_to_batch(h5)
vf = fc(h5, 'v', 1)
self.pd, self.pi = self.pdtype.pdfromlatent(h5)
v0 = vf[:, 0]
a0 = self.pd.sample()
neglogp0 = self.pd.neglogp(a0)
self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
def step(ob, state, mask):
return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})
def value(ob, state, mask):
return sess.run(v0, {X:ob, S:state, M:mask})
self.X = X
self.M = M
self.S = S
self.vf = vf
self.step = step
self.value = value
class LstmPolicy(object):
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
nenv = nbatch // nsteps
self.pdtype = make_pdtype(ac_space)
X, processed_x = observation_input(ob_space, nbatch)
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
with tf.variable_scope("model", reuse=reuse):
h = nature_cnn(X)
xs = batch_to_seq(h, nenv, nsteps)
ms = batch_to_seq(M, nenv, nsteps)
h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
h5 = seq_to_batch(h5)
vf = fc(h5, 'v', 1)
self.pd, self.pi = self.pdtype.pdfromlatent(h5)
v0 = vf[:, 0]
a0 = self.pd.sample()
neglogp0 = self.pd.neglogp(a0)
self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
def step(ob, state, mask):
return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})
def value(ob, state, mask):
return sess.run(v0, {X:ob, S:state, M:mask})
self.X = X
self.M = M
self.S = S
self.vf = vf
self.step = step
self.value = value
class CnnPolicy(object):
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613
self.pdtype = make_pdtype(ac_space)
X, processed_x = observation_input(ob_space, nbatch)
with tf.variable_scope("model", reuse=reuse):
h = nature_cnn(processed_x, **conv_kwargs)
vf = fc(h, 'v', 1)[:,0]
self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)
a0 = self.pd.sample()
neglogp0 = self.pd.neglogp(a0)
self.initial_state = None
def step(ob, *_args, **_kwargs):
a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
return a, v, self.initial_state, neglogp
def value(ob, *_args, **_kwargs):
return sess.run(vf, {X:ob})
self.X = X
self.vf = vf
self.step = step
self.value = value
class MlpPolicy(object):
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
self.pdtype = make_pdtype(ac_space)
with tf.variable_scope("model", reuse=reuse):
X, processed_x = observation_input(ob_space, nbatch)
activ = tf.tanh
processed_x = tf.layers.flatten(processed_x)
pi_h1 = activ(fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
vf_h1 = activ(fc(processed_x, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
vf = fc(vf_h2, 'vf', 1)[:,0]
self.pd, self.pi = self.pdtype.pdfromlatent(pi_h2, init_scale=0.01)
a0 = self.pd.sample()
neglogp0 = self.pd.neglogp(a0)
self.initial_state = None
def step(ob, *_args, **_kwargs):
a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
return a, v, self.initial_state, neglogp
def value(ob, *_args, **_kwargs):
return sess.run(vf, {X:ob})
self.X = X
self.vf = vf
self.step = step
self.value = value


@@ -1,21 +1,29 @@
import os
import time
-import joblib
+import functools
import numpy as np
import os.path as osp
import tensorflow as tf
from baselines import logger
from collections import deque
-from baselines.common import explained_variance
+from baselines.common import explained_variance, set_global_seeds
+from baselines.common.policies import build_policy
from baselines.common.runners import AbstractEnvRunner
+from baselines.common.tf_util import get_session, save_variables, load_variables
+from baselines.common.mpi_adam_optimizer import MpiAdamOptimizer
+from mpi4py import MPI
+from baselines.common.tf_util import initialize
+from baselines.common.mpi_util import sync_from_root
class Model(object):
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
nsteps, ent_coef, vf_coef, max_grad_norm):
-sess = tf.get_default_session()
+sess = get_session()
-act_model = policy(sess, ob_space, ac_space, nbatch_act, 1, reuse=False)
-train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps, reuse=True)
+with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
+act_model = policy(nbatch_act, 1, sess)
+train_model = policy(nbatch_train, nsteps, sess)
A = train_model.pdtype.sample_placeholder([None])
ADV = tf.placeholder(tf.float32, [None])
@@ -40,14 +48,16 @@ class Model(object):
approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
-with tf.variable_scope('model'):
-params = tf.trainable_variables()
-grads = tf.gradients(loss, params)
+params = tf.trainable_variables('ppo2_model')
+trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5)
+grads_and_var = trainer.compute_gradients(loss, params)
+grads, var = zip(*grads_and_var)
if max_grad_norm is not None:
grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
-grads = list(zip(grads, params))
-trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
-_train = trainer.apply_gradients(grads)
+grads_and_var = list(zip(grads, var))
+_train = trainer.apply_gradients(grads_and_var)
def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
advs = returns - values
@@ -63,17 +73,6 @@ class Model(object):
)[:-1]
self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
-def save(save_path):
-ps = sess.run(params)
-joblib.dump(ps, save_path)
-def load(load_path):
-loaded_params = joblib.load(load_path)
-restores = []
-for p, loaded_p in zip(params, loaded_params):
-restores.append(p.assign(loaded_p))
-sess.run(restores)
-# If you want to load weights, also save/load observation scaling inside VecNormalize
self.train = train
self.train_model = train_model
@@ -81,9 +80,14 @@ class Model(object):
self.step = act_model.step
self.value = act_model.value
self.initial_state = act_model.initial_state
-self.save = save
-self.load = load
-tf.global_variables_initializer().run(session=sess) #pylint: disable=E1101
+self.save = functools.partial(save_variables, sess=sess)
+self.load = functools.partial(load_variables, sess=sess)
+if MPI.COMM_WORLD.Get_rank() == 0:
+initialize()
+global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")
+sync_from_root(sess, global_variables) #pylint: disable=E1101
class Runner(AbstractEnvRunner):
@@ -97,7 +101,7 @@ class Runner(AbstractEnvRunner):
mb_states = self.states
epinfos = []
for _ in range(self.nsteps):
-actions, values, self.states, neglogpacs = self.model.step(self.obs, self.states, self.dones)
+actions, values, self.states, neglogpacs = self.model.step(self.obs, S=self.states, M=self.dones)
mb_obs.append(self.obs.copy())
mb_actions.append(actions)
mb_values.append(values)
@@ -115,7 +119,7 @@ class Runner(AbstractEnvRunner):
mb_values = np.asarray(mb_values, dtype=np.float32)
mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
mb_dones = np.asarray(mb_dones, dtype=np.bool)
-last_values = self.model.value(self.obs, self.states, self.dones)
+last_values = self.model.value(self.obs, S=self.states, M=self.dones)
#discount/bootstrap off value fn
mb_returns = np.zeros_like(mb_rewards)
mb_advs = np.zeros_like(mb_rewards)
@@ -145,10 +149,65 @@ def constfn(val):
return val
return f
-def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr,
+def learn(*, network, env, total_timesteps, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4,
vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95,
log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
-save_interval=0, load_path=None):
+save_interval=0, load_path=None, **network_kwargs):
'''
Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)
Parameters:
----------
network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
See baselines.common/policies.py/lstm for more details on using recurrent nets in policies
env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation.
The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.
nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
nenv is number of environment copies simulated in parallel)
total_timesteps: int number of timesteps (i.e. number of actions taken in the environment)
ent_coef: float policy entropy coefficient in the optimization objective
lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
training and 0 is the end of the training.
vf_coef: float value function loss coefficient in the optimization objective
max_grad_norm: float or None gradient norm clipping coefficient
gamma: float discounting factor
lam: float advantage estimation discounting factor (lambda in the paper)
log_interval: int number of timesteps between logging events
nminibatches: int number of training minibatches per update
noptepochs: int number of training epochs per update
cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
and 0 is the end of the training
save_interval: int number of timesteps between saving events
load_path: str path to load the model from
**network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
For instance, 'mlp' network architecture has arguments num_hidden and num_layers.
'''
set_global_seeds(seed)
if isinstance(lr, float): lr = constfn(lr)
else: assert callable(lr)
@@ -156,6 +215,8 @@ def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr,
else: assert callable(cliprange)
total_timesteps = int(total_timesteps)
+policy = build_policy(env, network, **network_kwargs)
nenvs = env.num_envs
ob_space = env.observation_space
ac_space = env.action_space
@@ -180,7 +241,6 @@ def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr,
nupdates = total_timesteps//nbatch
for update in range(1, nupdates+1):
assert nbatch % nminibatches == 0
-nbatch_train = nbatch // nminibatches
tstart = time.time()
frac = 1.0 - (update - 1.0) / nupdates
lrnow = lr(frac)
@@ -228,8 +288,9 @@ def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr,
logger.logkv('time_elapsed', tnow - tfirststart)
for (lossval, lossname) in zip(lossvals, model.loss_names):
logger.logkv(lossname, lossval)
+if MPI.COMM_WORLD.Get_rank() == 0:
logger.dumpkvs()
-if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
+if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and MPI.COMM_WORLD.Get_rank() == 0:
checkdir = osp.join(logger.get_dir(), 'checkpoints')
os.makedirs(checkdir, exist_ok=True)
savepath = osp.join(checkdir, '%.5i'%update)
@@ -240,3 +301,6 @@ def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr,
def safemean(xs):
return np.nan if len(xs) == 0 else np.mean(xs)
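An illustrative call (not part of the diff) of the new learn() entry point, which takes a network name and builds the policy internally, replacing the policy classes used by the deleted run_atari.py / run_mujoco.py scripts; the environment and hyperparameters (copied from the mujoco defaults added in this changeset) are placeholders:
import gym
from baselines.ppo2 import ppo2
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env.vec_normalize import VecNormalize

env = VecNormalize(DummyVecEnv([lambda: gym.make('CartPole-v0')]))
model = ppo2.learn(
    network='mlp',
    env=env,
    total_timesteps=100000,
    nsteps=2048, nminibatches=32, noptepochs=10,
    lam=0.95, gamma=0.99, ent_coef=0.0,
    lr=3e-4, cliprange=0.2, log_interval=1,
)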


@@ -1,40 +0,0 @@
#!/usr/bin/env python3
import sys
from baselines import logger
from baselines.common.cmd_util import make_atari_env, atari_arg_parser
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
from baselines.ppo2 import ppo2
from baselines.ppo2.policies import CnnPolicy, LstmPolicy, LnLstmPolicy, MlpPolicy
import multiprocessing
import tensorflow as tf
def train(env_id, num_timesteps, seed, policy):
ncpu = multiprocessing.cpu_count()
if sys.platform == 'darwin': ncpu //= 2
config = tf.ConfigProto(allow_soft_placement=True,
intra_op_parallelism_threads=ncpu,
inter_op_parallelism_threads=ncpu)
config.gpu_options.allow_growth = True #pylint: disable=E1101
tf.Session(config=config).__enter__()
env = VecFrameStack(make_atari_env(env_id, 8, seed), 4)
policy = {'cnn' : CnnPolicy, 'lstm' : LstmPolicy, 'lnlstm' : LnLstmPolicy, 'mlp': MlpPolicy}[policy]
ppo2.learn(policy=policy, env=env, nsteps=128, nminibatches=4,
lam=0.95, gamma=0.99, noptepochs=4, log_interval=1,
ent_coef=.01,
lr=lambda f : f * 2.5e-4,
cliprange=lambda f : f * 0.1,
total_timesteps=int(num_timesteps * 1.1))
def main():
parser = atari_arg_parser()
parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm', 'mlp'], default='cnn')
args = parser.parse_args()
logger.configure()
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed,
policy=args.policy)
if __name__ == '__main__':
main()


@@ -1,57 +0,0 @@
#!/usr/bin/env python3
import numpy as np
from baselines.common.cmd_util import mujoco_arg_parser
from baselines import bench, logger
def train(env_id, num_timesteps, seed):
from baselines.common import set_global_seeds
from baselines.common.vec_env.vec_normalize import VecNormalize
from baselines.ppo2 import ppo2
from baselines.ppo2.policies import MlpPolicy
import gym
import tensorflow as tf
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
ncpu = 1
config = tf.ConfigProto(allow_soft_placement=True,
intra_op_parallelism_threads=ncpu,
inter_op_parallelism_threads=ncpu)
tf.Session(config=config).__enter__()
def make_env():
env = gym.make(env_id)
env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)
return env
env = DummyVecEnv([make_env])
env = VecNormalize(env)
set_global_seeds(seed)
policy = MlpPolicy
model = ppo2.learn(policy=policy, env=env, nsteps=2048, nminibatches=32,
lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
ent_coef=0.0,
lr=3e-4,
cliprange=0.2,
total_timesteps=num_timesteps)
return model, env
def main():
args = mujoco_arg_parser().parse_args()
logger.configure()
model, env = train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
if args.play:
logger.log("Running trained model")
obs = np.zeros((env.num_envs,) + env.observation_space.shape)
obs[:] = env.reset()
while True:
actions = model.step(obs)[0]
obs[:] = env.step(actions)[0]
env.render()
if __name__ == '__main__':
main()

baselines/run.py (new file, 230 lines)

@@ -0,0 +1,230 @@
import sys
import multiprocessing
import os
import os.path as osp
import gym
from collections import defaultdict
import tensorflow as tf
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
from baselines.common.cmd_util import common_arg_parser, parse_unknown_args, make_mujoco_env, make_atari_env
from baselines.common.tf_util import save_state, load_state, get_session
from baselines import bench, logger
from importlib import import_module
from baselines.common.vec_env.vec_normalize import VecNormalize
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
from baselines.common import atari_wrappers, retro_wrappers
try:
from mpi4py import MPI
except ImportError:
MPI = None
_game_envs = defaultdict(set)
for env in gym.envs.registry.all():
# solve this with regexes
env_type = env._entry_point.split(':')[0].split('.')[-1]
_game_envs[env_type].add(env.id)
# reading benchmark names directly from retro requires
# importing retro here, and for some reason that crashes tensorflow
# in ubuntu
_game_envs['retro'] = set([
'BubbleBobble-Nes',
'SuperMarioBros-Nes',
'TwinBee3PokoPokoDaimaou-Nes',
'SpaceHarrier-Nes',
'SonicTheHedgehog-Genesis',
'Vectorman-Genesis',
'FinalFight-Snes',
'SpaceInvaders-Snes',
])
def train(args, extra_args):
env_type, env_id = get_env_type(args.env)
total_timesteps = int(args.num_timesteps)
seed = args.seed
learn = get_learn_function(args.alg)
alg_kwargs = get_learn_function_defaults(args.alg, env_type)
alg_kwargs.update(extra_args)
env = build_env(args)
if args.network:
alg_kwargs['network'] = args.network
else:
if alg_kwargs.get('network') is None:
alg_kwargs['network'] = get_default_network(env_type)
print('Training {} on {}:{} with arguments \n{}'.format(args.alg, env_type, env_id, alg_kwargs))
model = learn(
env=env,
seed=seed,
total_timesteps=total_timesteps,
**alg_kwargs
)
return model, env
def build_env(args, render=False):
ncpu = multiprocessing.cpu_count()
if sys.platform == 'darwin': ncpu //= 2
nenv = args.num_env or ncpu if not render else 1
alg = args.alg
rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
seed = args.seed
env_type, env_id = get_env_type(args.env)
if env_type == 'mujoco':
get_session(tf.ConfigProto(allow_soft_placement=True,
intra_op_parallelism_threads=1,
inter_op_parallelism_threads=1))
if args.num_env:
env = SubprocVecEnv([lambda: make_mujoco_env(env_id, seed + i if seed is not None else None, args.reward_scale) for i in range(args.num_env)])
else:
env = DummyVecEnv([lambda: make_mujoco_env(env_id, seed, args.reward_scale)])
env = VecNormalize(env)
elif env_type == 'atari':
if alg == 'acer':
env = make_atari_env(env_id, nenv, seed)
elif alg == 'deepq':
env = atari_wrappers.make_atari(env_id)
env.seed(seed)
env = bench.Monitor(env, logger.get_dir())
env = atari_wrappers.wrap_deepmind(env, frame_stack=True, scale=True)
elif alg == 'trpo_mpi':
env = atari_wrappers.make_atari(env_id)
env.seed(seed)
env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
env = atari_wrappers.wrap_deepmind(env)
# TODO check if the second seeding is necessary, and eventually remove
env.seed(seed)
else:
frame_stack_size = 4
env = VecFrameStack(make_atari_env(env_id, nenv, seed), frame_stack_size)
elif env_type == 'retro':
import retro
gamestate = args.gamestate or 'Level1-1'
env = retro_wrappers.make_retro(game=args.env, state=gamestate, max_episode_steps=10000, use_restricted_actions=retro.Actions.DISCRETE)
env.seed(args.seed)
env = bench.Monitor(env, logger.get_dir())
env = retro_wrappers.wrap_deepmind_retro(env)
elif env_type == 'classic':
def make_env():
e = gym.make(env_id)
e.seed(seed)
return e
env = DummyVecEnv([make_env])
return env
def get_env_type(env_id):
if env_id in _game_envs.keys():
env_type = env_id
env_id = [g for g in _game_envs[env_type]][0]
else:
env_type = None
for g, e in _game_envs.items():
if env_id in e:
env_type = g
break
assert env_type is not None, 'env_id {} is not recognized in env types'.format(env_id, _game_envs.keys())
return env_type, env_id
def get_default_network(env_type):
if env_type == 'mujoco' or env_type=='classic':
return 'mlp'
if env_type == 'atari':
return 'cnn'
raise ValueError('Unknown env_type {}'.format(env_type))
def get_alg_module(alg, submodule=None):
submodule = submodule or alg
try:
# first try to import the alg module from baselines
alg_module = import_module('.'.join(['baselines', alg, submodule]))
except ImportError:
# then from rl_algs
alg_module = import_module('.'.join(['rl_' + 'algs', alg, submodule]))
return alg_module
def get_learn_function(alg):
return get_alg_module(alg).learn
def get_learn_function_defaults(alg, env_type):
try:
alg_defaults = get_alg_module(alg, 'defaults')
kwargs = getattr(alg_defaults, env_type)()
except (ImportError, AttributeError):
kwargs = {}
return kwargs
def parse(v):
'''
convert value of a command-line arg to a python object if possible, otherwise keep it as a string
'''
assert isinstance(v, str)
try:
return eval(v)
except (NameError, SyntaxError):
return v
def main():
# configure logger, disable logging in child MPI processes (with rank > 0)
arg_parser = common_arg_parser()
args, unknown_args = arg_parser.parse_known_args()
extra_args = {k: parse(v) for k,v in parse_unknown_args(unknown_args).items()}
if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
rank = 0
logger.configure()
else:
logger.configure(format_strs = [])
rank = MPI.COMM_WORLD.Get_rank()
model, _ = train(args, extra_args)
if args.save_path is not None and rank == 0:
save_path = osp.expanduser(args.save_path)
model.save(save_path)
if args.play:
logger.log("Running trained model")
env = build_env(args, render=True)
obs = env.reset()
while True:
actions = model.step(obs)[0]
obs, _, done, _ = env.step(actions)
env.render()
if done:
obs = env.reset()
if __name__ == '__main__':
main()
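A small sketch (not part of the diff) of how the helpers above resolve an algorithm and its per-environment-type defaults, the same way train() does:
from baselines.run import get_env_type, get_learn_function, get_learn_function_defaults

env_type, env_id = get_env_type('PongNoFrameskip-v4')   # -> ('atari', 'PongNoFrameskip-v4')
learn = get_learn_function('ppo2')                      # baselines.ppo2.ppo2.learn
kwargs = get_learn_function_defaults('ppo2', env_type)  # atari defaults from ppo2/defaults.py
print(env_type, kwargs['nsteps'], kwargs['ent_coef'])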


@@ -0,0 +1,30 @@
from rl_common.models import mlp, cnn_small
def atari():
return dict(
network = cnn_small(),
timesteps_per_batch=512,
max_kl=0.001,
cg_iters=10,
cg_damping=1e-3,
gamma=0.98,
lam=1.0,
vf_iters=3,
vf_stepsize=1e-4,
entcoeff=0.00,
)
def mujoco():
return dict(
network = mlp(num_hidden=32, num_layers=2),
timesteps_per_batch=1024,
max_kl=0.01,
cg_iters=10,
cg_damping=0.1,
gamma=0.99,
lam=0.98,
vf_iters=5,
vf_stepsize=1e-3,
normalize_observations=True,
)


@@ -1,56 +0,0 @@
import baselines.common.tf_util as U
import tensorflow as tf
import gym
from baselines.common.distributions import make_pdtype


class CnnPolicy(object):
    recurrent = False

    def __init__(self, name, ob_space, ac_space):
        with tf.variable_scope(name):
            self._init(ob_space, ac_space)
            self.scope = tf.get_variable_scope().name

    def _init(self, ob_space, ac_space):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        obscaled = ob / 255.0

        with tf.variable_scope("pol"):
            x = obscaled
            x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(tf.layers.dense(x, 128, name='lin', kernel_initializer=U.normc_initializer(1.0)))
            logits = tf.layers.dense(x, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01))
            self.pd = pdtype.pdfromflat(logits)
        with tf.variable_scope("vf"):
            x = obscaled
            x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(tf.layers.dense(x, 128, name='lin', kernel_initializer=U.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(x, 1, name='value', kernel_initializer=U.normc_initializer(1.0))
            self.vpredz = self.vpred

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = self.pd.sample()
        self._act = U.function([stochastic, ob], [ac, self.vpred])

    def act(self, stochastic, ob):
        ac1, vpred1 = self._act(stochastic, ob[None])
        return ac1[0], vpred1[0]

    def get_variables(self):
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

    def get_trainable_variables(self):
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)

    def get_initial_state(self):
        return []
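
This hand-written two-tower policy (separate "pol" and "vf" convnets) is superseded by the generic policy builder used in the trpo_mpi diff below; a rough sketch of the equivalent construction, assuming the build_policy interface shown there and a registered small-CNN network like the cnn_small used in the new defaults:

# Rough equivalent via the new functional policy builder (interfaces as they appear
# later in this compare; value_network='copy' reproduces the non-shared value head).
import tensorflow as tf
from baselines.common.policies import build_policy

def make_nonsharing_cnn_policy(env, ob_placeholder, **network_kwargs):
    # 'cnn_small' is assumed to be a registered network name; pass the builder
    # function itself if the string is not registered in this revision.
    policy = build_policy(env, 'cnn_small', value_network='copy', **network_kwargs)
    with tf.variable_scope('pi'):
        return policy(observ_placeholder=ob_placeholder)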

View File

@@ -1,43 +0,0 @@
#!/usr/bin/env python3
from mpi4py import MPI
from baselines.common import set_global_seeds
import os.path as osp
import gym, logging
from baselines import logger
from baselines import bench
from baselines.common.atari_wrappers import make_atari, wrap_deepmind
from baselines.common.cmd_util import atari_arg_parser


def train(env_id, num_timesteps, seed):
    from baselines.trpo_mpi.nosharing_cnn_policy import CnnPolicy
    from baselines.trpo_mpi import trpo_mpi
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])

    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = make_atari(env_id)

    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        return CnnPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space)
    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    env = wrap_deepmind(env)
    env.seed(workerseed)

    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=512, max_kl=0.001, cg_iters=10, cg_damping=1e-3,
                   max_timesteps=int(num_timesteps * 1.1), gamma=0.98, lam=1.0, vf_iters=3, vf_stepsize=1e-4, entcoeff=0.00)
    env.close()


def main():
    args = atari_arg_parser().parse_args()
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)


if __name__ == "__main__":
    main()

View File

@@ -1,36 +0,0 @@
#!/usr/bin/env python3
# noinspection PyUnresolvedReferences
from mpi4py import MPI
from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser
from baselines import logger
from baselines.ppo1.mlp_policy import MlpPolicy
from baselines.trpo_mpi import trpo_mpi
def train(env_id, num_timesteps, seed):
import baselines.common.tf_util as U
sess = U.single_threaded_session()
sess.__enter__()
rank = MPI.COMM_WORLD.Get_rank()
if rank == 0:
logger.configure()
else:
logger.configure(format_strs=[])
logger.set_level(logger.DISABLED)
workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
def policy_fn(name, ob_space, ac_space):
return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
hid_size=32, num_hid_layers=2)
env = make_mujoco_env(env_id, workerseed)
trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1,
max_timesteps=num_timesteps, gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3)
env.close()
def main():
args = mujoco_arg_parser().parse_args()
train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
if __name__ == '__main__':
main()

View File

@@ -6,8 +6,11 @@ import time
 from baselines.common import colorize
 from mpi4py import MPI
 from collections import deque
+from baselines.common import set_global_seeds
 from baselines.common.mpi_adam import MpiAdam
 from baselines.common.cg import cg
+from baselines.common.input import observation_placeholder
+from baselines.common.policies import build_policy
 from contextlib import contextmanager
 
 def traj_segment_generator(pi, env, horizon, stochastic):
@@ -33,7 +36,7 @@ def traj_segment_generator(pi, env, horizon, stochastic):
     while True:
         prevac = ac
-        ac, vpred = pi.act(stochastic, ob)
+        ac, vpred, _, _ = pi.step(ob, stochastic=stochastic)
         # Slight weirdness here because we need value function at time T
         # before returning segment [0, T-1] so we get the correct
         # terminal value
@@ -41,7 +44,7 @@ def traj_segment_generator(pi, env, horizon, stochastic):
             yield {"ob" : obs, "rew" : rews, "vpred" : vpreds, "new" : news,
                    "ac" : acs, "prevac" : prevacs, "nextvpred": vpred * (1 - new),
                    "ep_rets" : ep_rets, "ep_lens" : ep_lens}
-            _, vpred = pi.act(stochastic, ob)
+            _, vpred, _, _ = pi.step(ob, stochastic=stochastic)
             # Be careful!!! if you change the downstream algorithm to aggregate
             # several of these batches, then be sure to do a deepcopy
             ep_rets = []
@@ -79,30 +82,100 @@ def add_vtarg_and_adv(seg, gamma, lam):
         gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
     seg["tdlamret"] = seg["adv"] + seg["vpred"]
 
-def learn(env, policy_fn, *,
-        timesteps_per_batch, # what to train on
-        max_kl, cg_iters,
-        gamma, lam, # advantage estimation
+def learn(*,
+        network,
+        env,
+        total_timesteps,
+        timesteps_per_batch=1024, # what to train on
+        max_kl=0.001,
+        cg_iters=10,
+        gamma=0.99,
+        lam=1.0, # advantage estimation
+        seed=None,
         entcoeff=0.0,
         cg_damping=1e-2,
         vf_stepsize=3e-4,
         vf_iters =3,
-        max_timesteps=0, max_episodes=0, max_iters=0,  # time constraint
-        callback=None
+        max_episodes=0, max_iters=0,  # time constraint
+        callback=None,
+        load_path=None,
+        **network_kwargs
        ):
+    '''
+    learn a policy function with TRPO algorithm
+
+    Parameters:
+    ----------
+
+    network                 neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types)
+                            or function that takes input placeholder and returns tuple (output, None) for feedforward nets
+                            or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets
+
+    env                     environment (one of the gym environments or wrapped via a baselines.common.vec_env.VecEnv-type class)
+
+    timesteps_per_batch     timesteps per gradient estimation batch
+
+    max_kl                  max KL divergence between old policy and new policy ( KL(pi_old || pi) )
+
+    entcoeff                coefficient of policy entropy term in the optimization objective
+
+    cg_iters                number of iterations of conjugate gradient algorithm
+
+    cg_damping              conjugate gradient damping
+
+    vf_stepsize             learning rate for adam optimizer used to optimize value function loss
+
+    vf_iters                number of value function optimization iterations per policy optimization step
+
+    total_timesteps         max number of timesteps
+
+    max_episodes            max number of episodes
+
+    max_iters               maximum number of policy optimization iterations
+
+    callback                function to be called with (locals(), globals()) each policy optimization step
+
+    load_path               str, path to load the model from (default: None, i.e. no model is loaded)
+
+    **network_kwargs        keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
+
+    Returns:
+    -------
+
+    learnt model
+    '''
+
     nworkers = MPI.COMM_WORLD.Get_size()
     rank = MPI.COMM_WORLD.Get_rank()
+
+    cpus_per_worker = 1
+    U.get_session(config=tf.ConfigProto(
+            allow_soft_placement=True,
+            inter_op_parallelism_threads=cpus_per_worker,
+            intra_op_parallelism_threads=cpus_per_worker
+    ))
+
+    policy = build_policy(env, network, value_network='copy', **network_kwargs)
+    set_global_seeds(seed)
+
     np.set_printoptions(precision=3)
     # Setup losses and stuff
     # ----------------------------------------
     ob_space = env.observation_space
     ac_space = env.action_space
-    pi = policy_fn("pi", ob_space, ac_space)
-    oldpi = policy_fn("oldpi", ob_space, ac_space)
+
+    ob = observation_placeholder(ob_space)
+    with tf.variable_scope("pi"):
+        pi = policy(observ_placeholder=ob)
+    with tf.variable_scope("oldpi"):
+        oldpi = policy(observ_placeholder=ob)
+
     atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
     ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
 
-    ob = U.get_placeholder_cached(name="ob")
     ac = pi.pdtype.sample_placeholder([None])
 
     kloldnew = oldpi.pd.kl(pi.pd)
@@ -111,7 +184,7 @@ def learn(env, policy_fn, *,
     meanent = tf.reduce_mean(ent)
     entbonus = entcoeff * meanent
 
-    vferr = tf.reduce_mean(tf.square(pi.vpred - ret))
+    vferr = tf.reduce_mean(tf.square(pi.vf - ret))
 
     ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # advantage * pnew / pold
     surrgain = tf.reduce_mean(ratio * atarg)
@@ -122,9 +195,12 @@ def learn(env, policy_fn, *,
     dist = meankl
 
-    all_var_list = pi.get_trainable_variables()
-    var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
-    vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
+    all_var_list = get_trainable_variables("pi")
+    # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
+    # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
+    var_list = get_pi_trainable_variables("pi")
+    vf_var_list = get_vf_trainable_variables("pi")
+
     vfadam = MpiAdam(vf_var_list)
 
     get_flat = U.GetFlat(var_list)
@@ -142,7 +218,8 @@ def learn(env, policy_fn, *,
     fvp = U.flatgrad(gvp, var_list)
 
     assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv)
-        for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
+        for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi"))])
+
     compute_losses = U.function([ob, ac, atarg], losses)
     compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)])
     compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
@@ -166,6 +243,9 @@ def learn(env, policy_fn, *,
         return out
 
     U.initialize()
+    if load_path is not None:
+        pi.load(load_path)
+
     th_init = get_flat()
     MPI.COMM_WORLD.Bcast(th_init, root=0)
     set_from_flat(th_init)
@@ -183,11 +263,16 @@ def learn(env, policy_fn, *,
     lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
     rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
 
-    assert sum([max_iters>0, max_timesteps>0, max_episodes>0])==1
+    if sum([max_iters>0, total_timesteps>0, max_episodes>0])==0:
+        # nothing to be done
+        return pi
+
+    assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \
+        'out of max_iters, total_timesteps, and max_episodes only one should be specified'
 
     while True:
         if callback: callback(locals(), globals())
-        if max_timesteps and timesteps_so_far >= max_timesteps:
+        if total_timesteps and timesteps_so_far >= total_timesteps:
             break
         elif max_episodes and episodes_so_far >= max_episodes:
             break
@@ -287,5 +372,20 @@ def learn(env, policy_fn, *,
         if rank==0:
             logger.dump_tabular()
 
+    return pi
+
 def flatten_lists(listoflists):
     return [el for list_ in listoflists for el in list_]
+
+def get_variables(scope):
+    return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope)
+
+def get_trainable_variables(scope):
+    return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
+
+def get_vf_trainable_variables(scope):
+    return [v for v in get_trainable_variables(scope) if 'vf' in v.name[len(scope):].split('/')]
+
+def get_pi_trainable_variables(scope):
+    return [v for v in get_trainable_variables(scope) if 'pi' in v.name[len(scope):].split('/')]
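
With the defaults above and the refactored signature, training can be driven either through the unified runner (python -m baselines.run --alg=trpo_mpi --env=...) or directly from Python; a minimal sketch of the direct call, mirroring the mujoco() defaults shown earlier (the environment id and timestep budget are illustrative):

# Sketch: calling the refactored learn() directly (illustrative values; the runner
# remains the supported entry point, and MPI/mpi4py is still required).
from baselines.common.cmd_util import make_mujoco_env
from baselines.trpo_mpi import trpo_mpi

env = make_mujoco_env('Hopper-v2', 0)
pi = trpo_mpi.learn(network='mlp', env=env, total_timesteps=int(1e6),
                    timesteps_per_batch=1024, max_kl=0.01, cg_iters=10,
                    cg_damping=0.1, gamma=0.99, lam=0.98,
                    vf_iters=5, vf_stepsize=1e-3)
env.close()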

19
conftest.py Normal file
View File

@@ -0,0 +1,19 @@
import pytest


def pytest_addoption(parser):
    parser.addoption('--runslow', action='store_true', default=False, help='run slow tests')


def pytest_collection_modifyitems(config, items):
    if config.getoption('--runslow'):
        # --runslow given in cli: do not skip slow tests
        return
    skip_slow = pytest.mark.skip(reason='need --runslow option to run')
    slow_tests = []
    for item in items:
        if 'slow' in item.keywords:
            slow_tests.append(item.name)
            item.add_marker(skip_slow)

    print('skipping slow tests', ' '.join(slow_tests), 'use --runslow to run this')
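
With this hook in place, a test opts into the gate simply by carrying the slow marker; an illustrative test file (name and body are made up):

# test_example.py (illustrative): skipped under plain `pytest`,
# collected and run under `pytest --runslow`.
import pytest

@pytest.mark.slow
def test_expensive_end_to_end():
    assert sum(range(10)) == 45   # stand-in for a long-running training check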

View File

@@ -14,7 +14,6 @@ setup(name='baselines',
         'scipy',
         'tqdm',
         'joblib',
-        'zmq',
         'dill',
         'progressbar2',
         'mpi4py',
@@ -23,6 +22,12 @@ setup(name='baselines',
         'click',
         'opencv-python'
     ],
+    extras_require={
+        'test': [
+            'filelock',
+            'pytest'
+        ]
+    },
     description='OpenAI baselines: high quality implementations of reinforcement learning algorithms',
     author='OpenAI',
     url='https://github.com/openai/baselines',
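
A usage note on the new extra: the test-only dependencies declared here are pulled in with pip's extras syntax, e.g. pip install -e .[test] from a checkout, while a plain pip install -e . leaves filelock and pytest out.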