Compare commits

peterz_mpi ... peterz_pr_ (21 commits):

f05b716b03, 5cd66010dc, 0a13da8dfe, 18b6390be6, 52255beda5, d80acbb4d1, 556b198454,
cc88804042, c14d307834, 0b71d4c6c4, 7bb405c7a7, 8b95576a92, 9d4fb76ef0, 664ec6faf0,
3917321fbe, 6e607efa90, c74ce02b9d, ab59de6922, a071fa7630, 637bf55da7, 165c622572
@@ -11,4 +11,4 @@ install:
 
 script:
   - flake8 . --show-source --statistics
-  - docker run baselines-test pytest -v .
+  - docker run baselines-test pytest -v --forked .
Dockerfile (15 changed lines)

@@ -1,16 +1,9 @@
-FROM ubuntu:16.04
+FROM python:3.6
 
+RUN apt-get -y update && apt-get -y install ffmpeg
+# RUN apt-get -y update && apt-get -y install git wget python-dev python3-dev libopenmpi-dev python-pip zlib1g-dev cmake python-opencv
 
-RUN apt-get -y update && apt-get -y install git wget python-dev python3-dev libopenmpi-dev python-pip zlib1g-dev cmake python-opencv
 ENV CODE_DIR /root/code
-ENV VENV /root/venv
-
-RUN \
-    pip install virtualenv && \
-    virtualenv $VENV --python=python3 && \
-    . $VENV/bin/activate && \
-    pip install --upgrade pip
-
-ENV PATH=$VENV/bin:$PATH
 
 COPY . $CODE_DIR/baselines
 WORKDIR $CODE_DIR/baselines
README.md (12 changed lines)

@@ -109,17 +109,9 @@ python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num_timesteps=0 --
 
 *NOTE:* At the moment Mujoco training uses VecNormalize wrapper for the environment which is not being saved correctly; so loading the models trained on Mujoco will not work well if the environment is recreated. If necessary, you can work around that by replacing RunningMeanStd by TfRunningMeanStd in [baselines/common/vec_env/vec_normalize.py](baselines/common/vec_env/vec_normalize.py#L12). This way, mean and std of environment normalizing wrapper will be saved in tensorflow variables and included in the model file; however, training is slower that way - hence not including it by default
 
+## Loading and vizualizing learning curves and other training metrics
+See [here](docs/viz/viz.ipynb) for instructions on how to load and display the training data.
 
-## Using baselines with TensorBoard
-Baselines logger can save data in the TensorBoard format. To do so, set environment variables `OPENAI_LOG_FORMAT` and `OPENAI_LOGDIR`:
-```bash
-export OPENAI_LOG_FORMAT='stdout,log,csv,tensorboard' # formats are comma-separated, but for tensorboard you only really need the last one
-export OPENAI_LOGDIR=path/to/tensorboard/data
-```
-And you can now start TensorBoard with:
-```bash
-tensorboard --logdir=$OPENAI_LOGDIR
-```
 ## Subpackages
 
 - [A2C](baselines/a2c)
@@ -129,18 +129,26 @@ class ClipRewardEnv(gym.RewardWrapper):
         return np.sign(reward)
 
 class WarpFrame(gym.ObservationWrapper):
-    def __init__(self, env):
+    def __init__(self, env, width=84, height=84, grayscale=True):
         """Warp frames to 84x84 as done in the Nature paper and later work."""
         gym.ObservationWrapper.__init__(self, env)
-        self.width = 84
-        self.height = 84
-        self.observation_space = spaces.Box(low=0, high=255,
-            shape=(self.height, self.width, 1), dtype=np.uint8)
+        self.width = width
+        self.height = height
+        self.grayscale = grayscale
+        if self.grayscale:
+            self.observation_space = spaces.Box(low=0, high=255,
+                shape=(self.height, self.width, 1), dtype=np.uint8)
+        else:
+            self.observation_space = spaces.Box(low=0, high=255,
+                shape=(self.height, self.width, 3), dtype=np.uint8)
 
     def observation(self, frame):
-        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
+        if self.grayscale:
+            frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
         frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA)
-        return frame[:, :, None]
+        if self.grayscale:
+            frame = np.expand_dims(frame, -1)
+        return frame
 
 class FrameStack(gym.Wrapper):
     def __init__(self, env, k):
@@ -156,7 +164,7 @@ class FrameStack(gym.Wrapper):
         self.k = k
         self.frames = deque([], maxlen=k)
         shp = env.observation_space.shape
-        self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=env.observation_space.dtype)
+        self.observation_space = spaces.Box(low=0, high=255, shape=(shp[:-1] + (shp[-1] * k,)), dtype=env.observation_space.dtype)
 
     def reset(self):
         ob = self.env.reset()
@@ -197,7 +205,7 @@ class LazyFrames(object):
 
     def _force(self):
         if self._out is None:
-            self._out = np.concatenate(self._frames, axis=2)
+            self._out = np.concatenate(self._frames, axis=-1)
             self._frames = None
         return self._out
 
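A minimal usage sketch of the changed wrappers (illustrative, not part of the diff; assumes gym with Atari support and opencv-python are installed):

```python
import gym
from baselines.common.atari_wrappers import WarpFrame, FrameStack

env = gym.make('BreakoutNoFrameskip-v4')
env = WarpFrame(env, width=96, height=96, grayscale=False)  # keep color: observations are (96, 96, 3) uint8
env = FrameStack(env, 4)                                    # the shape fix above stacks along the last axis
print(env.observation_space.shape)                          # (96, 96, 12)
```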
@@ -60,12 +60,14 @@ def make_env(env_id, env_type, subrank=0, seed=None, reward_scale=1.0, gamestate
                       allow_early_resets=True)
 
     if env_type == 'atari':
-        return wrap_deepmind(env, **wrapper_kwargs)
-    elif reward_scale != 1:
-        return retro_wrappers.RewardScaler(env, reward_scale)
-    else:
-        return env
+        env = wrap_deepmind(env, **wrapper_kwargs)
+    elif env_type == 'retro':
+        env = retro_wrappers.wrap_deepmind_retro(env, **wrapper_kwargs)
+
+    if reward_scale != 1:
+        env = retro_wrappers.RewardScaler(env, reward_scale)
+
+    return env
 
 
 def make_mujoco_env(env_id, seed, reward_scale=1.0):
@@ -129,6 +131,8 @@ def common_arg_parser():
     parser.add_argument('--num_env', help='Number of environment copies being run in parallel. When not specified, set to number of cpus for Atari, and to 1 for Mujoco', default=None, type=int)
     parser.add_argument('--reward_scale', help='Reward scale factor. Default: 1.0', default=1.0, type=float)
     parser.add_argument('--save_path', help='Path to save trained model to', default=None, type=str)
+    parser.add_argument('--save_video_interval', help='Save video every x steps (0 = disabled)', default=0, type=int)
+    parser.add_argument('--save_video_length', help='Length of recorded video. Default: 200', default=200, type=int)
     parser.add_argument('--play', default=False, action='store_true')
     return parser
 
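The two new flags pair with the VecVideoRecorder wrapper added later in this diff. A hedged sketch of how they might be wired up (the helper name and output directory are illustrative, not taken from the diff):

```python
from baselines.common.vec_env.vec_video_recorder import VecVideoRecorder

def maybe_record_video(venv, args, video_dir='videos'):
    # args comes from common_arg_parser(); the default of 0 keeps recording disabled
    if args.save_video_interval != 0:
        venv = VecVideoRecorder(
            venv, video_dir,
            record_video_trigger=lambda step: step % args.save_video_interval == 0,
            video_length=args.save_video_length)
    return venv
```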
@@ -62,7 +62,7 @@ class CategoricalPdType(PdType):
     def pdclass(self):
         return CategoricalPd
     def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
-        pdparam = fc(latent_vector, 'pi', self.ncat, init_scale=init_scale, init_bias=init_bias)
+        pdparam = _matching_fc(latent_vector, 'pi', self.ncat, init_scale=init_scale, init_bias=init_bias)
         return self.pdfromflat(pdparam), pdparam
 
     def param_shape(self):
@@ -82,7 +82,7 @@ class MultiCategoricalPdType(PdType):
         return MultiCategoricalPd(self.ncats, flat)
 
     def pdfromlatent(self, latent, init_scale=1.0, init_bias=0.0):
-        pdparam = fc(latent, 'pi', self.ncats.sum(), init_scale=init_scale, init_bias=init_bias)
+        pdparam = _matching_fc(latent, 'pi', self.ncats.sum(), init_scale=init_scale, init_bias=init_bias)
         return self.pdfromflat(pdparam), pdparam
 
     def param_shape(self):
@@ -99,7 +99,7 @@ class DiagGaussianPdType(PdType):
         return DiagGaussianPd
 
     def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
-        mean = fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias)
+        mean = _matching_fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias)
         logstd = tf.get_variable(name='pi/logstd', shape=[1, self.size], initializer=tf.zeros_initializer())
         pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
         return self.pdfromflat(pdparam), mean
@@ -123,7 +123,7 @@ class BernoulliPdType(PdType):
     def sample_dtype(self):
         return tf.int32
     def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
-        pdparam = fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias)
+        pdparam = _matching_fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias)
         return self.pdfromflat(pdparam), pdparam
 
 # WRONG SECOND DERIVATIVES
@@ -345,3 +345,9 @@ def validate_probtype(probtype, pdparam):
     assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr # within 3 sigmas
     print('ok on', probtype, pdparam)
 
+
+def _matching_fc(tensor, name, size, init_scale, init_bias):
+    if tensor.shape[-1] == size:
+        return tensor
+    else:
+        return fc(tensor, name, size, init_scale=init_scale, init_bias=init_bias)
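The effect of `_matching_fc`: a policy latent whose last dimension already matches the required parameter size is passed through unchanged instead of being projected through an extra 'pi' layer. A small TF1-style illustration (the width 6 is arbitrary):

```python
import tensorflow as tf
from baselines.common.distributions import CategoricalPdType

latent = tf.placeholder(tf.float32, [None, 6])            # width equals the number of actions
pd, pdparam = CategoricalPdType(6).pdfromlatent(latent)   # pdparam is the latent itself; no new fc layer is created
```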
@@ -1,7 +1,11 @@
-from mpi4py import MPI
 import baselines.common.tf_util as U
 import tensorflow as tf
 import numpy as np
+try:
+    from mpi4py import MPI
+except ImportError:
+    MPI = None
+
 
 class MpiAdam(object):
     def __init__(self, var_list, *, beta1=0.9, beta2=0.999, epsilon=1e-08, scale_grad_by_procs=True, comm=None):
@@ -16,16 +20,19 @@ class MpiAdam(object):
         self.t = 0
         self.setfromflat = U.SetFromFlat(var_list)
         self.getflat = U.GetFlat(var_list)
-        self.comm = MPI.COMM_WORLD if comm is None else comm
+        self.comm = MPI.COMM_WORLD if comm is None and MPI is not None else comm
 
     def update(self, localg, stepsize):
         if self.t % 100 == 0:
             self.check_synced()
         localg = localg.astype('float32')
-        globalg = np.zeros_like(localg)
-        self.comm.Allreduce(localg, globalg, op=MPI.SUM)
-        if self.scale_grad_by_procs:
-            globalg /= self.comm.Get_size()
+        if self.comm is not None:
+            globalg = np.zeros_like(localg)
+            self.comm.Allreduce(localg, globalg, op=MPI.SUM)
+            if self.scale_grad_by_procs:
+                globalg /= self.comm.Get_size()
+        else:
+            globalg = np.copy(localg)
 
         self.t += 1
         a = stepsize * np.sqrt(1 - self.beta2**self.t)/(1 - self.beta1**self.t)
@@ -35,11 +42,15 @@ class MpiAdam(object):
         self.setfromflat(self.getflat() + step)
 
     def sync(self):
+        if self.comm is None:
+            return
         theta = self.getflat()
         self.comm.Bcast(theta, root=0)
         self.setfromflat(theta)
 
     def check_synced(self):
+        if self.comm is None:
+            return
         if self.comm.Get_rank() == 0: # this is root
             theta = self.getflat()
             self.comm.Bcast(theta, root=0)
@@ -63,17 +74,30 @@ def test_MpiAdam():
     do_update = U.function([], loss, updates=[update_op])
 
     tf.get_default_session().run(tf.global_variables_initializer())
+    losslist_ref = []
     for i in range(10):
-        print(i,do_update())
+        l = do_update()
+        print(i, l)
+        losslist_ref.append(l)
 
     tf.set_random_seed(0)
     tf.get_default_session().run(tf.global_variables_initializer())
 
     var_list = [a,b]
-    lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)], updates=[update_op])
+    lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)])
     adam = MpiAdam(var_list)
 
+    losslist_test = []
     for i in range(10):
         l,g = lossandgrad()
         adam.update(g, stepsize)
         print(i,l)
+        losslist_test.append(l)
+
+    np.testing.assert_allclose(np.array(losslist_ref), np.array(losslist_test), atol=1e-4)
+
+
+if __name__ == '__main__':
+    test_MpiAdam()
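A single-process sketch of what the fallback enables (illustrative, not part of the diff; assumes a TF1-era baselines checkout without mpi4py installed):

```python
import numpy as np
import tensorflow as tf
import baselines.common.tf_util as U
from baselines.common.mpi_adam import MpiAdam

with U.single_threaded_session():
    x = tf.Variable(np.zeros(5, dtype=np.float32))
    loss = tf.reduce_sum(tf.square(x - 1.0))
    lossandgrad = U.function([], [loss, U.flatgrad(loss, [x])])
    adam = MpiAdam([x])          # comm stays None when MPI is unavailable
    U.initialize()
    for _ in range(200):
        l, g = lossandgrad()
        adam.update(g, 1e-1)     # uses the local gradient directly (np.copy branch)
```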
@@ -1,4 +1,8 @@
-from mpi4py import MPI
+try:
+    from mpi4py import MPI
+except ImportError:
+    MPI = None
+
 import tensorflow as tf, baselines.common.tf_util as U, numpy as np
 
 class RunningMeanStd(object):
@@ -39,7 +43,8 @@ class RunningMeanStd(object):
         n = int(np.prod(self.shape))
         totalvec = np.zeros(n*2+1, 'float64')
         addvec = np.concatenate([x.sum(axis=0).ravel(), np.square(x).sum(axis=0).ravel(), np.array([len(x)],dtype='float64')])
-        MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM)
+        if MPI is not None:
+            MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM)
         self.incfiltparams(totalvec[0:n].reshape(self.shape), totalvec[n:2*n].reshape(self.shape), totalvec[2*n])
 
     @U.in_session
baselines/common/plot_util.py (new file, 401 lines)

@@ -0,0 +1,401 @@
import matplotlib.pyplot as plt
import os.path as osp
import json
import os
import numpy as np
import pandas
from collections import defaultdict, namedtuple
from baselines.bench import monitor
from baselines.logger import read_json, read_csv

def smooth(y, radius, mode='two_sided', valid_only=False):
    '''
    Smooth signal y, where radius is determines the size of the window

    mode='twosided':
        average over the window [max(index - radius, 0), min(index + radius, len(y)-1)]
    mode='causal':
        average over the window [max(index - radius, 0), index]

    valid_only: put nan in entries where the full-sized window is not available

    '''
    assert mode in ('two_sided', 'causal')
    if len(y) < 2*radius+1:
        return np.ones_like(y) * y.mean()
    elif mode == 'two_sided':
        convkernel = np.ones(2 * radius+1)
        out = np.convolve(y, convkernel,mode='same') / np.convolve(np.ones_like(y), convkernel, mode='same')
        if valid_only:
            out[:radius] = out[-radius:] = np.nan
    elif mode == 'causal':
        convkernel = np.ones(radius)
        out = np.convolve(y, convkernel,mode='full') / np.convolve(np.ones_like(y), convkernel, mode='full')
        out = out[:-radius+1]
        if valid_only:
            out[:radius] = np.nan
    return out

def one_sided_ema(xolds, yolds, low=None, high=None, n=512, decay_steps=1., low_counts_threshold=1e-8):
    '''
    perform one-sided (causal) EMA (exponential moving average)
    smoothing and resampling to an even grid with n points.
    Does not do extrapolation, so we assume
    xolds[0] <= low && high <= xolds[-1]

    Arguments:

    xolds: array or list - x values of data. Needs to be sorted in ascending order
    yolds: array of list - y values of data. Has to have the same length as xolds

    low: float - min value of the new x grid. By default equals to xolds[0]
    high: float - max value of the new x grid. By default equals to xolds[-1]

    n: int - number of points in new x grid

    decay_steps: float - EMA decay factor, expressed in new x grid steps.

    low_counts_threshold: float or int
        - y values with counts less than this value will be set to NaN

    Returns:
        tuple sum_ys, count_ys where
            xs - array with new x grid
            ys - array of EMA of y at each point of the new x grid
            count_ys - array of EMA of y counts at each point of the new x grid

    '''

    low = xolds[0] if low is None else low
    high = xolds[-1] if high is None else high

    assert xolds[0] <= low, 'low = {} < xolds[0] = {} - extrapolation not permitted!'.format(low, xolds[0])
    assert xolds[-1] >= high, 'high = {} > xolds[-1] = {} - extrapolation not permitted!'.format(high, xolds[-1])
    assert len(xolds) == len(yolds), 'length of xolds ({}) and yolds ({}) do not match!'.format(len(xolds), len(yolds))


    xolds = xolds.astype('float64')
    yolds = yolds.astype('float64')

    luoi = 0 # last unused old index
    sum_y = 0.
    count_y = 0.
    xnews = np.linspace(low, high, n)
    decay_period = (high - low) / (n - 1) * decay_steps
    interstep_decay = np.exp(- 1. / decay_steps)
    sum_ys = np.zeros_like(xnews)
    count_ys = np.zeros_like(xnews)
    for i in range(n):
        xnew = xnews[i]
        sum_y *= interstep_decay
        count_y *= interstep_decay
        while True:
            xold = xolds[luoi]
            if xold <= xnew:
                decay = np.exp(- (xnew - xold) / decay_period)
                sum_y += decay * yolds[luoi]
                count_y += decay
                luoi += 1
            else:
                break
            if luoi >= len(xolds):
                break
        sum_ys[i] = sum_y
        count_ys[i] = count_y

    ys = sum_ys / count_ys
    ys[count_ys < low_counts_threshold] = np.nan

    return xnews, ys, count_ys

def symmetric_ema(xolds, yolds, low=None, high=None, n=512, decay_steps=1., low_counts_threshold=1e-8):
    '''
    perform symmetric EMA (exponential moving average)
    smoothing and resampling to an even grid with n points.
    Does not do extrapolation, so we assume
    xolds[0] <= low && high <= xolds[-1]

    Arguments:

    xolds: array or list - x values of data. Needs to be sorted in ascending order
    yolds: array of list - y values of data. Has to have the same length as xolds

    low: float - min value of the new x grid. By default equals to xolds[0]
    high: float - max value of the new x grid. By default equals to xolds[-1]

    n: int - number of points in new x grid

    decay_steps: float - EMA decay factor, expressed in new x grid steps.

    low_counts_threshold: float or int
        - y values with counts less than this value will be set to NaN

    Returns:
        tuple sum_ys, count_ys where
            xs - array with new x grid
            ys - array of EMA of y at each point of the new x grid
            count_ys - array of EMA of y counts at each point of the new x grid

    '''
    xs, ys1, count_ys1 = one_sided_ema(xolds, yolds, low, high, n, decay_steps, low_counts_threshold=0)
    _, ys2, count_ys2 = one_sided_ema(-xolds[::-1], yolds[::-1], -high, -low, n, decay_steps, low_counts_threshold=0)
    ys2 = ys2[::-1]
    count_ys2 = count_ys2[::-1]
    count_ys = count_ys1 + count_ys2
    ys = (ys1 * count_ys1 + ys2 * count_ys2) / count_ys
    ys[count_ys < low_counts_threshold] = np.nan
    return xs, ys, count_ys

Result = namedtuple('Result', 'monitor progress dirname metadata')
Result.__new__.__defaults__ = (None,) * len(Result._fields)

def load_results(root_dir_or_dirs, enable_progress=True, enable_monitor=True, verbose=False):
    '''
    load summaries of runs from a list of directories (including subdirectories)
    Arguments:

    enable_progress: bool - if True, will attempt to load data from progress.csv files (data saved by logger). Default: True

    enable_monitor: bool - if True, will attempt to load data from monitor.csv files (data saved by Monitor environment wrapper). Default: True

    verbose: bool - if True, will print out list of directories from which the data is loaded. Default: False


    Returns:
    List of Result objects with the following fields:
         - dirname - path to the directory data was loaded from
         - metadata - run metadata (such as command-line arguments and anything else in metadata.json file
         - monitor - if enable_monitor is True, this field contains pandas dataframe with loaded monitor.csv file (or aggregate of all *.monitor.csv files in the directory)
         - progress - if enable_progress is True, this field contains pandas dataframe with loaded progress.csv file
    '''
    if isinstance(root_dir_or_dirs, str):
        rootdirs = [osp.expanduser(root_dir_or_dirs)]
    else:
        rootdirs = [osp.expanduser(d) for d in root_dir_or_dirs]
    allresults = []
    for rootdir in rootdirs:
        assert osp.exists(rootdir), "%s doesn't exist"%rootdir
        for dirname, dirs, files in os.walk(rootdir):
            if '-proc' in dirname:
                files[:] = []
                continue
            if set(['metadata.json', 'monitor.json', 'monitor.csv', 'progress.json', 'progress.csv']).intersection(files):
                # used to be uncommented, which means do not go deeper than current directory if any of the data files
                # are found
                # dirs[:] = []
                result = {'dirname' : dirname}
                if "metadata.json" in files:
                    with open(osp.join(dirname, "metadata.json"), "r") as fh:
                        result['metadata'] = json.load(fh)
                progjson = osp.join(dirname, "progress.json")
                progcsv = osp.join(dirname, "progress.csv")
                if enable_progress:
                    if osp.exists(progjson):
                        result['progress'] = pandas.DataFrame(read_json(progjson))
                    elif osp.exists(progcsv):
                        try:
                            result['progress'] = read_csv(progcsv)
                        except pandas.errors.EmptyDataError:
                            print('skipping progress file in ', dirname, 'empty data')
                    else:
                        if verbose: print('skipping %s: no progress file'%dirname)

                if enable_monitor:
                    try:
                        result['monitor'] = pandas.DataFrame(monitor.load_results(dirname))
                    except monitor.LoadMonitorResultsError:
                        print('skipping %s: no monitor files'%dirname)
                    except Exception as e:
                        print('exception loading monitor file in %s: %s'%(dirname, e))

                if result.get('monitor') is not None or result.get('progress') is not None:
                    allresults.append(Result(**result))
                    if verbose:
                        print('successfully loaded %s'%dirname)

    if verbose: print('loaded %i results'%len(allresults))
    return allresults

COLORS = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink',
        'brown', 'orange', 'teal', 'lightblue', 'lime', 'lavender', 'turquoise',
        'darkgreen', 'tan', 'salmon', 'gold', 'darkred', 'darkblue']


def default_xy_fn(r):
    x = np.cumsum(r.monitor.l)
    y = smooth(r.monitor.r, radius=10)
    return x,y

def default_split_fn(r):
    import re
    # match name between slash and -<digits> at the end of the string
    # (slash in the beginning or -<digits> in the end or either may be missing)
    match = re.search(r'[^/-]+(?=(-\d+)?\Z)', r.dirname)
    if match:
        return match.group(0)

def plot_results(
    allresults, *,
    xy_fn=default_xy_fn,
    split_fn=default_split_fn,
    group_fn=default_split_fn,
    average_group=False,
    shaded_std=True,
    shaded_err=True,
    figsize=None,
    legend_outside=False,
    resample=0,
    smooth_step=1.0,
):
    '''
    Plot multiple Results objects

    xy_fn: function Result -> x,y - function that converts results objects into tuple of x and y values.
        By default, x is cumsum of episode lengths, and y is episode rewards

    split_fn: function Result -> hashable - function that converts results objects into keys to split curves into sub-panels by.
        That is, the results r for which split_fn(r) is different will be put on different sub-panels.
        By default, the portion of r.dirname between last / and -<digits> is returned. The sub-panels are
        stacked vertically in the figure.

    group_fn: function Result -> hashable - function that converts results objects into keys to group curves by.
        That is, the results r for which group_fn(r) is the same will be put into the same group.
        Curves in the same group have the same color (if average_group is False), or averaged over
        (if average_group is True). The default value is the same as default value for split_fn

    average_group: bool - if True, will average the curves in the same group and plot the mean. Enables resampling
        (if resample = 0, will use 512 steps)

    shaded_std: bool - if True (default), the shaded region corresponding to standard deviation of the group of curves will be
        shown (only applicable if average_group = True)

    shaded_err: bool - if True (default), the shaded region corresponding to error in mean estimate of the group of curves
        (that is, standard deviation divided by square root of number of curves) will be
        shown (only applicable if average_group = True)

    figsize: tuple or None - size of the resulting figure (including sub-panels). By default, width is 6 and height is 6 times number of
        sub-panels.


    legend_outside: bool - if True, will place the legend outside of the sub-panels.

    resample: int - if not zero, size of the uniform grid in x direction to resample onto. Resampling is performed via symmetric
        EMA smoothing (see the docstring for symmetric_ema).
        Default is zero (no resampling). Note that if average_group is True, resampling is necessary; in that case, default
        value is 512.

    smooth_step: float - when resampling (i.e. when resample > 0 or average_group is True), use this EMA decay parameter (in units of the new grid step).
        See docstrings for decay_steps in symmetric_ema or one_sided_ema functions.

    '''

    if split_fn is None: split_fn = lambda _ : ''
    if group_fn is None: group_fn = lambda _ : ''
    sk2r = defaultdict(list) # splitkey2results
    for result in allresults:
        splitkey = split_fn(result)
        sk2r[splitkey].append(result)
    assert len(sk2r) > 0
    assert isinstance(resample, int), "0: don't resample. <integer>: that many samples"
    nrows = len(sk2r)
    ncols = 1
    figsize = figsize or (6, 6 * nrows)
    f, axarr = plt.subplots(nrows, ncols, sharex=False, squeeze=False, figsize=figsize)

    groups = list(set(group_fn(result) for result in allresults))

    default_samples = 512
    if average_group:
        resample = resample or default_samples

    for (isplit, sk) in enumerate(sorted(sk2r.keys())):
        g2l = {}
        g2c = defaultdict(int)
        sresults = sk2r[sk]
        gresults = defaultdict(list)
        ax = axarr[isplit][0]
        for result in sresults:
            group = group_fn(result)
            g2c[group] += 1
            x, y = xy_fn(result)
            if x is None: x = np.arange(len(y))
            x, y = map(np.asarray, (x, y))
            if average_group:
                gresults[group].append((x,y))
            else:
                if resample:
                    x, y, counts = symmetric_ema(x, y, x[0], x[-1], resample, decay_steps=smooth_step)
                l, = ax.plot(x, y, color=COLORS[groups.index(group) % len(COLORS)])
                g2l[group] = l
        if average_group:
            for group in sorted(groups):
                xys = gresults[group]
                if not any(xys):
                    continue
                color = COLORS[groups.index(group) % len(COLORS)]
                origxs = [xy[0] for xy in xys]
                minxlen = min(map(len, origxs))
                def allequal(qs):
                    return all((q==qs[0]).all() for q in qs[1:])
                if resample:
                    low = max(x[0] for x in origxs)
                    high = min(x[-1] for x in origxs)
                    usex = np.linspace(low, high, resample)
                    ys = []
                    for (x, y) in xys:
                        ys.append(symmetric_ema(x, y, low, high, resample, decay_steps=smooth_step)[1])
                else:
                    assert allequal([x[:minxlen] for x in origxs]),\
                        'If you want to average unevenly sampled data, set resample=<number of samples you want>'
                    usex = origxs[0]
                    ys = [xy[1][:minxlen] for xy in xys]
                ymean = np.mean(ys, axis=0)
                ystd = np.std(ys, axis=0)
                ystderr = ystd / np.sqrt(len(ys))
                l, = axarr[isplit][0].plot(usex, ymean, color=color)
                g2l[group] = l
                if shaded_err:
                    ax.fill_between(usex, ymean - ystderr, ymean + ystderr, color=color, alpha=.4)
                if shaded_std:
                    ax.fill_between(usex, ymean - ystd, ymean + ystd, color=color, alpha=.2)


        # https://matplotlib.org/users/legend_guide.html
        plt.tight_layout()
        if any(g2l.keys()):
            ax.legend(
                g2l.values(),
                ['%s (%i)'%(g, g2c[g]) for g in g2l] if average_group else g2l.keys(),
                loc=2 if legend_outside else None,
                bbox_to_anchor=(1,1) if legend_outside else None)
        ax.set_title(sk)
    return f, axarr

def regression_analysis(df):
    xcols = list(df.columns.copy())
    xcols.remove('score')
    ycols = ['score']
    import statsmodels.api as sm
    mod = sm.OLS(df[ycols], sm.add_constant(df[xcols]), hasconst=False)
    res = mod.fit()
    print(res.summary())

def test_smooth():
    norig = 100
    nup = 300
    ndown = 30
    xs = np.cumsum(np.random.rand(norig) * 10 / norig)
    yclean = np.sin(xs)
    ys = yclean + .1 * np.random.randn(yclean.size)
    xup, yup, _ = symmetric_ema(xs, ys, xs.min(), xs.max(), nup, decay_steps=nup/ndown)
    xdown, ydown, _ = symmetric_ema(xs, ys, xs.min(), xs.max(), ndown, decay_steps=ndown/ndown)
    xsame, ysame, _ = symmetric_ema(xs, ys, xs.min(), xs.max(), norig, decay_steps=norig/ndown)
    plt.plot(xs, ys, label='orig', marker='x')
    plt.plot(xup, yup, label='up', marker='x')
    plt.plot(xdown, ydown, label='down', marker='x')
    plt.plot(xsame, ysame, label='same', marker='x')
    plt.plot(xs, yclean, label='clean', marker='x')
    plt.legend()
    plt.show()
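A hedged usage sketch for the new module (the log directory is a placeholder; runs are expected to contain monitor.csv / progress.csv files written by the Monitor wrapper and the logger):

```python
import matplotlib.pyplot as plt
from baselines.common import plot_util as pu

results = pu.load_results('~/logs/atari_runs')        # list of Result namedtuples
f, axarr = pu.plot_results(results,
                           average_group=True,        # average curves sharing a group key
                           shaded_std=True,           # shade one std around the mean
                           resample=512)              # resample onto an even grid via symmetric EMA
plt.show()
```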
@@ -132,10 +132,8 @@ class MovieRecord(gym.Wrapper):
         self.epcount = 0
     def reset(self):
         if self.epcount % self.k == 0:
-            print('saving movie this episode', self.savedir)
             self.env.unwrapped.movie_path = self.savedir
         else:
-            print('not saving this episode')
             self.env.unwrapped.movie_path = None
             self.env.unwrapped.movie = None
         self.epcount += 1
@@ -103,9 +103,9 @@ def test_coexistence(learn_fn, network_fn):
     kwargs.update(learn_kwargs[learn_fn])
 
     learn = partial(learn, env=env, network=network_fn, total_timesteps=0, **kwargs)
-    make_session(make_default=True, graph=tf.Graph());
+    make_session(make_default=True, graph=tf.Graph())
     model1 = learn(seed=1)
-    make_session(make_default=True, graph=tf.Graph());
+    make_session(make_default=True, graph=tf.Graph())
     model2 = learn(seed=2)
 
     model1.step(env.observation_space.sample())
@@ -32,6 +32,11 @@ class VecEnv(ABC):
     """
     closed = False
     viewer = None
+
+    metadata = {
+        'render.modes': ['human', 'rgb_array']
+    }
+
     def __init__(self, num_envs, observation_space, action_space):
         self.num_envs = num_envs
         self.observation_space = observation_space
@@ -20,9 +20,6 @@ class DummyVecEnv(VecEnv):
         env = self.envs[0]
         VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space)
         obs_space = env.observation_space
-        if isinstance(obs_space, spaces.MultiDiscrete):
-            obs_space.shape = obs_space.shape[0]
-
         self.keys, shapes, dtypes = obs_space_info(obs_space)
 
         self.buf_obs = { k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys }
@@ -79,6 +76,6 @@ class DummyVecEnv(VecEnv):
 
     def render(self, mode='human'):
         if self.num_envs == 1:
-            self.envs[0].render(mode=mode)
+            return self.envs[0].render(mode=mode)
         else:
-            super().render(mode=mode)
+            return super().render(mode=mode)
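A small sketch of what returning the render result enables (illustrative; classic-control rendering needs a display or a virtual framebuffer):

```python
import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

venv = DummyVecEnv([lambda: gym.make('CartPole-v0')])
venv.reset()
frame = venv.render(mode='rgb_array')   # previously discarded; now the frame comes back
print(frame.shape)                      # e.g. (400, 600, 3)
```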
baselines/common/vec_env/test_video_recorder.py (new file, 49 lines)

@@ -0,0 +1,49 @@
"""
Tests for asynchronous vectorized environments.
"""

import gym
import pytest
import os
import glob
import tempfile

from .dummy_vec_env import DummyVecEnv
from .shmem_vec_env import ShmemVecEnv
from .subproc_vec_env import SubprocVecEnv
from .vec_video_recorder import VecVideoRecorder

@pytest.mark.parametrize('klass', (DummyVecEnv, ShmemVecEnv, SubprocVecEnv))
@pytest.mark.parametrize('num_envs', (1, 4))
@pytest.mark.parametrize('video_length', (10, 100))
@pytest.mark.parametrize('video_interval', (1, 50))
def test_video_recorder(klass, num_envs, video_length, video_interval):
    """
    Wrap an existing VecEnv with VevVideoRecorder,
    Make (video_interval + video_length + 1) steps,
    then check that the file is present
    """

    def make_fn():
        env = gym.make('PongNoFrameskip-v4')
        return env
    fns = [make_fn for _ in range(num_envs)]
    env = klass(fns)

    with tempfile.TemporaryDirectory() as video_path:
        env = VecVideoRecorder(env, video_path, record_video_trigger=lambda x: x % video_interval == 0, video_length=video_length)

        env.reset()
        for _ in range(video_interval + video_length + 1):
            env.step([0] * num_envs)
        env.close()


        recorded_video = glob.glob(os.path.join(video_path, "*.mp4"))

        # first and second step
        assert len(recorded_video) == 2
        # Files are not empty
        assert all(os.stat(p).st_size != 0 for p in recorded_video)
baselines/common/vec_env/vec_video_recorder.py (new file, 89 lines)

@@ -0,0 +1,89 @@
import os
from baselines import logger
from baselines.common.vec_env import VecEnvWrapper
from gym.wrappers.monitoring import video_recorder


class VecVideoRecorder(VecEnvWrapper):
    """
    Wrap VecEnv to record rendered image as mp4 video.
    """

    def __init__(self, venv, directory, record_video_trigger, video_length=200):
        """
        # Arguments
            venv: VecEnv to wrap
            directory: Where to save videos
            record_video_trigger:
                Function that defines when to start recording.
                The function takes the current number of step,
                and returns whether we should start recording or not.
            video_length: Length of recorded video
        """

        VecEnvWrapper.__init__(self, venv)
        self.record_video_trigger = record_video_trigger
        self.video_recorder = None

        self.directory = os.path.abspath(directory)
        if not os.path.exists(self.directory): os.mkdir(self.directory)

        self.file_prefix = "vecenv"
        self.file_infix = '{}'.format(os.getpid())
        self.step_id = 0
        self.video_length = video_length

        self.recording = False
        self.recorded_frames = 0

    def reset(self):
        obs = self.venv.reset()

        self.start_video_recorder()

        return obs

    def start_video_recorder(self):
        self.close_video_recorder()

        base_path = os.path.join(self.directory, '{}.video.{}.video{:06}'.format(self.file_prefix, self.file_infix, self.step_id))
        self.video_recorder = video_recorder.VideoRecorder(
                env=self.venv,
                base_path=base_path,
                metadata={'step_id': self.step_id}
                )

        self.video_recorder.capture_frame()
        self.recorded_frames = 1
        self.recording = True

    def _video_enabled(self):
        return self.record_video_trigger(self.step_id)

    def step_wait(self):
        obs, rews, dones, infos = self.venv.step_wait()

        self.step_id += 1
        if self.recording:
            self.video_recorder.capture_frame()
            self.recorded_frames += 1
            if self.recorded_frames > self.video_length:
                logger.info("Saving video to ", self.video_recorder.path)
                self.close_video_recorder()
        elif self._video_enabled():
            self.start_video_recorder()

        return obs, rews, dones, infos

    def close_video_recorder(self):
        if self.recording:
            self.video_recorder.close()
        self.recording = False
        self.recorded_frames = 0

    def close(self):
        VecEnvWrapper.close(self)
        self.close_video_recorder()

    def __del__(self):
        self.close()
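A minimal direct-usage sketch of the wrapper above (the environment id and output directory are illustrative; ffmpeg must be available for mp4 encoding):

```python
import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env.vec_video_recorder import VecVideoRecorder

venv = DummyVecEnv([lambda: gym.make('PongNoFrameskip-v4')])
venv = VecVideoRecorder(venv, 'videos',
                        record_video_trigger=lambda step: step % 1000 == 0,
                        video_length=200)
venv.reset()
for _ in range(2000):
    venv.step([venv.action_space.sample()])
venv.close()
```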
@@ -12,8 +12,11 @@ import baselines.common.tf_util as U
 from baselines import logger
 import numpy as np
-from mpi4py import MPI
+
+try:
+    from mpi4py import MPI
+except ImportError:
+    MPI = None
 
 def learn(network, env,
           seed=None,
@@ -49,7 +52,11 @@ def learn(network, env,
     else:
         nb_epochs = 500
 
-    rank = MPI.COMM_WORLD.Get_rank()
+    if MPI is not None:
+        rank = MPI.COMM_WORLD.Get_rank()
+    else:
+        rank = 0
+
     nb_actions = env.action_space.shape[-1]
     assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
 
@@ -59,7 +66,6 @@ def learn(network, env,
 
     action_noise = None
     param_noise = None
-    nb_actions = env.action_space.shape[-1]
     if noise_type is not None:
         for current_noise_type in noise_type.split(','):
             current_noise_type = current_noise_type.strip()
@@ -200,7 +206,11 @@ def learn(network, env,
                             eval_episode_rewards_history.append(eval_episode_reward[d])
                             eval_episode_reward[d] = 0.0
 
-        mpi_size = MPI.COMM_WORLD.Get_size()
+        if MPI is not None:
+            mpi_size = MPI.COMM_WORLD.Get_size()
+        else:
+            mpi_size = 1
+
         # Log stats.
         # XXX shouldn't call np.mean on variable length lists
         duration = time.time() - start_time
@@ -234,7 +244,10 @@ def learn(network, env,
             else:
                 raise ValueError('expected scalar, got %s'%x)
 
-        combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([ np.array(x).flatten()[0] for x in combined_stats.values()]))
+        combined_stats_sums = np.array([ np.array(x).flatten()[0] for x in combined_stats.values()])
+        if MPI is not None:
+            combined_stats_sums = MPI.COMM_WORLD.allreduce(combined_stats_sums)
+
         combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)}
 
         # Total statistics.
@@ -9,7 +9,10 @@ from baselines import logger
 from baselines.common.mpi_adam import MpiAdam
 import baselines.common.tf_util as U
 from baselines.common.mpi_running_mean_std import RunningMeanStd
-from mpi4py import MPI
+try:
+    from mpi4py import MPI
+except ImportError:
+    MPI = None
 
 def normalize(x, stats):
     if stats is None:
@@ -64,7 +67,6 @@ class DDPG(object):
     def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None,
         gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True,
         batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf),
-        adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1,
         critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.):
         # Inputs.
         self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0')
@@ -268,7 +270,7 @@ class DDPG(object):
 
         if self.action_noise is not None and apply_noise:
             noise = self.action_noise()
-            assert noise.shape == action.shape
+            assert noise.shape == action[0].shape
             action += noise
         action = np.clip(action, self.action_range[0], self.action_range[1])
 
@@ -358,6 +360,11 @@ class DDPG(object):
         return stats
 
     def adapt_param_noise(self):
+        try:
+            from mpi4py import MPI
+        except ImportError:
+            MPI = None
+
         if self.param_noise is None:
             return 0.
 
@@ -371,7 +378,16 @@ class DDPG(object):
             self.param_noise_stddev: self.param_noise.current_stddev,
         })
 
-        mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
+        if MPI is not None:
+            mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
+        else:
+            mean_distance = distance
+
+        if MPI is not None:
+            mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
+        else:
+            mean_distance = distance
+
         self.param_noise.adapt(mean_distance)
         return mean_distance
 
@@ -33,7 +33,7 @@ The functions in this file can are used to create the following functions:
     stochastic: bool
         if set to False all the actions are always deterministic (default False)
     update_eps_ph: float
-        update epsilon a new value, if negative not update happens
+        update epsilon to a new value, if negative no update happens
         (default: no update)
     reset_ph: bool
         reset the perturbed policy by sampling a new perturbation
@@ -2,9 +2,9 @@ import tensorflow as tf
 import tensorflow.contrib.layers as layers
 
 
-def _mlp(hiddens, inpt, num_actions, scope, reuse=False, layer_norm=False):
+def _mlp(hiddens, input_, num_actions, scope, reuse=False, layer_norm=False):
     with tf.variable_scope(scope, reuse=reuse):
-        out = inpt
+        out = input_
         for hidden in hiddens:
             out = layers.fully_connected(out, num_outputs=hidden, activation_fn=None)
             if layer_norm:
@@ -30,9 +30,9 @@ def mlp(hiddens=[], layer_norm=False):
     return lambda *args, **kwargs: _mlp(hiddens, layer_norm=layer_norm, *args, **kwargs)
 
 
-def _cnn_to_mlp(convs, hiddens, dueling, inpt, num_actions, scope, reuse=False, layer_norm=False):
+def _cnn_to_mlp(convs, hiddens, dueling, input_, num_actions, scope, reuse=False, layer_norm=False):
     with tf.variable_scope(scope, reuse=reuse):
-        out = inpt
+        out = input_
         with tf.variable_scope("convnet"):
             for num_outputs, kernel_size, stride in convs:
                 out = layers.convolution2d(out,
@@ -54,7 +54,7 @@ class HumanOutputFormat(KVWriter, SeqWriter):
         # Write out the data
         dashes = '-' * (keywidth + valwidth + 7)
         lines = [dashes]
-        for (key, val) in sorted(key2str.items()):
+        for (key, val) in sorted(key2str.items(), key=lambda kv: kv[0].lower()):
             lines.append('| %s%s | %s%s |' % (
                 key,
                 ' ' * (keywidth - len(key)),
@@ -20,3 +20,6 @@ def atari():
         lr=lambda f : f * 2.5e-4,
         cliprange=lambda f : f * 0.1,
     )
+
+def retro():
+    return atari()
baselines/ppo2/microbatched_model.py
Normal file
76
baselines/ppo2/microbatched_model.py
Normal file
@@ -0,0 +1,76 @@
|
|||||||
|
import tensorflow as tf
|
||||||
|
import numpy as np
|
||||||
|
from baselines.ppo2.model import Model
|
||||||
|
|
||||||
|
class MicrobatchedModel(Model):
|
||||||
|
"""
|
||||||
|
Model that does training one microbatch at a time - when gradient computation
|
||||||
|
on the entire minibatch causes some overflow
|
||||||
|
"""
|
||||||
|
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
|
||||||
|
nsteps, ent_coef, vf_coef, max_grad_norm, microbatch_size):
|
||||||
|
|
||||||
|
self.nmicrobatches = nbatch_train // microbatch_size
|
||||||
|
self.microbatch_size = microbatch_size
|
||||||
|
assert nbatch_train % microbatch_size == 0, 'microbatch_size ({}) should divide nbatch_train ({}) evenly'.format(microbatch_size, nbatch_train)
|
||||||
|
|
||||||
|
super().__init__(
|
||||||
|
policy=policy,
|
||||||
|
ob_space=ob_space,
|
||||||
|
ac_space=ac_space,
|
||||||
|
nbatch_act=nbatch_act,
|
||||||
|
nbatch_train=microbatch_size,
|
||||||
|
nsteps=nsteps,
|
||||||
|
ent_coef=ent_coef,
|
||||||
|
vf_coef=vf_coef,
|
||||||
|
max_grad_norm=max_grad_norm)
|
||||||
|
|
||||||
|
self.grads_ph = [tf.placeholder(dtype=g.dtype, shape=g.shape) for g in self.grads]
|
||||||
|
grads_ph_and_vars = list(zip(self.grads_ph, self.var))
|
||||||
|
self._apply_gradients_op = self.trainer.apply_gradients(grads_ph_and_vars)
|
||||||
|
|
||||||
|
|
||||||
|
def train(self, lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
|
||||||
|
assert states is None, "microbatches with recurrent models are not supported yet"
|
||||||
|
|
||||||
|
# Here we calculate advantage A(s,a) = R + yV(s') - V(s)
|
||||||
|
# Returns = R + yV(s')
|
||||||
|
advs = returns - values
|
||||||
|
|
||||||
|
# Normalize the advantages
|
||||||
|
advs = (advs - advs.mean()) / (advs.std() + 1e-8)
|
||||||
|
|
||||||
|
# Initialize empty list for per-microbatch stats like pg_loss, vf_loss, entropy, approxkl (whatever is in self.stats_list)
|
||||||
|
stats_vs = []
|
||||||
|
|
||||||
|
for microbatch_idx in range(self.nmicrobatches):
|
||||||
|
_sli = range(microbatch_idx * self.microbatch_size, (microbatch_idx+1) * self.microbatch_size)
|
||||||
|
td_map = {
|
||||||
|
self.train_model.X: obs[_sli],
|
||||||
|
self.A:actions[_sli],
|
||||||
|
self.ADV:advs[_sli],
|
||||||
|
self.R:returns[_sli],
|
||||||
|
self.CLIPRANGE:cliprange,
|
||||||
|
self.OLDNEGLOGPAC:neglogpacs[_sli],
|
||||||
|
self.OLDVPRED:values[_sli]
|
||||||
|
}
|
||||||
|
|
||||||
|
# Compute gradient on a microbatch (note that variables do not change here) ...
|
||||||
|
grad_v, stats_v = self.sess.run([self.grads, self.stats_list], td_map)
|
||||||
|
if microbatch_idx == 0:
|
||||||
|
sum_grad_v = grad_v
|
||||||
|
else:
|
||||||
|
# .. and add to the total of the gradients
|
||||||
|
for i, g in enumerate(grad_v):
|
||||||
|
sum_grad_v[i] += g
|
||||||
|
stats_vs.append(stats_v)
|
||||||
|
|
||||||
|
feed_dict = {ph: sum_g / self.nmicrobatches for ph, sum_g in zip(self.grads_ph, sum_grad_v)}
|
||||||
|
feed_dict[self.LR] = lr
|
||||||
|
# Update variables using average of the gradients
|
||||||
|
self.sess.run(self._apply_gradients_op, feed_dict)
|
||||||
|
# Return average of the stats
|
||||||
|
return np.mean(np.array(stats_vs), axis=0).tolist()
|
||||||
|
|
||||||
|
|
||||||
|
|
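The train() loop above sums the per-microbatch gradients and divides by the number of microbatches before applying them. For a loss defined as a mean over samples, and equal-sized microbatches, that average is exactly the full-batch gradient rather than an approximation. A minimal NumPy sketch of that identity (illustrative only, not part of this change; the toy loss and names are mine):

import numpy as np

# Gradient of mean((w*x - y)**2) with respect to w, computed either on the
# full batch or averaged over equal-sized microbatches - the two agree.
rng = np.random.RandomState(0)
x, y, w = rng.randn(8), rng.randn(8), 0.5

def grad_w(xs, ys):
    return np.mean(2.0 * (w * xs - ys) * xs)

microbatch_size = 2
micro_grads = [grad_w(x[i:i + microbatch_size], y[i:i + microbatch_size])
               for i in range(0, len(x), microbatch_size)]
assert np.isclose(np.mean(micro_grads), grad_w(x, y))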
baselines/ppo2/model.py (new file, 157 lines)
@@ -0,0 +1,157 @@
import tensorflow as tf
import functools

from baselines.common.tf_util import get_session, save_variables, load_variables
from baselines.common.tf_util import initialize

try:
    from baselines.common.mpi_adam_optimizer import MpiAdamOptimizer
    from mpi4py import MPI
    from baselines.common.mpi_util import sync_from_root
except ImportError:
    MPI = None

class Model(object):
    """
    We use this object to :
    __init__:
    - Creates the step_model
    - Creates the train_model

    train():
    - Make the training part (feedforward and retropropagation of gradients)

    save/load():
    - Save load the model
    """
    def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                 nsteps, ent_coef, vf_coef, max_grad_norm, microbatch_size=None):
        self.sess = sess = get_session()

        with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(nbatch_act, 1, sess)

            # Train model for training
            if microbatch_size is None:
                train_model = policy(nbatch_train, nsteps, sess)
            else:
                train_model = policy(microbatch_size, nsteps, sess)

        # CREATE THE PLACEHOLDERS
        self.A = A = train_model.pdtype.sample_placeholder([None])
        self.ADV = ADV = tf.placeholder(tf.float32, [None])
        self.R = R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of old critic
        self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
        self.LR = LR = tf.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value to reduce variability during Critic training
        # Get the predicted value
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE)
        # Unclipped value
        vf_losses1 = tf.square(vpred - R)
        # Clipped value
        vf_losses2 = tf.square(vpredclipped - R)

        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # Calculate ratio (pi current policy / pi old policy)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

        # Defining Loss = - J is equivalent to max J
        pg_losses = -ADV * ratio

        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)

        # Final PG loss
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        params = tf.trainable_variables('ppo2_model')
        # 2. Build our trainer
        if MPI is not None:
            self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5)
        else:
            self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = self.trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip aggregate each gradient with parameters associated
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
        self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]


        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

        if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
            initialize()
        else:
            global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")
            sync_from_root(sess, global_variables) #pylint: disable=E1101

    def train(self, lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
        # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
        # Returns = R + yV(s')
        advs = returns - values

        # Normalize the advantages
        advs = (advs - advs.mean()) / (advs.std() + 1e-8)

        td_map = {
            self.train_model.X : obs,
            self.A : actions,
            self.ADV : advs,
            self.R : returns,
            self.LR : lr,
            self.CLIPRANGE : cliprange,
            self.OLDNEGLOGPAC : neglogpacs,
            self.OLDVPRED : values
        }
        if states is not None:
            td_map[self.train_model.S] = states
            td_map[self.train_model.M] = masks

        return self.sess.run(
            self.stats_list + [self._train_op],
            td_map
        )[:-1]
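For reference, the pg_loss graph assembled above is the standard PPO clipped surrogate. A small NumPy sketch of the same arithmetic on toy arrays (the function name and values here are mine, purely illustrative):

import numpy as np

def ppo_pg_loss(adv, neglogpac_old, neglogpac_new, cliprange):
    # ratio = pi_new / pi_old, recovered from negative log-probabilities
    ratio = np.exp(neglogpac_old - neglogpac_new)
    pg_losses = -adv * ratio
    pg_losses2 = -adv * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)
    # max of the two negated terms == minus the min of the clipped/unclipped surrogate
    return np.mean(np.maximum(pg_losses, pg_losses2))

adv = np.array([1.0, -0.5, 2.0])
neglogp_old = np.array([0.7, 1.2, 0.3])
neglogp_new = np.array([0.6, 1.0, 0.9])
print(ppo_pg_loss(adv, neglogp_old, neglogp_new, cliprange=0.2))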
@@ -1,217 +1,17 @@
 import os
 import time
-import functools
 import numpy as np
 import os.path as osp
-import tensorflow as tf
 from baselines import logger
 from collections import deque
 from baselines.common import explained_variance, set_global_seeds
 from baselines.common.policies import build_policy
-from baselines.common.runners import AbstractEnvRunner
-from baselines.common.tf_util import get_session, save_variables, load_variables
-from baselines.common.mpi_adam_optimizer import MpiAdamOptimizer
-
-from mpi4py import MPI
-from baselines.common.tf_util import initialize
-from baselines.common.mpi_util import sync_from_root
+try:
+    from mpi4py import MPI
+except ImportError:
+    MPI = None
+from baselines.ppo2.runner import Runner
 
-class Model(object):
-    """
-    We use this object to :
-    __init__:
-    - Creates the step_model
-    - Creates the train_model
-
-    train():
-    - Make the training part (feedforward and retropropagation of gradients)
-
-    save/load():
-    - Save load the model
-    """
-    def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
-                nsteps, ent_coef, vf_coef, max_grad_norm):
-        sess = get_session()
-
-        with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
-            # CREATE OUR TWO MODELS
-            # act_model that is used for sampling
-            act_model = policy(nbatch_act, 1, sess)
-
-            # Train model for training
-            train_model = policy(nbatch_train, nsteps, sess)
-
-        # CREATE THE PLACEHOLDERS
-        A = train_model.pdtype.sample_placeholder([None])
-        ADV = tf.placeholder(tf.float32, [None])
-        R = tf.placeholder(tf.float32, [None])
-        # Keep track of old actor
-        OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
-        # Keep track of old critic
-        OLDVPRED = tf.placeholder(tf.float32, [None])
-        LR = tf.placeholder(tf.float32, [])
-        # Cliprange
-        CLIPRANGE = tf.placeholder(tf.float32, [])
-
-        neglogpac = train_model.pd.neglogp(A)
-
-        # Calculate the entropy
-        # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
-        entropy = tf.reduce_mean(train_model.pd.entropy())
-
-        # CALCULATE THE LOSS
-        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss
-
-        # Clip the value to reduce variability during Critic training
-        # Get the predicted value
-        vpred = train_model.vf
-        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE)
-        # Unclipped value
-        vf_losses1 = tf.square(vpred - R)
-        # Clipped value
-        vf_losses2 = tf.square(vpredclipped - R)
-
-        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
-
-        # Calculate ratio (pi current policy / pi old policy)
-        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
-
-        # Defining Loss = - J is equivalent to max J
-        pg_losses = -ADV * ratio
-
-        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)
-
-        # Final PG loss
-        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
-        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
-        clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
-
-        # Total loss
-        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
-
-        # UPDATE THE PARAMETERS USING LOSS
-        # 1. Get the model parameters
-        params = tf.trainable_variables('ppo2_model')
-        # 2. Build our trainer
-        trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5)
-        # 3. Calculate the gradients
-        grads_and_var = trainer.compute_gradients(loss, params)
-        grads, var = zip(*grads_and_var)
-
-        if max_grad_norm is not None:
-            # Clip the gradients (normalize)
-            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
-        grads_and_var = list(zip(grads, var))
-        # zip aggregate each gradient with parameters associated
-        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da
-
-        _train = trainer.apply_gradients(grads_and_var)
-
-        def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
-            # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
-            # Returns = R + yV(s')
-            advs = returns - values
-
-            # Normalize the advantages
-            advs = (advs - advs.mean()) / (advs.std() + 1e-8)
-            td_map = {train_model.X:obs, A:actions, ADV:advs, R:returns, LR:lr,
-                    CLIPRANGE:cliprange, OLDNEGLOGPAC:neglogpacs, OLDVPRED:values}
-            if states is not None:
-                td_map[train_model.S] = states
-                td_map[train_model.M] = masks
-            return sess.run(
-                [pg_loss, vf_loss, entropy, approxkl, clipfrac, _train],
-                td_map
-            )[:-1]
-        self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
-
-
-        self.train = train
-        self.train_model = train_model
-        self.act_model = act_model
-        self.step = act_model.step
-        self.value = act_model.value
-        self.initial_state = act_model.initial_state
-
-        self.save = functools.partial(save_variables, sess=sess)
-        self.load = functools.partial(load_variables, sess=sess)
-
-        if MPI.COMM_WORLD.Get_rank() == 0:
-            initialize()
-        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")
-        sync_from_root(sess, global_variables) #pylint: disable=E1101
-
(The deleted block continued with the Runner class and the sf01 helper, removed from ppo2.py essentially verbatim; that code now lives in the new file baselines/ppo2/runner.py, reproduced in full further below.)
 
 def constfn(val):
     def f(_):
@@ -221,7 +21,7 @@ def constfn(val):
 def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4,
             vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95,
             log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
-            save_interval=0, load_path=None, **network_kwargs):
+            save_interval=0, load_path=None, model_fn=None, **network_kwargs):
     '''
     Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)
@@ -299,10 +99,14 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4,
     nbatch_train = nbatch // nminibatches
 
     # Instantiate the model object (that creates act_model and train_model)
-    make_model = lambda : Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
+    if model_fn is None:
+        from baselines.ppo2.model import Model
+        model_fn = Model
+
+    model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
                     nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm)
-    model = make_model()
     if load_path is not None:
         model.load(load_path)
     # Instantiate the runner object
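With the new model_fn hook, a caller can substitute a different Model subclass without touching learn() itself; this is how the microbatched variant is wired in by the test added further below. A sketch of the call (vec_env here is a placeholder for an already-constructed VecEnv, e.g. a DummyVecEnv):

from functools import partial

from baselines.ppo2.ppo2 import learn
from baselines.ppo2.microbatched_model import MicrobatchedModel

# microbatch_size must divide nbatch_train (= nenvs * nsteps // nminibatches) evenly.
model = learn(network='mlp', env=vec_env, total_timesteps=100000,
              model_fn=partial(MicrobatchedModel, microbatch_size=2))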
@@ -310,8 +114,6 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4,
     if eval_env is not None:
         eval_runner = Runner(env = eval_env, model = model, nsteps = nsteps, gamma = gamma, lam= lam)
-
-
 
     epinfobuf = deque(maxlen=100)
     if eval_env is not None:
         eval_epinfobuf = deque(maxlen=100)
@@ -392,9 +194,9 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4,
             logger.logkv('time_elapsed', tnow - tfirststart)
             for (lossval, lossname) in zip(lossvals, model.loss_names):
                 logger.logkv(lossname, lossval)
-            if MPI.COMM_WORLD.Get_rank() == 0:
+            if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
                 logger.dumpkvs()
-        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and MPI.COMM_WORLD.Get_rank() == 0:
+        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and (MPI is None or MPI.COMM_WORLD.Get_rank() == 0):
             checkdir = osp.join(logger.get_dir(), 'checkpoints')
             os.makedirs(checkdir, exist_ok=True)
             savepath = osp.join(checkdir, '%.5i'%update)
baselines/ppo2/runner.py (new file, 76 lines)
@@ -0,0 +1,76 @@
import numpy as np
from baselines.common.runners import AbstractEnvRunner

class Runner(AbstractEnvRunner):
    """
    We use this object to make a mini batch of experiences
    __init__:
    - Initialize the runner

    run():
    - Make a mini batch
    """
    def __init__(self, *, env, model, nsteps, gamma, lam):
        super().__init__(env=env, model=model, nsteps=nsteps)
        # Lambda used in GAE (General Advantage Estimation)
        self.lam = lam
        # Discount rate
        self.gamma = gamma

    def run(self):
        # Here, we init the lists that will contain the mb of experiences
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [],[],[],[],[],[]
        mb_states = self.states
        epinfos = []
        # For n in range number of steps
        for _ in range(self.nsteps):
            # Given observations, get action value and neglopacs
            # We already have self.obs because Runner superclass run self.obs[:] = env.reset() on init
            actions, values, self.states, neglogpacs = self.model.step(self.obs, S=self.states, M=self.dones)
            mb_obs.append(self.obs.copy())
            mb_actions.append(actions)
            mb_values.append(values)
            mb_neglogpacs.append(neglogpacs)
            mb_dones.append(self.dones)

            # Take actions in env and look the results
            # Infos contains a ton of useful informations
            self.obs[:], rewards, self.dones, infos = self.env.step(actions)
            for info in infos:
                maybeepinfo = info.get('episode')
                if maybeepinfo: epinfos.append(maybeepinfo)
            mb_rewards.append(rewards)
        #batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_actions = np.asarray(mb_actions)
        mb_values = np.asarray(mb_values, dtype=np.float32)
        mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
        mb_dones = np.asarray(mb_dones, dtype=np.bool)
        last_values = self.model.value(self.obs, S=self.states, M=self.dones)

        # discount/bootstrap off value fn
        mb_returns = np.zeros_like(mb_rewards)
        mb_advs = np.zeros_like(mb_rewards)
        lastgaelam = 0
        for t in reversed(range(self.nsteps)):
            if t == self.nsteps - 1:
                nextnonterminal = 1.0 - self.dones
                nextvalues = last_values
            else:
                nextnonterminal = 1.0 - mb_dones[t+1]
                nextvalues = mb_values[t+1]
            delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t]
            mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam
        mb_returns = mb_advs + mb_values
        return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs)),
            mb_states, epinfos)
# obs, returns, masks, actions, values, neglogpacs, states = runner.run()

def sf01(arr):
    """
    swap and then flatten axes 0 and 1
    """
    s = arr.shape
    return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])
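The backward loop at the end of run() is the usual GAE(lambda) recursion: delta_t = r_t + gamma * V(t+1) * (1 - done) - V(t), followed by A_t = delta_t + gamma * lam * (1 - done) * A_{t+1}. Written as a standalone reference function it reads as follows (a sketch mirroring the loop above, not part of the diff):

import numpy as np

def gae(rewards, values, last_value, dones, last_done, gamma, lam):
    # rewards, values, dones: arrays of shape (nsteps,), ordered by time step
    nsteps = len(rewards)
    advs = np.zeros(nsteps)
    lastgaelam = 0.0
    for t in reversed(range(nsteps)):
        if t == nsteps - 1:
            nextnonterminal = 1.0 - last_done
            nextvalues = last_value
        else:
            nextnonterminal = 1.0 - dones[t + 1]
            nextvalues = values[t + 1]
        delta = rewards[t] + gamma * nextvalues * nextnonterminal - values[t]
        advs[t] = lastgaelam = delta + gamma * lam * nextnonterminal * lastgaelam
    return advs, advs + values   # advantages and bootstrapped returns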
baselines/ppo2/test_microbatches.py (new file, 34 lines)
@@ -0,0 +1,34 @@
import gym
import tensorflow as tf
import numpy as np
from functools import partial

from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.tf_util import make_session
from baselines.ppo2.ppo2 import learn

from baselines.ppo2.microbatched_model import MicrobatchedModel

def test_microbatches():
    def env_fn():
        env = gym.make('CartPole-v0')
        env.seed(0)
        return env

    learn_fn = partial(learn, network='mlp', nsteps=32, total_timesteps=32, seed=0)

    env_ref = DummyVecEnv([env_fn])
    sess_ref = make_session(make_default=True, graph=tf.Graph())
    learn_fn(env=env_ref)
    vars_ref = {v.name: sess_ref.run(v) for v in tf.trainable_variables()}

    env_test = DummyVecEnv([env_fn])
    sess_test = make_session(make_default=True, graph=tf.Graph())
    learn_fn(env=env_test, model_fn=partial(MicrobatchedModel, microbatch_size=2))
    vars_test = {v.name: sess_test.run(v) for v in tf.trainable_variables()}

    for v in vars_ref:
        np.testing.assert_allclose(vars_ref[v], vars_test[v], atol=1e-3)

if __name__ == '__main__':
    test_microbatches()
@@ -6,6 +6,7 @@ from collections import defaultdict
 import tensorflow as tf
 import numpy as np
 
+from baselines.common.vec_env.vec_video_recorder import VecVideoRecorder
 from baselines.common.vec_env.vec_frame_stack import VecFrameStack
 from baselines.common.cmd_util import common_arg_parser, parse_unknown_args, make_vec_env, make_env
 from baselines.common.tf_util import get_session
@@ -62,6 +63,8 @@ def train(args, extra_args):
     alg_kwargs.update(extra_args)
 
     env = build_env(args)
+    if args.save_video_interval != 0:
+        env = VecVideoRecorder(env, osp.join(logger.Logger.CURRENT.dir, "videos"), record_video_trigger=lambda x: x % args.save_video_interval == 0, video_length=args.save_video_length)
 
     if args.network:
         alg_kwargs['network'] = args.network
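The wrapper added here periodically records rollout videos under the logger directory. A sketch of doing the same wrapping by hand, outside run.py (env is assumed to be an existing VecEnv, the interval and length values are arbitrary examples, and the logger is assumed to have been configured already):

import os.path as osp

from baselines import logger
from baselines.common.vec_env.vec_video_recorder import VecVideoRecorder

save_video_interval, save_video_length = 2000, 200  # example values only
env = VecVideoRecorder(env, osp.join(logger.get_dir(), "videos"),
                       record_video_trigger=lambda step: step % save_video_interval == 0,
                       video_length=save_video_length)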
@@ -4,7 +4,6 @@ import baselines.common.tf_util as U
 import tensorflow as tf, numpy as np
 import time
 from baselines.common import colorize
-from mpi4py import MPI
 from collections import deque
 from baselines.common import set_global_seeds
 from baselines.common.mpi_adam import MpiAdam
@@ -13,6 +12,11 @@ from baselines.common.input import observation_placeholder
 from baselines.common.policies import build_policy
 from contextlib import contextmanager
 
+try:
+    from mpi4py import MPI
+except ImportError:
+    MPI = None
+
 def traj_segment_generator(pi, env, horizon, stochastic):
     # Initialize state variables
     t = 0
@@ -146,9 +150,12 @@ def learn(*,
 
     '''
 
-    nworkers = MPI.COMM_WORLD.Get_size()
-    rank = MPI.COMM_WORLD.Get_rank()
+    if MPI is not None:
+        nworkers = MPI.COMM_WORLD.Get_size()
+        rank = MPI.COMM_WORLD.Get_rank()
+    else:
+        nworkers = 1
+        rank = 0
 
     cpus_per_worker = 1
     U.get_session(config=tf.ConfigProto(
@@ -237,9 +244,13 @@ def learn(*,
 
     def allmean(x):
         assert isinstance(x, np.ndarray)
-        out = np.empty_like(x)
-        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
-        out /= nworkers
+        if MPI is not None:
+            out = np.empty_like(x)
+            MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
+            out /= nworkers
+        else:
+            out = np.copy(x)
+
         return out
 
     U.initialize()
@@ -247,7 +258,9 @@ def learn(*,
         pi.load(load_path)
 
     th_init = get_flat()
-    MPI.COMM_WORLD.Bcast(th_init, root=0)
+    if MPI is not None:
+        MPI.COMM_WORLD.Bcast(th_init, root=0)
+
     set_from_flat(th_init)
     vfadam.sync()
     print("Init param sum", th_init.sum(), flush=True)
@@ -260,8 +273,8 @@ def learn(*,
     timesteps_so_far = 0
     iters_so_far = 0
     tstart = time.time()
-    lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths
-    rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards
+    lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths
+    rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards
 
     if sum([max_iters>0, total_timesteps>0, max_episodes>0])==0:
         # noththing to be done
@@ -353,7 +366,11 @@ def learn(*,
         logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))
 
         lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values
-        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples
+        if MPI is not None:
+            listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples
+        else:
+            listoflrpairs = [lrlocal]
+
         lens, rews = map(flatten_lists, zip(*listoflrpairs))
         lenbuffer.extend(lens)
         rewbuffer.extend(rews)
docs/viz/viz.ipynb (new file, 808 lines)
File diff suppressed because one or more lines are too long
setup.py (5 changes)
@@ -11,10 +11,14 @@ extras = {
     'test': [
         'filelock',
         'pytest',
+        'pytest-forked',
         'atari-py'
     ],
     'bullet': [
         'pybullet',
+    ],
+    'mpi': [
+        'mpi4py'
     ]
 }
 
@@ -34,7 +38,6 @@ setup(name='baselines',
       'joblib',
       'dill',
       'progressbar2',
-      'mpi4py',
       'cloudpickle',
      'click',
       'opencv-python'
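With mpi4py demoted from a hard requirement to the new 'mpi' extra, MPI support becomes opt-in; it can still be pulled in through the usual setuptools extras syntax, for example `pip install -e .[test,mpi]` for a development install (the exact command depends on how the package is installed; this is just the standard extras form).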