diff --git a/baselines/acer/acer.py b/baselines/acer/acer.py index 4e2e00f..0ae0330 100644 --- a/baselines/acer/acer.py +++ b/baselines/acer/acer.py @@ -7,6 +7,7 @@ from baselines import logger from baselines.common import set_global_seeds from baselines.common.policies import build_policy from baselines.common.tf_util import get_session, save_variables +from baselines.common.vec_env.vec_frame_stack import VecFrameStack from baselines.a2c.utils import batch_to_seq, seq_to_batch from baselines.a2c.utils import cat_entropy_softmax @@ -55,8 +56,7 @@ def q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma): # return tf.minimum(1 + eps_clip, tf.maximum(1 - eps_clip, ratio)) class Model(object): - def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs, - ent_coef, q_coef, gamma, max_grad_norm, lr, + def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef, q_coef, gamma, max_grad_norm, lr, rprop_alpha, rprop_epsilon, total_timesteps, lrschedule, c, trust_region, alpha, delta): @@ -71,8 +71,8 @@ class Model(object): LR = tf.placeholder(tf.float32, []) eps = 1e-6 - step_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs,) + ob_space.shape[:-1] + (ob_space.shape[-1] * nstack,)) - train_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs*(nsteps+1),) + ob_space.shape[:-1] + (ob_space.shape[-1] * nstack,)) + step_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs,) + ob_space.shape) + train_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs*(nsteps+1),) + ob_space.shape) with tf.variable_scope('acer_model', reuse=tf.AUTO_REUSE): step_model = policy(observ_placeholder=step_ob_placeholder, sess=sess) @@ -247,6 +247,7 @@ class Acer(): # get obs, actions, rewards, mus, dones from buffer. 
obs, actions, rewards, mus, dones, masks = buffer.get() + # reshape stuff correctly obs = obs.reshape(runner.batch_ob_shape) actions = actions.reshape([runner.nbatch]) @@ -270,7 +271,7 @@ class Acer(): logger.dump_tabular() -def learn(network, env, seed=None, nsteps=20, nstack=4, total_timesteps=int(80e6), q_coef=0.5, ent_coef=0.01, +def learn(network, env, seed=None, nsteps=20, total_timesteps=int(80e6), q_coef=0.5, ent_coef=0.01, max_grad_norm=10, lr=7e-4, lrschedule='linear', rprop_epsilon=1e-5, rprop_alpha=0.99, gamma=0.99, log_interval=100, buffer_size=50000, replay_ratio=4, replay_start=10000, c=10.0, trust_region=True, alpha=0.99, delta=1, load_path=None, **network_kwargs): @@ -342,21 +343,24 @@ def learn(network, env, seed=None, nsteps=20, nstack=4, total_timesteps=int(80e6 print("Running Acer Simple") print(locals()) set_global_seeds(seed) - policy = build_policy(env, network, estimate_q=True, **network_kwargs) + if not isinstance(env, VecFrameStack): + env = VecFrameStack(env, 1) + policy = build_policy(env, network, estimate_q=True, **network_kwargs) nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space - num_procs = len(env.remotes) if hasattr(env, 'remotes') else 1# HACK - model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, nstack=nstack, - num_procs=num_procs, ent_coef=ent_coef, q_coef=q_coef, gamma=gamma, + + nstack = env.nstack + model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, + ent_coef=ent_coef, q_coef=q_coef, gamma=gamma, max_grad_norm=max_grad_norm, lr=lr, rprop_alpha=rprop_alpha, rprop_epsilon=rprop_epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule, c=c, trust_region=trust_region, alpha=alpha, delta=delta) - runner = Runner(env=env, model=model, nsteps=nsteps, nstack=nstack) + runner = Runner(env=env, model=model, nsteps=nsteps) if replay_ratio > 0: - buffer = Buffer(env=env, nsteps=nsteps, nstack=nstack, size=buffer_size) + buffer = Buffer(env=env, nsteps=nsteps, size=buffer_size) else: buffer = None nbatch = nenvs*nsteps diff --git a/baselines/acer/buffer.py b/baselines/acer/buffer.py index 2dcfa10..000592c 100644 --- a/baselines/acer/buffer.py +++ b/baselines/acer/buffer.py @@ -2,11 +2,16 @@ import numpy as np class Buffer(object): # gets obs, actions, rewards, mu's, (states, masks), dones - def __init__(self, env, nsteps, nstack, size=50000): + def __init__(self, env, nsteps, size=50000): self.nenv = env.num_envs self.nsteps = nsteps - self.nh, self.nw, self.nc = env.observation_space.shape - self.nstack = nstack + # self.nh, self.nw, self.nc = env.observation_space.shape + self.obs_shape = env.observation_space.shape + self.obs_dtype = env.observation_space.dtype + self.ac_dtype = env.action_space.dtype + self.nc = self.obs_shape[-1] + self.nstack = env.nstack + self.nc //= self.nstack self.nbatch = self.nenv * self.nsteps self.size = size // (self.nsteps) # Each loc contains nenv * nsteps frames, thus total buffer is nenv * size frames @@ -33,22 +38,11 @@ class Buffer(object): # Generate stacked frames def decode(self, enc_obs, dones): # enc_obs has shape [nenvs, nsteps + nstack, nh, nw, nc] - # dones has shape [nenvs, nsteps, nh, nw, nc] + # dones has shape [nenvs, nsteps] # returns stacked obs of shape [nenv, (nsteps + 1), nh, nw, nstack*nc] - nstack, nenv, nsteps, nh, nw, nc = self.nstack, self.nenv, self.nsteps, self.nh, self.nw, self.nc - y = np.empty([nsteps + nstack - 1, nenv, 1, 1, 1], dtype=np.float32) - obs = 
np.zeros([nstack, nsteps + nstack, nenv, nh, nw, nc], dtype=np.uint8) - x = np.reshape(enc_obs, [nenv, nsteps + nstack, nh, nw, nc]).swapaxes(1, - 0) # [nsteps + nstack, nenv, nh, nw, nc] - y[3:] = np.reshape(1.0 - dones, [nenv, nsteps, 1, 1, 1]).swapaxes(1, 0) # keep - y[:3] = 1.0 - # y = np.reshape(1 - dones, [nenvs, nsteps, 1, 1, 1]) - for i in range(nstack): - obs[-(i + 1), i:] = x - # obs[:,i:,:,:,-(i+1),:] = x - x = x[:-1] * y - y = y[1:] - return np.reshape(obs[:, 3:].transpose((2, 1, 3, 4, 0, 5)), [nenv, (nsteps + 1), nh, nw, nstack * nc]) + + return _stack_obs(enc_obs, dones, + nsteps=self.nsteps) def put(self, enc_obs, actions, rewards, mus, dones, masks): # enc_obs [nenv, (nsteps + nstack), nh, nw, nc] @@ -56,8 +50,8 @@ class Buffer(object): # mus [nenv, nsteps, nact] if self.enc_obs is None: - self.enc_obs = np.empty([self.size] + list(enc_obs.shape), dtype=np.uint8) - self.actions = np.empty([self.size] + list(actions.shape), dtype=np.int32) + self.enc_obs = np.empty([self.size] + list(enc_obs.shape), dtype=self.obs_dtype) + self.actions = np.empty([self.size] + list(actions.shape), dtype=self.ac_dtype) self.rewards = np.empty([self.size] + list(rewards.shape), dtype=np.float32) self.mus = np.empty([self.size] + list(mus.shape), dtype=np.float32) self.dones = np.empty([self.size] + list(dones.shape), dtype=np.bool) @@ -101,3 +95,62 @@ class Buffer(object): mus = take(self.mus) masks = take(self.masks) return obs, actions, rewards, mus, dones, masks + + + +def _stack_obs_ref(enc_obs, dones, nsteps): + nenv = enc_obs.shape[0] + nstack = enc_obs.shape[1] - nsteps + nh, nw, nc = enc_obs.shape[2:] + obs_dtype = enc_obs.dtype + obs_shape = (nh, nw, nc*nstack) + + mask = np.empty([nsteps + nstack - 1, nenv, 1, 1, 1], dtype=np.float32) + obs = np.zeros([nstack, nsteps + nstack, nenv, nh, nw, nc], dtype=obs_dtype) + x = np.reshape(enc_obs, [nenv, nsteps + nstack, nh, nw, nc]).swapaxes(1, 0) # [nsteps + nstack, nenv, nh, nw, nc] + + mask[nstack-1:] = np.reshape(1.0 - dones, [nenv, nsteps, 1, 1, 1]).swapaxes(1, 0) # keep + mask[:nstack-1] = 1.0 + + # y = np.reshape(1 - dones, [nenvs, nsteps, 1, 1, 1]) + for i in range(nstack): + obs[-(i + 1), i:] = x + # obs[:,i:,:,:,-(i+1),:] = x + x = x[:-1] * mask + mask = mask[1:] + + return np.reshape(obs[:, (nstack-1):].transpose((2, 1, 3, 4, 0, 5)), (nenv, (nsteps + 1)) + obs_shape) + +def _stack_obs(enc_obs, dones, nsteps): + nenv = enc_obs.shape[0] + nstack = enc_obs.shape[1] - nsteps + nc = enc_obs.shape[-1] + + obs_ = np.zeros((nenv, nsteps + 1) + enc_obs.shape[2:-1] + (enc_obs.shape[-1] * nstack, ), dtype=enc_obs.dtype) + mask = np.ones((nenv, nsteps+1), dtype=enc_obs.dtype) + mask[:, 1:] = 1.0 - dones + mask = mask.reshape(mask.shape + tuple(np.ones(len(enc_obs.shape)-2, dtype=np.uint8))) + + for i in range(nstack-1, -1, -1): + obs_[..., i * nc : (i + 1) * nc] = enc_obs[:, i : i + nsteps + 1, :] + if i < nstack-1: + obs_[..., i * nc : (i + 1) * nc] *= mask + mask[:, 1:, ...] *= mask[:, :-1, ...] 
+ + return obs_ + +def test_stack_obs(): + nstack = 7 + nenv = 1 + nsteps = 5 + + obs_shape = (2, 3, nstack) + + enc_obs_shape = (nenv, nsteps + nstack) + obs_shape[:-1] + (1,) + enc_obs = np.random.random(enc_obs_shape) + dones = np.random.randint(low=0, high=2, size=(nenv, nsteps)) + + stacked_obs_ref = _stack_obs_ref(enc_obs, dones, nsteps=nsteps) + stacked_obs_test = _stack_obs(enc_obs, dones, nsteps=nsteps) + + np.testing.assert_allclose(stacked_obs_ref, stacked_obs_test) diff --git a/baselines/acer/runner.py b/baselines/acer/runner.py index 6bc1b4c..afd19ce 100644 --- a/baselines/acer/runner.py +++ b/baselines/acer/runner.py @@ -1,30 +1,31 @@ import numpy as np from baselines.common.runners import AbstractEnvRunner +from baselines.common.vec_env.vec_frame_stack import VecFrameStack +from gym import spaces + class Runner(AbstractEnvRunner): - def __init__(self, env, model, nsteps, nstack): + def __init__(self, env, model, nsteps): super().__init__(env=env, model=model, nsteps=nsteps) - self.nstack = nstack - nh, nw, nc = env.observation_space.shape - self.nc = nc # nc = 1 for atari, but just in case + assert isinstance(env.action_space, spaces.Discrete), 'This ACER implementation works only with discrete action spaces!' + assert isinstance(env, VecFrameStack) + self.nact = env.action_space.n nenv = self.nenv self.nbatch = nenv * nsteps - self.batch_ob_shape = (nenv*(nsteps+1), nh, nw, nc*nstack) - self.obs = np.zeros((nenv, nh, nw, nc * nstack), dtype=np.uint8) - obs = env.reset() - self.update_obs(obs) + self.batch_ob_shape = (nenv*(nsteps+1),) + env.observation_space.shape + + self.obs = env.reset() + self.obs_dtype = env.observation_space.dtype + self.ac_dtype = env.action_space.dtype + self.nstack = self.env.nstack + self.nc = self.batch_ob_shape[-1] // self.nstack - def update_obs(self, obs, dones=None): - #self.obs = obs - if dones is not None: - self.obs *= (1 - dones.astype(np.uint8))[:, None, None, None] - self.obs = np.roll(self.obs, shift=-self.nc, axis=3) - self.obs[:, :, :, -self.nc:] = obs[:, :, :, :] def run(self): - enc_obs = np.split(self.obs, self.nstack, axis=3) # so now list of obs steps + # enc_obs = np.split(self.obs, self.nstack, axis=3) # so now list of obs steps + enc_obs = np.split(self.env.stackedobs, self.env.nstack, axis=-1) mb_obs, mb_actions, mb_mus, mb_dones, mb_rewards = [], [], [], [], [] for _ in range(self.nsteps): actions, mus, states = self.model._step(self.obs, S=self.states, M=self.dones) @@ -36,15 +37,15 @@ class Runner(AbstractEnvRunner): # states information for statefull models like LSTM self.states = states self.dones = dones - self.update_obs(obs, dones) + self.obs = obs mb_rewards.append(rewards) - enc_obs.append(obs) + enc_obs.append(obs[..., -self.nc:]) mb_obs.append(np.copy(self.obs)) mb_dones.append(self.dones) - enc_obs = np.asarray(enc_obs, dtype=np.uint8).swapaxes(1, 0) - mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0) - mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0) + enc_obs = np.asarray(enc_obs, dtype=self.obs_dtype).swapaxes(1, 0) + mb_obs = np.asarray(mb_obs, dtype=self.obs_dtype).swapaxes(1, 0) + mb_actions = np.asarray(mb_actions, dtype=self.ac_dtype).swapaxes(1, 0) mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0) mb_mus = np.asarray(mb_mus, dtype=np.float32).swapaxes(1, 0) diff --git a/baselines/acktr/acktr.py b/baselines/acktr/acktr.py index dcbe612..10ab32b 100644 --- a/baselines/acktr/acktr.py +++ b/baselines/acktr/acktr.py @@ -21,16 +21,16 @@ class Model(object): 
self.sess = sess = get_session() nbatch = nenvs * nsteps - A = tf.placeholder(ac_space.dtype, [nbatch,] + list(ac_space.shape)) + with tf.variable_scope('acktr_model', reuse=tf.AUTO_REUSE): + self.model = step_model = policy(nenvs, 1, sess=sess) + self.model2 = train_model = policy(nenvs*nsteps, nsteps, sess=sess) + + A = train_model.pdtype.sample_placeholder([None]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) PG_LR = tf.placeholder(tf.float32, []) VF_LR = tf.placeholder(tf.float32, []) - with tf.variable_scope('acktr_model', reuse=tf.AUTO_REUSE): - self.model = step_model = policy(nenvs, 1, sess=sess) - self.model2 = train_model = policy(nenvs*nsteps, nsteps, sess=sess) - neglogpac = train_model.pd.neglogp(A) self.logits = train_model.pi diff --git a/baselines/common/atari_wrappers.py b/baselines/common/atari_wrappers.py index 6be3582..731ee7e 100644 --- a/baselines/common/atari_wrappers.py +++ b/baselines/common/atari_wrappers.py @@ -213,8 +213,11 @@ class LazyFrames(object): def __getitem__(self, i): return self._force()[i] -def make_atari(env_id): +def make_atari(env_id, timelimit=True): + # XXX(john): remove timelimit argument after gym is upgraded to allow double wrapping env = gym.make(env_id) + if not timelimit: + env = env.env assert 'NoFrameskip' in env.spec.id env = NoopResetEnv(env, noop_max=30) env = MaxAndSkipEnv(env, skip=4) diff --git a/baselines/common/cmd_util.py b/baselines/common/cmd_util.py index d69589c..7c38a77 100644 --- a/baselines/common/cmd_util.py +++ b/baselines/common/cmd_util.py @@ -16,30 +16,57 @@ from baselines.common import set_global_seeds from baselines.common.atari_wrappers import make_atari, wrap_deepmind from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv from baselines.common.vec_env.dummy_vec_env import DummyVecEnv -from baselines.common.retro_wrappers import RewardScaler +from baselines.common import retro_wrappers - -def make_vec_env(env_id, env_type, num_env, seed, wrapper_kwargs=None, start_index=0, reward_scale=1.0): +def make_vec_env(env_id, env_type, num_env, seed, wrapper_kwargs=None, start_index=0, reward_scale=1.0, gamestate=None): """ Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo. """ if wrapper_kwargs is None: wrapper_kwargs = {} mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0 - def make_env(rank): # pylint: disable=C0111 - def _thunk(): - env = make_atari(env_id) if env_type == 'atari' else gym.make(env_id) - env.seed(seed + 10000*mpi_rank + rank if seed is not None else None) - env = Monitor(env, - logger.get_dir() and os.path.join(logger.get_dir(), str(mpi_rank) + '.' 
+ str(rank)), - allow_early_resets=True) + seed = seed + 10000 * mpi_rank if seed is not None else None + def make_thunk(rank): + return lambda: make_env( + env_id=env_id, + env_type=env_type, + subrank = rank, + seed=seed, + reward_scale=reward_scale, + gamestate=gamestate, + wrapper_kwargs=wrapper_kwargs + ) - if env_type == 'atari': return wrap_deepmind(env, **wrapper_kwargs) - elif reward_scale != 1: return RewardScaler(env, reward_scale) - else: return env - return _thunk set_global_seeds(seed) - if num_env > 1: return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)]) - else: return DummyVecEnv([make_env(start_index)]) + if num_env > 1: + return SubprocVecEnv([make_thunk(i + start_index) for i in range(num_env)]) + else: + return DummyVecEnv([make_thunk(start_index)]) + + +def make_env(env_id, env_type, subrank=0, seed=None, reward_scale=1.0, gamestate=None, wrapper_kwargs={}): + mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0 + if env_type == 'atari': + env = make_atari(env_id) + elif env_type == 'retro': + import retro + gamestate = gamestate or retro.State.DEFAULT + env = retro_wrappers.make_retro(game=env_id, max_episode_steps=10000, use_restricted_actions=retro.Actions.DISCRETE, state=gamestate) + else: + env = gym.make(env_id) + + env.seed(seed + subrank if seed is not None else None) + env = Monitor(env, + logger.get_dir() and os.path.join(logger.get_dir(), str(mpi_rank) + '.' + str(subrank)), + allow_early_resets=True) + + if env_type == 'atari': + return wrap_deepmind(env, **wrapper_kwargs) + elif reward_scale != 1: + return retro_wrappers.RewardScaler(env, reward_scale) + else: + return env + + def make_mujoco_env(env_id, seed, reward_scale=1.0): """ diff --git a/baselines/common/distributions.py b/baselines/common/distributions.py index 491b9ff..5b3e7be 100644 --- a/baselines/common/distributions.py +++ b/baselines/common/distributions.py @@ -39,7 +39,7 @@ class PdType(object): raise NotImplementedError def pdfromflat(self, flat): return self.pdclass()(flat) - def pdfromlatent(self, latent_vector): + def pdfromlatent(self, latent_vector, init_scale, init_bias): raise NotImplementedError def param_shape(self): raise NotImplementedError @@ -80,6 +80,11 @@ class MultiCategoricalPdType(PdType): return MultiCategoricalPd def pdfromflat(self, flat): return MultiCategoricalPd(self.ncats, flat) + + def pdfromlatent(self, latent, init_scale=1.0, init_bias=0.0): + pdparam = fc(latent, 'pi', self.ncats.sum(), init_scale=init_scale, init_bias=init_bias) + return self.pdfromflat(pdparam), pdparam + def param_shape(self): return [sum(self.ncats)] def sample_shape(self): diff --git a/baselines/common/filters.py b/baselines/common/filters.py deleted file mode 100644 index 5ce019c..0000000 --- a/baselines/common/filters.py +++ /dev/null @@ -1,98 +0,0 @@ -from .running_stat import RunningStat -from collections import deque -import numpy as np - -class Filter(object): - def __call__(self, x, update=True): - raise NotImplementedError - def reset(self): - pass - -class IdentityFilter(Filter): - def __call__(self, x, update=True): - return x - -class CompositionFilter(Filter): - def __init__(self, fs): - self.fs = fs - def __call__(self, x, update=True): - for f in self.fs: - x = f(x) - return x - def output_shape(self, input_space): - out = input_space.shape - for f in self.fs: - out = f.output_shape(out) - return out - -class ZFilter(Filter): - """ - y = (x-mean)/std - using running estimates of mean,std - """ - - def __init__(self, shape, demean=True, destd=True, 
clip=10.0): - self.demean = demean - self.destd = destd - self.clip = clip - - self.rs = RunningStat(shape) - - def __call__(self, x, update=True): - if update: self.rs.push(x) - if self.demean: - x = x - self.rs.mean - if self.destd: - x = x / (self.rs.std+1e-8) - if self.clip: - x = np.clip(x, -self.clip, self.clip) - return x - def output_shape(self, input_space): - return input_space.shape - -class AddClock(Filter): - def __init__(self): - self.count = 0 - def reset(self): - self.count = 0 - def __call__(self, x, update=True): - return np.append(x, self.count/100.0) - def output_shape(self, input_space): - return (input_space.shape[0]+1,) - -class FlattenFilter(Filter): - def __call__(self, x, update=True): - return x.ravel() - def output_shape(self, input_space): - return (int(np.prod(input_space.shape)),) - -class Ind2OneHotFilter(Filter): - def __init__(self, n): - self.n = n - def __call__(self, x, update=True): - out = np.zeros(self.n) - out[x] = 1 - return out - def output_shape(self, input_space): - return (input_space.n,) - -class DivFilter(Filter): - def __init__(self, divisor): - self.divisor = divisor - def __call__(self, x, update=True): - return x / self.divisor - def output_shape(self, input_space): - return input_space.shape - -class StackFilter(Filter): - def __init__(self, length): - self.stack = deque(maxlen=length) - def reset(self): - self.stack.clear() - def __call__(self, x, update=True): - self.stack.append(x) - while len(self.stack) < self.stack.maxlen: - self.stack.append(x) - return np.concatenate(self.stack, axis=-1) - def output_shape(self, input_space): - return input_space.shape[:-1] + (input_space.shape[-1] * self.stack.maxlen,) diff --git a/baselines/common/input.py b/baselines/common/input.py index 7d51008..ebaf30a 100644 --- a/baselines/common/input.py +++ b/baselines/common/input.py @@ -1,5 +1,6 @@ +import numpy as np import tensorflow as tf -from gym.spaces import Discrete, Box +from gym.spaces import Discrete, Box, MultiDiscrete def observation_placeholder(ob_space, batch_size=None, name='Ob'): ''' @@ -20,10 +21,14 @@ def observation_placeholder(ob_space, batch_size=None, name='Ob'): tensorflow placeholder tensor ''' - assert isinstance(ob_space, Discrete) or isinstance(ob_space, Box), \ + assert isinstance(ob_space, Discrete) or isinstance(ob_space, Box) or isinstance(ob_space, MultiDiscrete), \ 'Can only deal with Discrete and Box observation spaces for now' - return tf.placeholder(shape=(batch_size,) + ob_space.shape, dtype=ob_space.dtype, name=name) + dtype = ob_space.dtype + if dtype == np.int8: + dtype = np.uint8 + + return tf.placeholder(shape=(batch_size,) + ob_space.shape, dtype=dtype, name=name) def observation_input(ob_space, batch_size=None, name='Ob'): @@ -48,9 +53,12 @@ def encode_observation(ob_space, placeholder): ''' if isinstance(ob_space, Discrete): return tf.to_float(tf.one_hot(placeholder, ob_space.n)) - elif isinstance(ob_space, Box): return tf.to_float(placeholder) + elif isinstance(ob_space, MultiDiscrete): + placeholder = tf.cast(placeholder, tf.int32) + one_hots = [tf.to_float(tf.one_hot(placeholder[..., i], ob_space.nvec[i])) for i in range(placeholder.shape[-1])] + return tf.concat(one_hots, axis=-1) else: raise NotImplementedError diff --git a/baselines/common/running_stat.py b/baselines/common/running_stat.py deleted file mode 100644 index b9aa86c..0000000 --- a/baselines/common/running_stat.py +++ /dev/null @@ -1,46 +0,0 @@ -import numpy as np - -# http://www.johndcook.com/blog/standard_deviation/ -class 
RunningStat(object): - def __init__(self, shape): - self._n = 0 - self._M = np.zeros(shape) - self._S = np.zeros(shape) - def push(self, x): - x = np.asarray(x) - assert x.shape == self._M.shape - self._n += 1 - if self._n == 1: - self._M[...] = x - else: - oldM = self._M.copy() - self._M[...] = oldM + (x - oldM)/self._n - self._S[...] = self._S + (x - oldM)*(x - self._M) - @property - def n(self): - return self._n - @property - def mean(self): - return self._M - @property - def var(self): - return self._S/(self._n - 1) if self._n > 1 else np.square(self._M) - @property - def std(self): - return np.sqrt(self.var) - @property - def shape(self): - return self._M.shape - -def test_running_stat(): - for shp in ((), (3,), (3,4)): - li = [] - rs = RunningStat(shp) - for _ in range(5): - val = np.random.randn(*shp) - rs.push(val) - li.append(val) - m = np.mean(li, axis=0) - assert np.allclose(rs.mean, m) - v = np.square(m) if (len(li) == 1) else np.var(li, ddof=1, axis=0) - assert np.allclose(rs.var, v) diff --git a/baselines/common/tests/envs/identity_env.py b/baselines/common/tests/envs/identity_env.py index 005d3ff..4429f04 100644 --- a/baselines/common/tests/envs/identity_env.py +++ b/baselines/common/tests/envs/identity_env.py @@ -1,7 +1,7 @@ import numpy as np from abc import abstractmethod from gym import Env -from gym.spaces import Discrete, Box +from gym.spaces import MultiDiscrete, Discrete, Box class IdentityEnv(Env): @@ -53,6 +53,19 @@ class DiscreteIdentityEnv(IdentityEnv): def _get_reward(self, actions): return 1 if self.state == actions else 0 +class MultiDiscreteIdentityEnv(IdentityEnv): + def __init__( + self, + dims, + episode_len=None, + ): + + self.action_space = MultiDiscrete(dims) + super().__init__(episode_len=episode_len) + + def _get_reward(self, actions): + return 1 if all(self.state == actions) else 0 + class BoxIdentityEnv(IdentityEnv): def __init__( diff --git a/baselines/common/tests/test_cartpole.py b/baselines/common/tests/test_cartpole.py index 06d65e4..475ad1d 100644 --- a/baselines/common/tests/test_cartpole.py +++ b/baselines/common/tests/test_cartpole.py @@ -13,6 +13,7 @@ common_kwargs = dict( learn_kwargs = { 'a2c' : dict(nsteps=32, value_network='copy', lr=0.05), + 'acer': dict(value_network='copy'), 'acktr': dict(nsteps=32, value_network='copy', is_async=False), 'deepq': dict(total_timesteps=20000), 'ppo2': dict(value_network='copy'), @@ -40,4 +41,4 @@ def test_cartpole(alg): reward_per_episode_test(env_fn, learn_fn, 100) if __name__ == '__main__': - test_cartpole('deepq') + test_cartpole('acer') diff --git a/baselines/common/tests/test_identity.py b/baselines/common/tests/test_identity.py index 744ed83..c950e5a 100644 --- a/baselines/common/tests/test_identity.py +++ b/baselines/common/tests/test_identity.py @@ -1,5 +1,5 @@ import pytest -from baselines.common.tests.envs.identity_env import DiscreteIdentityEnv, BoxIdentityEnv +from baselines.common.tests.envs.identity_env import DiscreteIdentityEnv, BoxIdentityEnv, MultiDiscreteIdentityEnv from baselines.run import get_learn_function from baselines.common.tests.util import simple_test @@ -20,8 +20,9 @@ learn_kwargs = { } -algos_disc = ['a2c', 'deepq', 'ppo2', 'trpo_mpi'] -algos_cont = ['a2c', 'ddpg', 'ppo2', 'trpo_mpi'] +algos_disc = ['a2c', 'acktr', 'deepq', 'ppo2', 'trpo_mpi'] +algos_multidisc = ['a2c', 'acktr', 'ppo2', 'trpo_mpi'] +algos_cont = ['a2c', 'acktr', 'ddpg', 'ppo2', 'trpo_mpi'] @pytest.mark.slow @pytest.mark.parametrize("alg", algos_disc) @@ -38,6 +39,21 @@ def test_discrete_identity(alg): 
env_fn = lambda: DiscreteIdentityEnv(10, episode_len=100) simple_test(env_fn, learn_fn, 0.9) +@pytest.mark.slow +@pytest.mark.parametrize("alg", algos_multidisc) +def test_multidiscrete_identity(alg): + ''' + Test if the algorithm (with an mlp policy) + can learn an identity transformation (i.e. return observation as an action) + ''' + + kwargs = learn_kwargs[alg] + kwargs.update(common_kwargs) + + learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) + env_fn = lambda: MultiDiscreteIdentityEnv((3,3), episode_len=100) + simple_test(env_fn, learn_fn, 0.9) + @pytest.mark.slow @pytest.mark.parametrize("alg", algos_cont) def test_continuous_identity(alg): @@ -55,5 +71,5 @@ def test_continuous_identity(alg): simple_test(env_fn, learn_fn, -0.1) if __name__ == '__main__': - test_continuous_identity('ddpg') + test_multidiscrete_identity('acktr') diff --git a/baselines/common/tests/test_mnist.py b/baselines/common/tests/test_mnist.py index 536164f..eea094d 100644 --- a/baselines/common/tests/test_mnist.py +++ b/baselines/common/tests/test_mnist.py @@ -17,8 +17,7 @@ common_kwargs = { learn_args = { 'a2c': dict(total_timesteps=50000), - # TODO need to resolve inference (step) API differences for acer; also slow - # 'acer': dict(seed=0, total_timesteps=1000), + 'acer': dict(total_timesteps=20000), 'deepq': dict(total_timesteps=5000), 'acktr': dict(total_timesteps=30000), 'ppo2': dict(total_timesteps=50000, lr=1e-3, nsteps=128, ent_coef=0.0), @@ -47,4 +46,4 @@ def test_mnist(alg): simple_test(env_fn, learn_fn, 0.6) if __name__ == '__main__': - test_mnist('deepq') + test_mnist('acer') diff --git a/baselines/common/tests/test_serialization.py b/baselines/common/tests/test_serialization.py index f46b578..fac4929 100644 --- a/baselines/common/tests/test_serialization.py +++ b/baselines/common/tests/test_serialization.py @@ -17,6 +17,7 @@ learn_kwargs = { 'deepq': {}, 'a2c': {}, 'acktr': {}, + 'acer': {}, 'ppo2': {'nminibatches': 1, 'nsteps': 10}, 'trpo_mpi': {}, } @@ -37,7 +38,7 @@ def test_serialization(learn_fn, network_fn): ''' - if network_fn.endswith('lstm') and learn_fn in ['acktr', 'trpo_mpi', 'deepq']: + if network_fn.endswith('lstm') and learn_fn in ['acer', 'acktr', 'trpo_mpi', 'deepq']: # TODO make acktr work with recurrent policies # and test # github issue: https://github.com/openai/baselines/issues/660 diff --git a/baselines/common/vec_env/dummy_vec_env.py b/baselines/common/vec_env/dummy_vec_env.py index 60db11d..2b4d2ba 100644 --- a/baselines/common/vec_env/dummy_vec_env.py +++ b/baselines/common/vec_env/dummy_vec_env.py @@ -20,8 +20,11 @@ class DummyVecEnv(VecEnv): env = self.envs[0] VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space) obs_space = env.observation_space + if isinstance(obs_space, spaces.MultiDiscrete): + obs_space.shape = obs_space.shape[0] self.keys, shapes, dtypes = obs_space_info(obs_space) + self.buf_obs = { k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys } self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool) self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32) diff --git a/baselines/common/vec_env/vec_frame_stack.py b/baselines/common/vec_env/vec_frame_stack.py index 9185873..1b7a695 100644 --- a/baselines/common/vec_env/vec_frame_stack.py +++ b/baselines/common/vec_env/vec_frame_stack.py @@ -28,6 +28,3 @@ class VecFrameStack(VecEnvWrapper): self.stackedobs[...] 
= 0 self.stackedobs[..., -obs.shape[-1]:] = obs return self.stackedobs - - def close(self): - self.venv.close() diff --git a/baselines/ddpg/ddpg.py b/baselines/ddpg/ddpg.py index 181f923..8b8659b 100755 --- a/baselines/ddpg/ddpg.py +++ b/baselines/ddpg/ddpg.py @@ -7,7 +7,7 @@ from baselines.ddpg.ddpg_learner import DDPG from baselines.ddpg.models import Actor, Critic from baselines.ddpg.memory import Memory from baselines.ddpg.noise import AdaptiveParamNoiseSpec, NormalActionNoise, OrnsteinUhlenbeckActionNoise - +from baselines.common import set_global_seeds import baselines.common.tf_util as U from baselines import logger @@ -41,6 +41,7 @@ def learn(network, env, param_noise_adaption_interval=50, **network_kwargs): + set_global_seeds(seed) if total_timesteps is not None: assert nb_epochs is None diff --git a/baselines/deepq/deepq.py b/baselines/deepq/deepq.py index c6004b2..b7b9d1a 100644 --- a/baselines/deepq/deepq.py +++ b/baselines/deepq/deepq.py @@ -124,16 +124,12 @@ def learn(env, ------- env: gym.Env environment to train on - q_func: (tf.Variable, int, str, bool) -> tf.Variable - the model that takes the following inputs: - observation_in: object - the output of observation placeholder - num_actions: int - number of actions - scope: str - reuse: bool - should be passed to outer variable scope - and returns a tensor of shape (batch_size, num_actions) with values of every action. + network: string or a function + neural network to use as a q function approximator. If string, has to be one of the names of registered models in baselines.common.models + (mlp, cnn, conv_only). If a function, should take an observation tensor and return a latent variable tensor, which + will be mapped to the Q function heads (see build_q_func in baselines.deepq.models for details on that) + seed: int or None + prng seed. The runs with the same seed "should" give the same results. If None, no seeding is used. lr: float learning rate for adam optimizer total_timesteps: int @@ -173,6 +169,8 @@ def learn(env, to 1.0. If set to None equals to total_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. + param_noise: bool + whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. diff --git a/baselines/deepq/utils.py b/baselines/deepq/utils.py index 0fb1569..5176f32 100644 --- a/baselines/deepq/utils.py +++ b/baselines/deepq/utils.py @@ -18,11 +18,11 @@ class TfInput(object): """Return the tf variable(s) representing the possibly postprocessed value of placeholder(s). """ - raise NotImplemented() + raise NotImplementedError def make_feed_dict(data): """Given data input it to the placeholder(s).""" - raise NotImplemented() + raise NotImplementedError class PlaceholderTfInput(TfInput): diff --git a/baselines/her/README.md b/baselines/her/README.md index 6bd02b4..9934c69 100644 --- a/baselines/her/README.md +++ b/baselines/her/README.md @@ -30,3 +30,51 @@ python -m baselines.her.experiment.train --num_cpu 19 This will require a machine with sufficient amount of physical CPU cores. In our experiments, we used [Azure's D15v2 instances](https://docs.microsoft.com/en-us/azure/virtual-machines/linux/sizes), which have 20 physical cores. We only scheduled the experiment on 19 of those to leave some head-room on the system. 
+
+
+## Hindsight Experience Replay with Demonstrations
+Using pre-recorded demonstrations to overcome the exploration problem in HER-based reinforcement learning.
+For details, please read the [paper](https://arxiv.org/pdf/1709.10089.pdf).
+
+### Getting started
+The first step is to generate the demonstration dataset. This can be done in two ways: either use a VR system to manipulate the arm with physical VR trackers, or write a script that carries out the task. Some tasks are complex enough that writing a hardcoded script for them is difficult (e.g. Fetch Push), but our focus here is on providing an algorithm that helps the agent learn from demonstrations, not on the demonstration-generation paradigm itself, so the data collection part is left to the reader's choice.
+
+We provide a script for the Fetch Pick and Place task; to generate demonstrations for it, execute:
+```bash
+python experiment/data_generation/fetch_data_generation.py
+```
+This outputs the ```data_fetch_random_100.npz``` file, which is our data file.
+
+#### Configuration
+The provided configuration trains an agent with HER without demonstrations. To make the HER algorithm learn from demonstrations, we need to change a few parameters; to do that, set:
+
+* bc_loss: 1 - whether or not to use the behavior cloning loss as an auxiliary loss
+* q_filter: 1 - whether or not a Q value filter should be used on the actor outputs
+* num_demo: 100 - number of expert demo episodes
+* demo_batch_size: 128 - number of samples to be used from the demonstrations buffer, per MPI thread
+* prm_loss_weight: 0.001 - weight corresponding to the primary loss
+* aux_loss_weight: 0.0078 - weight corresponding to the auxiliary loss, also called the cloning loss
+
+Apart from these changes, the reported results also use the following configuration changes:
+
+* n_cycles: 20 - per epoch
+* batch_size: 1024 - per MPI thread, total batch size
+* random_eps: 0.1 - percentage of time a random action is taken
+* noise_eps: 0.1 - std of Gaussian noise added to not-completely-random actions
+
+Now train an agent with pre-recorded demonstrations:
+```bash
+python -m baselines.her.experiment.train --env=FetchPickAndPlace-v0 --n_epochs=1000 --demo_file=/Path/to/demo_file.npz --num_cpu=1
+```
+
+This will train a DDPG+HER agent on the `FetchPickAndPlace` environment using previously generated demonstration data.
+To inspect what the agent has learned, use the play script as described above.
+
+### Results
+Training with demonstrations helps overcome the exploration problem and achieves faster and better convergence. The following graphs contrast training with and without demonstration data; we report the mean Q values vs. epoch and the success rate vs. epoch:
+
+
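
A closing note on the API change at the heart of this patch: `nstack` is no longer passed to `acer.learn`; frame stacking is expected to come from a `VecFrameStack` wrapper around the vectorized environment (and `learn` wraps the env with `nstack=1` if no wrapper is present). Below is a minimal, illustrative sketch of invoking ACER after the refactor — the environment id, `num_env`, `nstack=4`, and timestep count are assumptions for illustration, not values taken from this diff.

```python
# Illustrative sketch only: how ACER is driven after this refactor, with frame
# stacking supplied by the VecFrameStack wrapper instead of an nstack argument.
from baselines.acer import acer
from baselines.common.cmd_util import make_vec_env
from baselines.common.vec_env.vec_frame_stack import VecFrameStack

# env id and num_env are assumptions; make_vec_env builds the monitored vec env
venv = make_vec_env('PongNoFrameskip-v4', 'atari', num_env=4, seed=0)
venv = VecFrameStack(venv, 4)  # learn() reads env.nstack from this wrapper

# nsteps matches the new learn() default; total_timesteps kept small for a smoke run
acer.learn(network='cnn', env=venv, seed=0, nsteps=20, total_timesteps=int(1e5))
venv.close()
```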