import numpy as np

from baselines.common.runners import AbstractEnvRunner

class Runner(AbstractEnvRunner):
    """
    We use this object to make a mini batch of experiences

    __init__:
    - Initialize the runner

    run():
    - Make a mini batch
    """
    def __init__(self, *, env, model, nsteps, gamma, lam):
        super().__init__(env=env, model=model, nsteps=nsteps)
        # Lambda used in GAE (Generalized Advantage Estimation)
        self.lam = lam
        # Discount rate
        self.gamma = gamma
    def run(self):
        # Here, we init the lists that will contain the mini batch of experiences
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [], [], [], [], [], []
        mb_states = self.states
        epinfos = []
        # For each of the nsteps steps
        for _ in range(self.nsteps):
            # Given observations, get actions, values and neglogpacs
            # We already have self.obs because the Runner superclass runs self.obs[:] = env.reset() on init
            actions, values, self.states, neglogpacs = self.model.step(self.obs, S=self.states, M=self.dones)
            mb_obs.append(self.obs.copy())
            mb_actions.append(actions)
            mb_values.append(values)
            mb_neglogpacs.append(neglogpacs)
            mb_dones.append(self.dones)

            # Take actions in env and look at the results
            # Infos contains a ton of useful information
            self.obs[:], rewards, self.dones, infos = self.env.step(actions)
            for info in infos:
                maybeepinfo = info.get('episode')
                if maybeepinfo:
                    epinfos.append(maybeepinfo)
            mb_rewards.append(rewards)
        # batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_actions = np.asarray(mb_actions)
        mb_values = np.asarray(mb_values, dtype=np.float32)
        mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
        mb_dones = np.asarray(mb_dones, dtype=bool)
        last_values = self.model.value(self.obs, S=self.states, M=self.dones)

        # discount/bootstrap off value fn
        mb_returns = np.zeros_like(mb_rewards)
        mb_advs = np.zeros_like(mb_rewards)
        lastgaelam = 0
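        # Note on the recursion below (explanatory comment, not original code):
        # each mb_* array is time-major with shape (nsteps, nenv, ...), and the
        # GAE advantages are computed backwards in time as
        #     delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_{t+1}) - V(s_t)
        #     A_t     = delta_t + gamma * lam * (1 - done_{t+1}) * A_{t+1}
        # where last_values bootstraps the final step and the (1 - done) factor
        # stops the recursion at episode boundaries.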
        for t in reversed(range(self.nsteps)):
            if t == self.nsteps - 1:
                nextnonterminal = 1.0 - self.dones
                nextvalues = last_values
            else:
                nextnonterminal = 1.0 - mb_dones[t+1]
                nextvalues = mb_values[t+1]
            delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t]
            mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam
        mb_returns = mb_advs + mb_values
        return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs)),
                mb_states, epinfos)

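# Usage sketch (illustrative; `venv` and `model` stand in for a vectorized env and
# a ppo2 model built elsewhere, and the nsteps/gamma/lam values are examples only):
#
#     runner = Runner(env=venv, model=model, nsteps=128, gamma=0.99, lam=0.95)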
# obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run()
def sf01(arr):
    """
    swap and then flatten axes 0 and 1
    """
    s = arr.shape
    return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])
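# Shape example (added note): mb_obs is collected time-major as (nsteps, nenv, *obs_shape),
# so sf01(mb_obs) has shape (nsteps * nenv, *obs_shape), with each environment's
# steps kept contiguous after the axis swap.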