* exported rl-algs
* more stuff from rl-algs
* run slow tests
* re-exported rl_algs
* re-exported rl_algs - fixed problems with serialization test and test_cartpole
* replaced atari_arg_parser with common_arg_parser
* run.py can run algos from both baselines and rl_algs
* added approximate humanoid reward with ppo2 into the README for reference
* dummy commit to RUN BENCHMARKS
* dummy commit to RUN BENCHMARKS
* dummy commit to RUN BENCHMARKS
* dummy commit to RUN BENCHMARKS
* very dummy commit to RUN BENCHMARKS
* serialize variables as a dict, not as a list
* running_mean_std uses tensorflow variables
* fixed import in vec_normalize
* dummy commit to RUN BENCHMARKS
* dummy commit to RUN BENCHMARKS
* flake8 complaints
* save all variables to make sure we save the vec_normalize normalization
* benchmarks on ppo2 only RUN BENCHMARKS
* make_atari_env compatible with mpi
* run ppo_mpi benchmarks only RUN BENCHMARKS
* hardcode names of retro environments
* add defaults
* changed default ppo2 lr schedule to linear RUN BENCHMARKS
* non-tf normalization benchmark RUN BENCHMARKS
* use ncpu=1 for mujoco sessions - gives a bit of a performance speedup
* reverted running_mean_std to use property decorators for mean, var, count
* reverted VecNormalize to use RunningMeanStd (no tf)
* reverted VecNormalize to use RunningMeanStd (no tf)
* profiling wip
* use VecNormalize with regular RunningMeanStd
* added acer runner (missing import)
* flake8 complaints
* added a note in README about TfRunningMeanStd and serialization of VecNormalize
* dummy commit to RUN BENCHMARKS
* merged benchmarks branch
from baselines.common.vec_env import VecEnvWrapper
from baselines.common.running_mean_std import RunningMeanStd
import numpy as np


class VecNormalize(VecEnvWrapper):
    """
    A vectorized environment wrapper that normalizes the observations
    and returns using running estimates of their mean and variance.
    """

    def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        # TensorFlow-based alternative (see the README note on TfRunningMeanStd
        # and serialization of VecNormalize):
        # self.ob_rms = TfRunningMeanStd(shape=self.observation_space.shape, scope='observation_running_mean_std') if ob else None
        # self.ret_rms = TfRunningMeanStd(shape=(), scope='return_running_mean_std') if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)  # per-environment discounted return accumulator
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        """
        Apply a sequence of actions to the sequence of environments:

        actions -> (observations, rewards, news)

        where 'news' is a boolean vector indicating whether each episode has just ended.
        """
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            # Scale rewards by the standard deviation of the discounted returns, then clip.
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew)
        return obs, rews, news, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            # Standardize observations with the running mean/variance, then clip.
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob)
            return obs
        else:
            return obs

    def reset(self):
        """
        Reset all environments
        """
        obs = self.venv.reset()
        return self._obfilt(obs)
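A minimal usage sketch (not part of the file above) of how VecNormalize is typically wrapped around a vectorized environment. It assumes gym is installed and that DummyVecEnv is importable from baselines.common.vec_env.dummy_vec_env; the CartPole-v0 environment, the 4-env count, and the random-action loop are arbitrary choices for illustration.

# Hypothetical example: normalize observations and returns of a vectorized CartPole.
import gym
import numpy as np
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env.vec_normalize import VecNormalize

venv = DummyVecEnv([lambda: gym.make('CartPole-v0') for _ in range(4)])
venv = VecNormalize(venv)  # ob=True, ret=True by default

obs = venv.reset()  # observations come back already standardized and clipped
for _ in range(100):
    actions = np.array([venv.action_space.sample() for _ in range(venv.num_envs)])
    obs, rews, news, infos = venv.step(actions)  # rewards are scaled by the running return std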