"""Set of wrappers for normalizing observations and rewards."""
import numpy as np

import gym
from gym.utils.step_api_compatibility import step_api_compatibility


# taken from https://github.com/openai/baselines/blob/master/baselines/common/vec_env/vec_normalize.py
class RunningMeanStd:
    """Tracks the mean, variance and count of values."""

    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
    def __init__(self, epsilon=1e-4, shape=()):
        """Initializes the running statistics with a small pseudo-count ``epsilon`` and values of shape ``shape``."""
        self.mean = np.zeros(shape, "float64")
        self.var = np.ones(shape, "float64")
        self.count = epsilon

    def update(self, x):
        """Updates the mean, var and count from a batch of samples."""
        batch_mean = np.mean(x, axis=0)
        batch_var = np.var(x, axis=0)
        batch_count = x.shape[0]
        self.update_from_moments(batch_mean, batch_var, batch_count)

    def update_from_moments(self, batch_mean, batch_var, batch_count):
        """Updates from batch mean, variance and count moments."""
        self.mean, self.var, self.count = update_mean_var_count_from_moments(
            self.mean, self.var, self.count, batch_mean, batch_var, batch_count
        )


def update_mean_var_count_from_moments(
    mean, var, count, batch_mean, batch_var, batch_count
):
    """Updates the mean, var and count using the previous mean, var, count and batch values."""
    delta = batch_mean - mean
    tot_count = count + batch_count

    new_mean = mean + delta * batch_count / tot_count
    m_a = var * count
    m_b = batch_var * batch_count
    M2 = m_a + m_b + np.square(delta) * count * batch_count / tot_count
    new_var = M2 / tot_count
    new_count = tot_count

    return new_mean, new_var, new_count
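

# Illustrative example (not part of the original module): a minimal sketch of how
# `RunningMeanStd` merges per-batch moments via `update_mean_var_count_from_moments`
# (the parallel variance algorithm referenced above). The helper name
# `_demo_running_mean_std` is hypothetical and only serves as a usage illustration.
def _demo_running_mean_std():
    rng = np.random.default_rng(0)
    rms = RunningMeanStd(shape=(3,))
    batches = [rng.normal(loc=2.0, scale=0.5, size=(256, 3)) for _ in range(8)]
    for batch in batches:
        rms.update(batch)
    data = np.concatenate(batches, axis=0)
    # The incrementally merged moments agree with the moments of the concatenated
    # data, up to the tiny `epsilon` pseudo-count used at initialization.
    assert np.allclose(rms.mean, data.mean(axis=0), atol=1e-3)
    assert np.allclose(rms.var, data.var(axis=0), atol=1e-3)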
class NormalizeObservation(gym.core.Wrapper):
    """This wrapper will normalize observations s.t. each coordinate is centered with unit variance.

    Note:
        The normalization depends on past trajectories and observations will not be normalized correctly if the wrapper was
        newly instantiated or the policy was changed recently.
    """

    def __init__(self, env: gym.Env, epsilon: float = 1e-8, new_step_api: bool = False):
        """This wrapper will normalize observations s.t. each coordinate is centered with unit variance.

        Args:
            env (Env): The environment to apply the wrapper
            epsilon: A stability parameter that is used when scaling the observations.
            new_step_api (bool): Whether the wrapper's step method outputs two booleans (new API) or one boolean (old API)
        """
        super().__init__(env, new_step_api)
        self.num_envs = getattr(env, "num_envs", 1)
        self.is_vector_env = getattr(env, "is_vector_env", False)
        if self.is_vector_env:
            self.obs_rms = RunningMeanStd(shape=self.single_observation_space.shape)
        else:
            self.obs_rms = RunningMeanStd(shape=self.observation_space.shape)
        self.epsilon = epsilon

    def step(self, action):
        """Steps through the environment and normalizes the observation."""
        obs, rews, terminateds, truncateds, infos = step_api_compatibility(
            self.env.step(action), True, self.is_vector_env
        )
        if self.is_vector_env:
            obs = self.normalize(obs)
        else:
            obs = self.normalize(np.array([obs]))[0]
        return step_api_compatibility(
            (obs, rews, terminateds, truncateds, infos),
            self.new_step_api,
            self.is_vector_env,
        )

    def reset(self, **kwargs):
        """Resets the environment and normalizes the observation."""
        if kwargs.get("return_info", False):
            obs, info = self.env.reset(**kwargs)

            if self.is_vector_env:
                return self.normalize(obs), info
            else:
                return self.normalize(np.array([obs]))[0], info
        else:
            obs = self.env.reset(**kwargs)

            if self.is_vector_env:
                return self.normalize(obs)
            else:
                return self.normalize(np.array([obs]))[0]

    def normalize(self, obs):
        """Normalizes the observation using the running mean and variance of the observations."""
        self.obs_rms.update(obs)
        return (obs - self.obs_rms.mean) / np.sqrt(self.obs_rms.var + self.epsilon)
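

# Illustrative example (not part of the original module): a minimal usage sketch for
# `NormalizeObservation`, assuming a registered environment id such as "CartPole-v1"
# and the default old (4-tuple) step API. The helper name `_demo_normalize_observation`
# is hypothetical.
def _demo_normalize_observation():
    env = NormalizeObservation(gym.make("CartPole-v1"))
    obs = env.reset()
    for _ in range(1000):
        obs, reward, done, info = env.step(env.action_space.sample())
        if done:
            obs = env.reset()
    # After enough steps, each observation coordinate is approximately centered
    # with unit variance (up to the error of the running estimates).
    env.close()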
class NormalizeReward(gym.core.Wrapper):
    r"""This wrapper will normalize immediate rewards s.t. their exponential moving average has a fixed variance.

    The exponential moving average will have variance :math:`(1 - \gamma)^2`.

    Note:
        The scaling depends on past trajectories and rewards will not be scaled correctly if the wrapper was newly
        instantiated or the policy was changed recently.
    """

    def __init__(
        self,
        env: gym.Env,
        gamma: float = 0.99,
        epsilon: float = 1e-8,
        new_step_api: bool = False,
    ):
        """This wrapper will normalize immediate rewards s.t. their exponential moving average has a fixed variance.

        Args:
            env (Env): The environment to apply the wrapper
            epsilon (float): A stability parameter that is used when scaling the rewards.
            gamma (float): The discount factor that is used in the exponential moving average.
            new_step_api (bool): Whether the wrapper's step method outputs two booleans (new API) or one boolean (old API)
        """
        super().__init__(env, new_step_api)
        self.num_envs = getattr(env, "num_envs", 1)
        self.is_vector_env = getattr(env, "is_vector_env", False)
        self.return_rms = RunningMeanStd(shape=())
        self.returns = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step(self, action):
        """Steps through the environment, normalizing the rewards returned."""
        obs, rews, terminateds, truncateds, infos = step_api_compatibility(
            self.env.step(action), True, self.is_vector_env
        )
        if not self.is_vector_env:
            rews = np.array([rews])
        self.returns = self.returns * self.gamma + rews
        rews = self.normalize(rews)
        if not self.is_vector_env:
            dones = terminateds or truncateds
        else:
            dones = np.bitwise_or(terminateds, truncateds)
        self.returns[dones] = 0.0
        if not self.is_vector_env:
            rews = rews[0]
        return step_api_compatibility(
            (obs, rews, terminateds, truncateds, infos),
            self.new_step_api,
            self.is_vector_env,
        )

    def normalize(self, rews):
        """Normalizes the rewards with the running mean rewards and their variance."""
        self.return_rms.update(self.returns)
        return rews / np.sqrt(self.return_rms.var + self.epsilon)
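

# Illustrative example (not part of the original module): a minimal usage sketch for
# `NormalizeReward`, assuming a registered environment id such as "CartPole-v1" and
# the default old (4-tuple) step API. The rewards returned by `step` are divided by
# the running standard deviation of a discounted return estimate, so their scale is
# roughly independent of the environment's raw reward magnitude. The helper name
# `_demo_normalize_reward` is hypothetical.
def _demo_normalize_reward():
    env = NormalizeReward(gym.make("CartPole-v1"), gamma=0.99)
    env.reset()
    scaled_rewards = []
    for _ in range(1000):
        _, reward, done, _ = env.step(env.action_space.sample())
        scaled_rewards.append(reward)
        if done:
            env.reset()
    env.close()
    return np.asarray(scaled_rewards)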