"""A collection of stateful observation wrappers.
* ``DelayObservation`` - A wrapper for delaying the returned observation
* ``TimeAwareObservation`` - A wrapper for adding time aware observations to environment observation
* ``FrameStackObservation`` - Frame stack the observations
* ``NormalizeObservation`` - Normalized the observations to have unit variance with a moving mean
* ``MaxAndSkipObservation`` - Return only every ``skip``-th frame (frameskipping) and return the max between the two last frames.
"""
from __future__ import annotations
from collections import deque
from copy import deepcopy
from typing import Any, Final, SupportsFloat
import numpy as np
import gymnasium as gym
import gymnasium.spaces as spaces
from gymnasium.core import ActType, ObsType, WrapperActType, WrapperObsType
from gymnasium.spaces import Box, Dict, Tuple
from gymnasium.vector.utils import batch_space, concatenate, create_empty_array
from gymnasium.wrappers.utils import RunningMeanStd, create_zero_array
__all__ = [
"DelayObservation",
"TimeAwareObservation",
"FrameStackObservation",
"NormalizeObservation",
"MaxAndSkipObservation",
]
class DelayObservation(
    gym.ObservationWrapper[ObsType, ActType, ObsType], gym.utils.RecordConstructorArgs
):
    """Adds a delay to the returned observation from the environment.

    Before reaching the :attr:`delay` number of timesteps, returned observations is an array of zeros with
    the same shape as the observation space.

    No vector version of the wrapper exists.

    Note:
        This does not support random delay values, if users are interested, please raise an issue or pull request to add this feature.

    Example:
        >>> import gymnasium as gym
        >>> env = gym.make("CartPole-v1")
        >>> env.reset(seed=123)
        (array([ 0.01823519, -0.0446179 , -0.02796401, -0.03156282], dtype=float32), {})

        >>> env = DelayObservation(env, delay=2)
        >>> env.reset(seed=123)
        (array([0., 0., 0., 0.], dtype=float32), {})
        >>> env.step(env.action_space.sample())
        (array([0., 0., 0., 0.], dtype=float32), 1.0, False, False, {})
        >>> env.step(env.action_space.sample())
        (array([ 0.01823519, -0.0446179 , -0.02796401, -0.03156282], dtype=float32), 1.0, False, False, {})

    Change logs:
     * v1.0.0 - Initially added
    """

    def __init__(self, env: gym.Env[ObsType, ActType], delay: int):
        """Initialises the DelayObservation wrapper with an integer.

        Args:
            env: The environment to wrap
            delay: The number of timesteps to delay observations

        Raises:
            TypeError: If ``delay`` is not an integer.
            ValueError: If ``delay`` is negative.
        """
        if not np.issubdtype(type(delay), np.integer):
            raise TypeError(
                f"The delay is expected to be an integer, actual type: {type(delay)}"
            )
        # `delay == 0` is valid and makes the wrapper a no-op, hence ">= zero".
        if not 0 <= delay:
            raise ValueError(
                f"The delay needs to be greater than or equal to zero, actual value: {delay}"
            )

        gym.utils.RecordConstructorArgs.__init__(self, delay=delay)
        gym.ObservationWrapper.__init__(self, env)

        self.delay: Final[int] = int(delay)
        self.observation_queue: Final[deque] = deque()

    def reset(
        self, *, seed: int | None = None, options: dict[str, Any] | None = None
    ) -> tuple[ObsType, dict[str, Any]]:
        """Resets the environment, clearing the observation queue."""
        self.observation_queue.clear()

        return super().reset(seed=seed, options=options)

    def observation(self, observation: ObsType) -> ObsType:
        """Return the delayed observation."""
        self.observation_queue.append(observation)

        # Until `delay` observations have accumulated, pad with a zero observation.
        if len(self.observation_queue) > self.delay:
            return self.observation_queue.popleft()
        else:
            return create_zero_array(self.observation_space)


class TimeAwareObservation(
    gym.ObservationWrapper[WrapperObsType, ActType, ObsType],
    gym.utils.RecordConstructorArgs,
):
    """Augment the observation with the number of time steps taken within an episode.

    The :attr:`normalize_time` if ``True`` represents time as a normalized value between [0,1]
    otherwise if ``False``, the current timestep is an integer.

    For environments with ``Dict`` observation spaces, the time information is automatically
    added in the key `"time"` (can be changed through :attr:`dict_time_key`) and for environments with ``Tuple``
    observation space, the time information is added as the final element in the tuple.
    Otherwise, the observation space is transformed into a ``Dict`` observation space with two keys,
    `"obs"` for the base environment's observation and `"time"` for the time information.

    To flatten the observation, use the :attr:`flatten` parameter which will use the
    :func:`gymnasium.spaces.utils.flatten` function.

    No vector version of the wrapper exists.

    Example:
        >>> import gymnasium as gym
        >>> from gymnasium.wrappers import TimeAwareObservation
        >>> env = gym.make("CartPole-v1")
        >>> env = TimeAwareObservation(env)
        >>> env.observation_space
        Box([-4.80000019e+00 -3.40282347e+38 -4.18879032e-01 -3.40282347e+38
          0.00000000e+00], [4.80000019e+00 3.40282347e+38 4.18879032e-01 3.40282347e+38
          5.00000000e+02], (5,), float64)
        >>> env.reset(seed=42)[0]
        array([ 0.0273956 , -0.00611216,  0.03585979,  0.0197368 ,  0.        ])
        >>> _ = env.action_space.seed(42)
        >>> env.step(env.action_space.sample())[0]
        array([ 0.02727336, -0.20172954,  0.03625453,  0.32351476,  1.        ])

    Normalize time observation space example:
        >>> env = gym.make('CartPole-v1')
        >>> env = TimeAwareObservation(env, normalize_time=True)
        >>> env.observation_space
        Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38
          0.0000000e+00], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38
          1.0000000e+00], (5,), float32)
        >>> env.reset(seed=42)[0]
        array([ 0.0273956 , -0.00611216,  0.03585979,  0.0197368 ,  0.        ],
              dtype=float32)
        >>> _ = env.action_space.seed(42)
        >>> env.step(env.action_space.sample())[0]
        array([ 0.02727336, -0.20172954,  0.03625453,  0.32351476,  0.002     ],
              dtype=float32)

    Unflattened observation space example:
        >>> env = gym.make("CartPole-v1")
        >>> env = TimeAwareObservation(env, flatten=False)
        >>> env.observation_space
        Dict('obs': Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32), 'time': Box(0, 500, (1,), int32))
        >>> env.reset(seed=42)[0]
        {'obs': array([ 0.0273956 , -0.00611216,  0.03585979,  0.0197368 ], dtype=float32), 'time': array([0], dtype=int32)}
        >>> _ = env.action_space.seed(42)
        >>> env.step(env.action_space.sample())[0]
        {'obs': array([ 0.02727336, -0.20172954,  0.03625453,  0.32351476], dtype=float32), 'time': array([1], dtype=int32)}

    Change logs:
     * v0.18.0 - Initially added
     * v1.0.0 - Remove vector environment support, add ``flatten`` and ``normalize_time`` parameters
    """

    def __init__(
        self,
        env: gym.Env[ObsType, ActType],
        flatten: bool = True,
        normalize_time: bool = False,
        *,
        dict_time_key: str = "time",
    ):
        """Initialize :class:`TimeAwareObservation`.

        Args:
            env: The environment to apply the wrapper
            flatten: Flatten the observation to a `Box` of a single dimension
            normalize_time: if `True` return time in the range [0,1]
                otherwise return time as the number of timesteps taken within the episode
            dict_time_key: For environment with a ``Dict`` observation space, the key for the time space. By default, `"time"`.

        Raises:
            ValueError: If the environment's spec does not define ``max_episode_steps``.
        """
        gym.utils.RecordConstructorArgs.__init__(
            self,
            flatten=flatten,
            normalize_time=normalize_time,
            dict_time_key=dict_time_key,
        )
        gym.ObservationWrapper.__init__(self, env)

        self.flatten: Final[bool] = flatten
        self.normalize_time: Final[bool] = normalize_time

        # We don't need to keep if a TimeLimit wrapper exists as `spec` will do that work for us now
        if env.spec is not None and env.spec.max_episode_steps is not None:
            self.max_timesteps = env.spec.max_episode_steps
        else:
            raise ValueError(
                "The environment must be wrapped by a TimeLimit wrapper or the spec specify a `max_episode_steps`."
            )

        self.timesteps: int = 0

        # Find the normalized time space
        if self.normalize_time:
            self._time_preprocess_func = lambda time: np.array(
                [time / self.max_timesteps], dtype=np.float32
            )
            time_space = Box(0.0, 1.0)
        else:
            self._time_preprocess_func = lambda time: np.array([time], dtype=np.int32)
            time_space = Box(0, self.max_timesteps, dtype=np.int32)

        # Find the observation space
        if isinstance(env.observation_space, Dict):
            assert dict_time_key not in env.observation_space.keys()
            observation_space = Dict(
                {dict_time_key: time_space, **env.observation_space.spaces}
            )
            self._append_data_func = lambda obs, time: {dict_time_key: time, **obs}
        elif isinstance(env.observation_space, Tuple):
            observation_space = Tuple(env.observation_space.spaces + (time_space,))
            self._append_data_func = lambda obs, time: obs + (time,)
        else:
            observation_space = Dict(obs=env.observation_space, time=time_space)
            self._append_data_func = lambda obs, time: {"obs": obs, "time": time}

        # If to flatten the observation space
        if self.flatten:
            self.observation_space: gym.Space[WrapperObsType] = spaces.flatten_space(
                observation_space
            )
            self._obs_postprocess_func = lambda obs: spaces.flatten(
                observation_space, obs
            )
        else:
            self.observation_space: gym.Space[WrapperObsType] = observation_space
            self._obs_postprocess_func = lambda obs: obs

    def observation(self, observation: ObsType) -> WrapperObsType:
        """Adds to the observation with the current time information.

        Args:
            observation: The observation to add the time step to

        Returns:
            The observation with the time information appended to it
        """
        return self._obs_postprocess_func(
            self._append_data_func(
                observation, self._time_preprocess_func(self.timesteps)
            )
        )

    def step(
        self, action: ActType
    ) -> tuple[WrapperObsType, SupportsFloat, bool, bool, dict[str, Any]]:
        """Steps through the environment, incrementing the time step.

        Args:
            action: The action to take

        Returns:
            The environment's step using the action with the next observation containing the timestep info
        """
        self.timesteps += 1

        return super().step(action)

    def reset(
        self, *, seed: int | None = None, options: dict[str, Any] | None = None
    ) -> tuple[WrapperObsType, dict[str, Any]]:
        """Reset the environment setting the time to zero.

        Args:
            seed: The seed to reset the environment
            options: The options used to reset the environment

        Returns:
            Resets the environment with the initial timestep info added the observation
        """
        self.timesteps = 0

        return super().reset(seed=seed, options=options)


class FrameStackObservation(
    gym.Wrapper[WrapperObsType, ActType, ObsType, ActType],
    gym.utils.RecordConstructorArgs,
):
    """Stacks the observations from the last ``N`` time steps in a rolling manner.

    For example, if the number of stacks is 4, then the returned observation contains
    the most recent 4 observations. For environment 'Pendulum-v1', the original observation
    is an array with shape [3], so if we stack 4 observations, the processed observation
    has shape [4, 3].

    No vector version of the wrapper exists.

    Note:
        - After :meth:`reset` is called, the frame buffer will be filled with the initial observation.
          I.e. the observation returned by :meth:`reset` will consist of `num_stack` many identical frames.

    Example:
        >>> import gymnasium as gym
        >>> from gymnasium.wrappers import FrameStackObservation
        >>> env = gym.make("CarRacing-v2")
        >>> env = FrameStackObservation(env, 4)
        >>> env.observation_space
        Box(0, 255, (4, 96, 96, 3), uint8)
        >>> obs, _ = env.reset()
        >>> obs.shape
        (4, 96, 96, 3)

    Change logs:
     * v0.15.0 - Initially add as ``FrameStack`` with support for lz4
     * v1.0.0 - Rename to ``FrameStackObservation`` and remove lz4 and ``LazyFrame`` support
    """

    def __init__(
        self,
        env: gym.Env[ObsType, ActType],
        stack_size: int,
        *,
        zeros_obs: ObsType | None = None,
    ):
        """Observation wrapper that stacks the observations in a rolling manner.

        Args:
            env: The environment to apply the wrapper
            stack_size: The number of frames to stack with zero_obs being used originally.
            zeros_obs: Keyword only parameter that allows a custom padding observation at :meth:`reset`

        Raises:
            TypeError: If ``stack_size`` is not an integer.
            ValueError: If ``stack_size`` is not greater than one.
        """
        if not np.issubdtype(type(stack_size), np.integer):
            raise TypeError(
                f"The stack_size is expected to be an integer, actual type: {type(stack_size)}"
            )
        if not 1 < stack_size:
            raise ValueError(
                f"The stack_size needs to be greater than one, actual value: {stack_size}"
            )

        gym.utils.RecordConstructorArgs.__init__(self, stack_size=stack_size)
        gym.Wrapper.__init__(self, env)

        self.observation_space = batch_space(env.observation_space, n=stack_size)
        self.stack_size: Final[int] = stack_size

        # Compare against None rather than truth-testing: a NumPy array padding
        # observation raises "truth value of an array is ambiguous" under `if zeros_obs`,
        # and an all-zero custom padding would be silently replaced by the default.
        self.zero_obs: Final[ObsType] = (
            zeros_obs
            if zeros_obs is not None
            else create_zero_array(env.observation_space)
        )
        # Rolling window of the last `stack_size` raw observations.
        self._stacked_obs = deque(
            [self.zero_obs for _ in range(self.stack_size)], maxlen=self.stack_size
        )
        # Pre-allocated output buffer reused by `concatenate` on every step.
        self._stacked_array = create_empty_array(
            env.observation_space, n=self.stack_size
        )

    def step(
        self, action: WrapperActType
    ) -> tuple[WrapperObsType, SupportsFloat, bool, bool, dict[str, Any]]:
        """Steps through the environment, appending the observation to the frame buffer.

        Args:
            action: The action to step through the environment with

        Returns:
            Stacked observations, reward, terminated, truncated, and info from the environment
        """
        obs, reward, terminated, truncated, info = super().step(action)
        self._stacked_obs.append(obs)

        # Deepcopy so the returned observation is not aliased to the shared buffer.
        updated_obs = deepcopy(
            concatenate(
                self.env.observation_space, self._stacked_obs, self._stacked_array
            )
        )
        return updated_obs, reward, terminated, truncated, info

    def reset(
        self, *, seed: int | None = None, options: dict[str, Any] | None = None
    ) -> tuple[WrapperObsType, dict[str, Any]]:
        """Reset the environment, returning the stacked observation and info.

        Args:
            seed: The environment seed
            options: The reset options

        Returns:
            The stacked observations and info
        """
        obs, info = super().reset(seed=seed, options=options)

        # Refill the window with padding before appending the initial observation.
        for _ in range(self.stack_size - 1):
            self._stacked_obs.append(self.zero_obs)
        self._stacked_obs.append(obs)

        updated_obs = deepcopy(
            concatenate(
                self.env.observation_space, self._stacked_obs, self._stacked_array
            )
        )
        return updated_obs, info


class NormalizeObservation(
    gym.ObservationWrapper[WrapperObsType, ActType, ObsType],
    gym.utils.RecordConstructorArgs,
):
    """Normalizes observations to be centered at the mean with unit variance.

    The property :attr:`update_running_mean` allows to freeze/continue the running mean calculation of the observation
    statistics. If ``True`` (default), the ``RunningMeanStd`` will get updated every time ``step`` or ``reset`` is called.
    If ``False``, the calculated statistics are used but not updated anymore; this may be used during evaluation.

    A vector version of the wrapper exists :class:`gymnasium.wrappers.vector.NormalizeObservation`.

    Note:
        The normalization depends on past trajectories and observations will not be normalized correctly if the wrapper was
        newly instantiated or the policy was changed recently.

    Example:
        >>> import numpy as np
        >>> import gymnasium as gym
        >>> env = gym.make("CartPole-v1")
        >>> obs, info = env.reset(seed=123)
        >>> term, trunc = False, False
        >>> while not (term or trunc):
        ...     obs, _, term, trunc, _ = env.step(1)
        ...
        >>> obs
        array([ 0.1511158 ,  1.7183299 , -0.25533703, -2.8914354 ], dtype=float32)
        >>> env = gym.make("CartPole-v1")
        >>> env = NormalizeObservation(env)
        >>> obs, info = env.reset(seed=123)
        >>> term, trunc = False, False
        >>> while not (term or trunc):
        ...     obs, _, term, trunc, _ = env.step(1)
        >>> obs
        array([ 2.0059888,  1.5676788, -1.9944268, -1.6120394], dtype=float32)

    Change logs:
     * v0.21.0 - Initially add
     * v1.0.0 - Add `update_running_mean` attribute to allow disabling of updating the running mean / standard, particularly useful for evaluation time.
    """

    def __init__(self, env: gym.Env[ObsType, ActType], epsilon: float = 1e-8):
        """This wrapper will normalize observations such that each observation is centered with unit variance.

        Args:
            env (Env): The environment to apply the wrapper
            epsilon: A stability parameter that is used when scaling the observations.
        """
        gym.utils.RecordConstructorArgs.__init__(self, epsilon=epsilon)
        gym.ObservationWrapper.__init__(self, env)

        # Running first/second moment estimates over everything observed so far.
        self.epsilon = epsilon
        self._update_running_mean = True
        self.obs_rms = RunningMeanStd(
            shape=self.observation_space.shape, dtype=self.observation_space.dtype
        )

    @property
    def update_running_mean(self) -> bool:
        """Property to freeze/continue the running mean calculation of the observation statistics."""
        return self._update_running_mean

    @update_running_mean.setter
    def update_running_mean(self, setting: bool):
        """Sets the property to freeze/continue the running mean calculation of the observation statistics."""
        self._update_running_mean = setting

    def observation(self, observation: ObsType) -> WrapperObsType:
        """Normalises the observation using the running mean and variance of the observations."""
        if self._update_running_mean:
            # RunningMeanStd expects a batch dimension, hence the wrapping array.
            self.obs_rms.update(np.array([observation]))

        centered = observation - self.obs_rms.mean
        return centered / np.sqrt(self.obs_rms.var + self.epsilon)
class MaxAndSkipObservation(
    gym.Wrapper[WrapperObsType, ActType, ObsType, ActType],
    gym.utils.RecordConstructorArgs,
):
    """Skips the N-th frame (observation) and return the max values between the two last observations.

    No vector version of the wrapper exists.

    Note:
        This wrapper is based on the wrapper from [stable-baselines3](https://stable-baselines3.readthedocs.io/en/master/_modules/stable_baselines3/common/atari_wrappers.html#MaxAndSkipEnv)

    Example:
        >>> import gymnasium as gym
        >>> env = gym.make("CartPole-v1")
        >>> obs0, *_ = env.reset(seed=123)
        >>> obs1, *_ = env.step(1)
        >>> obs2, *_ = env.step(1)
        >>> obs3, *_ = env.step(1)
        >>> obs4, *_ = env.step(1)
        >>> skip_and_max_obs = np.max(np.stack([obs3, obs4], axis=0), axis=0)
        >>> env = gym.make("CartPole-v1")
        >>> wrapped_env = MaxAndSkipObservation(env)
        >>> wrapped_obs0, *_ = wrapped_env.reset(seed=123)
        >>> wrapped_obs1, *_ = wrapped_env.step(1)
        >>> np.all(obs0 == wrapped_obs0)
        True
        >>> np.all(wrapped_obs1 == skip_and_max_obs)
        True

    Change logs:
     * v1.0.0 - Initially add
    """

    def __init__(self, env: gym.Env[ObsType, ActType], skip: int = 4):
        """This wrapper will return only every ``skip``-th frame (frameskipping) and return the max between the two last frames.

        Args:
            env (Env): The environment to apply the wrapper
            skip: The number of frames to skip

        Raises:
            TypeError: If ``skip`` is not an integer.
            ValueError: If ``skip`` is below two or the observation space has no shape.
        """
        gym.utils.RecordConstructorArgs.__init__(self, skip=skip)
        gym.Wrapper.__init__(self, env)

        if not np.issubdtype(type(skip), np.integer):
            raise TypeError(
                f"The skip is expected to be an integer, actual type: {type(skip)}"
            )
        if skip < 2:
            raise ValueError(
                f"The skip value needs to be equal or greater than two, actual value: {skip}"
            )
        if env.observation_space.shape is None:
            raise ValueError("The observation space must have the shape attribute.")

        self._skip = skip
        # Two-slot buffer that holds the final two frames of each skip window.
        self._obs_buffer = np.zeros(
            (2, *env.observation_space.shape), dtype=env.observation_space.dtype
        )

    def step(
        self, action: WrapperActType
    ) -> tuple[WrapperObsType, SupportsFloat, bool, bool, dict[str, Any]]:
        """Step the environment with the given action for ``skip`` steps.

        Repeat action, sum reward, and max over last observations.

        Args:
            action: The action to step through the environment with

        Returns:
            Max of the last two observations, reward, terminated, truncated, and info from the environment
        """
        total_reward = 0.0
        terminated = truncated = False
        info = {}
        # Index of the first of the final two frames in the window.
        window_tail = self._skip - 2
        for frame_idx in range(self._skip):
            obs, reward, terminated, truncated, info = self.env.step(action)
            total_reward += float(reward)
            # Only the last two frames feed the element-wise maximum.
            if frame_idx >= window_tail:
                self._obs_buffer[frame_idx - window_tail] = obs
            if terminated or truncated:
                break

        max_frame = self._obs_buffer.max(axis=0)
        return max_frame, total_reward, terminated, truncated, info