Stateful wrappers [DelayObservation ,TimeAwareObservation, StickyAction] (#165)

2025-07-31 13:54:31 +00:00 · 2022-12-02 01:04:34 +01:00
parent 678d361e62
commit d0a929c77d
10 changed files with 389 additions and 1 deletions
--- a/docs/api/experimental/wrappers.md
+++ b/docs/api/experimental/wrappers.md
@@ -19,6 +19,19 @@
 .. autoclass:: gymnasium.experimental.wrappers.ClipRewardV0
 ```
 ## Observation Wrappers
 ```{eval-rst}
 .. autoclass:: gymnasium.experimental.wrappers.TimeAwareObservationV0
 .. autoclass:: gymnasium.experimental.wrappers.DelayObservationV0
 ```
 ## Action Wrappers
 ```{eval-rst}
 .. autoclass:: gymnasium.experimental.wrappers.StickyActionV0
 ```
 ## Common Wrappers
 ```{eval-rst}
--- a/gymnasium/error.py
+++ b/gymnasium/error.py
@@ -73,6 +73,10 @@ class MissingArgument(Error):
    """Raised when a required argument in the initializer is missing."""
 class InvalidProbability(Error):
    """Raised when given an invalid value for a probability."""
 class InvalidBound(Error):
    """Raised when the clipping an array with invalid upper and/or lower bound."""
--- a/gymnasium/experimental/init.py
+++ b/gymnasium/experimental/init.py
@@ -1,4 +1,4 @@
-"""Root __init__ of the gym dev_wrappers."""
+"""Root __init__ of the gym experimental wrappers."""
 from gymnasium.experimental.functional import FuncEnv
--- a/gymnasium/experimental/wrappers/init.py
+++ b/gymnasium/experimental/wrappers/init.py
@@ -12,15 +12,23 @@ from gymnasium.experimental.wrappers.lambda_action import (
 )
 from gymnasium.experimental.wrappers.lambda_observations import LambdaObservationV0
 from gymnasium.experimental.wrappers.lambda_reward import ClipRewardV0, LambdaRewardV0
 from gymnasium.experimental.wrappers.sticky_action import StickyActionV0
 from gymnasium.experimental.wrappers.time_aware_observation import (
    TimeAwareObservationV0,
 )
 from gymnasium.experimental.wrappers.delay_observation import DelayObservationV0
 __all__ = [
    "ArgType",
    # Lambda Action
    "LambdaActionV0",
    "StickyActionV0",
    "ClipActionV0",
    "RescaleActionV0",
    # Lambda Observation
    "LambdaObservationV0",
    "DelayObservationV0",
    "TimeAwareObservationV0",
    # Lambda Reward
    "LambdaRewardV0",
    "ClipRewardV0",
--- a/gymnasium/experimental/wrappers/delay_observation.py
+++ b/gymnasium/experimental/wrappers/delay_observation.py
@@ -0,0 +1,35 @@
 """Wrapper for delaying the returned observation."""
 from collections import deque
 import jumpy as jp
 import gymnasium as gym
 from gymnasium.core import ObsType
 class DelayObservationV0(gym.ObservationWrapper):
    """Wrapper which adds a delay to the returned observation."""
    def __init__(self, env: gym.Env, delay: int):
        """Initialize the DelayObservation wrapper.
        Args:
            env (Env): the wrapped environment
            delay (int): number of timesteps for delaying the observation.
                         Before reaching the `delay` number of timesteps,
                         returned observation is an array of zeros with the
                         same shape of the observation space.
        """
        super().__init__(env)
        self.delay = delay
        self.observation_queue = deque()
    def observation(self, observation: ObsType) -> ObsType:
        """Return the delayed observation."""
        self.observation_queue.append(observation)
        if len(self.observation_queue) > self.delay:
            return self.observation_queue.popleft()
        return jp.zeros_like(observation)
--- a/gymnasium/experimental/wrappers/sticky_action.py
+++ b/gymnasium/experimental/wrappers/sticky_action.py
@@ -0,0 +1,40 @@
 """Wrapper which adds a probability of repeating the previous executed action."""
 from typing import Union
 import gymnasium as gym
 from gymnasium.core import ActType
 from gymnasium.error import InvalidProbability
 class StickyActionV0(gym.ActionWrapper):
    """Wrapper which adds a probability of repeating the previous action."""
    def __init__(self, env: gym.Env, repeat_action_probability: Union[int, float]):
        """Initialize StickyAction wrapper.
        Args:
            env (Env): the wrapped environment
            repeat_action_probability (int | float): a proability of repeating the old action.
        """
        if not 0 <= repeat_action_probability < 1:
            raise InvalidProbability(
                f"repeat_action_probability should be in the interval [0,1). Received {repeat_action_probability}"
            )
        super().__init__(env)
        self.repeat_action_probability = repeat_action_probability
        self.old_action = None
    def action(self, action: ActType):
        """Execute the action."""
        if (
            self.old_action is not None
            and self.np_random.uniform() < self.repeat_action_probability
        ):
            action = self.old_action
        self.old_action = action
        return action
    def reset(self, **kwargs):
        """Reset the environment."""
        self.old_action = None
        return super().reset(**kwargs)
--- a/gymnasium/experimental/wrappers/time_aware_observation.py
+++ b/gymnasium/experimental/wrappers/time_aware_observation.py
@@ -0,0 +1,113 @@
 """Wrapper for adding time aware observations to environment observation."""
 from collections import OrderedDict
 import gymnasium as gym
 import gymnasium.spaces as spaces
 from gymnasium.core import ActType, ObsType
 from gymnasium.spaces import Box, Dict
 class TimeAwareObservationV0(gym.ObservationWrapper):
    """Augment the observation with time information of the episode.
    Time can be represented as a normalized value between [0,1]
    or by the number of timesteps remaining before truncation occurs.
    Example:
        >>> import gym
        >>> from gym.wrappers import TimeAwareObservationV0
        >>> env = gym.make('CartPole-v1')
        >>> env = TimeAwareObservationV0(env)
        >>> env.observation_space
        Dict(obs: Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32), time: Box(0.0, 500, (1,), float32))
        >>> _ = env.reset()
        >>> env.step(env.action_space.sample())[0]
        OrderedDict([('obs',
        ...       array([ 0.02866629,  0.2310988 , -0.02614601, -0.2600732 ], dtype=float32)),
        ...      ('time', array([0.002]))])
    Flatten observation space example:
        >>> env = gym.make('CartPole-v1')
        >>> env = TimeAwareObservationV0(env, flatten=True)
        >>> env.observation_space
        Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38  0.0000000e+00], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38 500], (5,), float32)
        >>> _ = env.reset()
        >>> env.step(env.action_space.sample())[0]
        array([-0.01232257,  0.19335455, -0.02244143, -0.32388705,  0.002 ], dtype=float32)
    """
    def __init__(self, env: gym.Env, flatten=False, normalize_time=True):
        """Initialize :class:`TimeAwareObservationV0`.
        Args:
            env: The environment to apply the wrapper
            flatten: Flatten the observation to a `Box` of a single dimension
            normalize_time: if `True` return time in the range [0,1]
                otherwise return time as remaining timesteps before truncation
        """
        super().__init__(env)
        self.flatten = flatten
        self.normalize_time = normalize_time
        self.max_timesteps = getattr(env, "_max_episode_steps")
        if self.normalize_time:
            self._get_time_observation = lambda: self.timesteps / self.max_timesteps
            time_space = Box(0, 1)
        else:
            self._get_time_observation = lambda: self.max_timesteps - self.timesteps
            time_space = Box(0, self.max_timesteps)
        self.time_aware_observation_space = Dict(
            obs=env.observation_space, time=time_space
        )
        if self.flatten:
            self.observation_space = spaces.flatten_space(
                self.time_aware_observation_space
            )
            self._observation_postprocess = lambda observation: spaces.flatten(
                self.time_aware_observation_space, observation
            )
        else:
            self.observation_space = self.time_aware_observation_space
            self._observation_postprocess = lambda observation: observation
    def observation(self, observation: ObsType):
        """Adds to the observation with the current time information.
        Args:
            observation: The observation to add the time step to
        Returns:
            The observation with the time information appended to
        """
        time_observation = self._get_time_observation()
        observation = OrderedDict(obs=observation, time=time_observation)
        return self._observation_postprocess(observation)
    def step(self, action: ActType):
        """Steps through the environment, incrementing the time step.
        Args:
            action: The action to take
        Returns:
            The environment's step using the action.
        """
        self.timesteps += 1
        observation, reward, terminated, truncated, info = super().step(action)
        return observation, reward, terminated, truncated, info
    def reset(self, **kwargs):
        """Reset the environment setting the time to zero.
        Args:
            **kwargs: Kwargs to apply to env.reset()
        Returns:
            The reset environment
        """
        self.timesteps = 0
        return super().reset(**kwargs)
--- a/tests/experimental/wrappers/test_delay_observation.py
+++ b/tests/experimental/wrappers/test_delay_observation.py
@@ -0,0 +1,36 @@
 import numpy as np
 import gymnasium as gym
 from gymnasium.experimental.wrappers import DelayObservationV0
 SEED = 42
 DELAY = 3
 NUM_STEPS = 5
 def test_delay_observation():
    env = gym.make("CartPole-v1")
    env.action_space.seed(SEED)
    env.reset(seed=SEED)
    undelayed_observations = []
    for _ in range(NUM_STEPS):
        obs, _, _, _, _ = env.step(env.action_space.sample())
        undelayed_observations.append(obs)
    env.action_space.seed(SEED)
    env.reset(seed=SEED)
    env = DelayObservationV0(env, delay=DELAY)
    delayed_observations = []
    for i in range(NUM_STEPS):
        obs, _, _, _, _ = env.step(env.action_space.sample())
        if i < DELAY - 1:
            assert np.all(obs == 0)
        delayed_observations.append(obs)
    assert np.alltrue(
        np.array(delayed_observations[DELAY:])
        == np.array(undelayed_observations[: DELAY - 1])
    )
--- a/tests/experimental/wrappers/test_sticky_action.py
+++ b/tests/experimental/wrappers/test_sticky_action.py
@@ -0,0 +1,41 @@
 """Test suite for StickyActionV0."""
 import pytest
 from gymnasium.error import InvalidProbability
 from gymnasium.experimental.wrappers import StickyActionV0
 from tests.testing_env import GenericTestEnv
 SEED = 42
 DELAY = 3
 NUM_STEPS = 10
 def step_fn(self, action):
    return action
 def test_sticky_action():
    env = StickyActionV0(GenericTestEnv(step_fn=step_fn), repeat_action_probability=0.5)
    env.reset(seed=SEED)
    env.action_space.seed(SEED)
    previous_action = None
    for _ in range(NUM_STEPS):
        input_action = env.action_space.sample()
        executed_action = env.step(input_action)
        if executed_action != input_action:
            assert executed_action == previous_action
        else:
            assert executed_action == input_action
        previous_action = input_action
@pytest.mark.parametrize(("repeat_action_probability"), [-1, 1, 1.5])
 def test_sticky_action_raise(repeat_action_probability):
    with pytest.raises(InvalidProbability):
        StickyActionV0(
            GenericTestEnv(), repeat_action_probability=repeat_action_probability
        )
--- a/tests/experimental/wrappers/test_time_aware_observation.py
+++ b/tests/experimental/wrappers/test_time_aware_observation.py
@@ -0,0 +1,98 @@
 """Test suite for TimeAwareobservationV0."""
 from collections import OrderedDict
 import numpy as np
 import pytest
 import gymnasium as gym
 from gymnasium.experimental.wrappers import TimeAwareObservationV0
 from gymnasium.spaces import Box, Dict
 NUM_STEPS = 20
 SEED = 0
@pytest.mark.parametrize(
    "env",
    [
        gym.make("CartPole-v1", disable_env_checker=True),
        gym.make("CarRacing-v2", disable_env_checker=True),
    ],
 )
 def test_time_aware_observation_creation(env):
    """Test TimeAwareObservationV0 wrapper creation.
    This test checks if wrapped env with TimeAwareObservationV0
    is correctly created.
    """
    wrapped_env = TimeAwareObservationV0(env)
    obs, _ = wrapped_env.reset()
    assert isinstance(wrapped_env.observation_space, Dict)
    assert isinstance(obs, OrderedDict)
    assert np.all(obs["time"] == 0)
    assert env.observation_space == wrapped_env.observation_space["obs"]
@pytest.mark.parametrize("normalize_time", [True, False])
@pytest.mark.parametrize("flatten", [False, True])
@pytest.mark.parametrize(
    "env",
    [
        gym.make("CartPole-v1", disable_env_checker=True),
        gym.make("CarRacing-v2", disable_env_checker=True, continuous=False),
    ],
 )
 def test_time_aware_observation_step(env, flatten, normalize_time):
    """Test TimeAwareObservationV0 step.
    This test checks if wrapped env with TimeAwareObservationV0
    steps correctly.
    """
    env.action_space.seed(SEED)
    max_timesteps = env._max_episode_steps
    wrapped_env = TimeAwareObservationV0(
        env, flatten=flatten, normalize_time=normalize_time
    )
    wrapped_env.reset(seed=SEED)
    for timestep in range(1, NUM_STEPS):
        action = env.action_space.sample()
        observation, _, terminated, _, _ = wrapped_env.step(action)
        expected_time_obs = (
            timestep / max_timesteps if normalize_time else max_timesteps - timestep
        )
        if flatten:
            assert np.allclose(observation[-1], expected_time_obs)
        else:
            assert np.allclose(observation["time"], expected_time_obs)
        if terminated:
            break
@pytest.mark.parametrize(
    "env",
    [
        gym.make("CartPole-v1", disable_env_checker=True),
        gym.make("CarRacing-v2", disable_env_checker=True),
    ],
 )
 def test_time_aware_observation_creation_flatten(env):
    """Test TimeAwareObservationV0 wrapper creation with `flatten=True`.
    This test checks if wrapped env with TimeAwareObservationV0
    is correctly created when the `flatten` parameter is set to `True`.
    When flattened, the observation space should be a 1 dimension `Box`
    with time appended to the end.
    """
    wrapped_env = TimeAwareObservationV0(env, flatten=True)
    obs, _ = wrapped_env.reset()
    assert isinstance(wrapped_env.observation_space, Box)
    assert isinstance(obs, np.ndarray)
    assert env.observation_space == wrapped_env.time_aware_observation_space["obs"]
`@@ -1,4 +1,4 @@`
	`"""Root __init__ of the gym dev_wrappers."""`	`"""Root __init__ of the gym experimental wrappers."""`


	`from gymnasium.experimental.functional import FuncEnv`	`from gymnasium.experimental.functional import FuncEnv`