mirror of
https://github.com/Farama-Foundation/Gymnasium.git
synced 2025-08-01 06:07:08 +00:00
Fix experimental normalize reward wrapper (#277)
Co-authored-by: raphajaner <raphael.trumpp@tum.de>
This commit is contained in:
@@ -98,6 +98,10 @@ class NormalizeRewardV0(gym.Wrapper):
|
||||
|
||||
The exponential moving average will have variance :math:`(1 - \gamma)^2`.
|
||||
|
||||
The property `_update_running_mean` allows freezing/continuing the running mean calculation of the reward
|
||||
statistics. If `True` (default), the `RunningMeanStd` will get updated every time `self.normalize()` is called.
|
||||
If False, the calculated statistics are used but not updated anymore; this may be used during evaluation.
|
||||
|
||||
Note:
|
||||
The scaling depends on past trajectories and rewards will not be scaled correctly if the wrapper was newly
|
||||
instantiated or the policy was changed recently.
|
||||
@@ -118,7 +122,7 @@ class NormalizeRewardV0(gym.Wrapper):
|
||||
"""
|
||||
super().__init__(env)
|
||||
self.rewards_running_means = RunningMeanStd(shape=())
|
||||
self.discounted_reward: float = 0.0
|
||||
self.discounted_reward: np.array = np.array([0.0])
|
||||
self.gamma = gamma
|
||||
self.epsilon = epsilon
|
||||
|
||||
|
@@ -1 +1,26 @@
|
||||
"""Test suite for NormalizeRewardV0."""
|
||||
import numpy as np
|
||||
|
||||
from gymnasium.core import ActType
|
||||
from gymnasium.experimental.wrappers import NormalizeRewardV0
|
||||
from tests.testing_env import GenericTestEnv
|
||||
|
||||
|
||||
def _make_reward_env():
    """Build a `GenericTestEnv` whose every step yields a reward of 1."""

    def _constant_reward_step(self, action: ActType):
        # Reward is always 1.0; the episode never terminates or truncates.
        obs = self.observation_space.sample()
        return obs, 1.0, False, False, {}

    return GenericTestEnv(step_func=_constant_reward_step)
|
||||
|
||||
|
||||
def test_normalize_reward_wrapper():
    """Smoke-tests that `NormalizeRewardV0` wraps and steps an env without error."""
    # TODO: Functional correctness should be tested
    base_env = _make_reward_env()
    wrapped = NormalizeRewardV0(base_env)
    wrapped.reset()
    _obs, reward, _terminated, _truncated, _info = wrapped.step(None)
    # The wrapper must hand back a scalar reward, not a 1-element array.
    assert np.ndim(reward) == 0
    base_env.close()
|
||||
|
Reference in New Issue
Block a user