doc(FrozenLake_tuto): update policy exploitation logic to handle variable sets of maximum Q-values (#1037)

2025-08-30 09:55:39 +00:00 · 2024-04-29 05:25:08 -04:00
parent 93403cb4f9
commit 5bf72690e1
1 changed files with 4 additions and 6 deletions
--- a/docs/tutorials/training_agents/FrozenLake_tuto.py
+++ b/docs/tutorials/training_agents/FrozenLake_tuto.py
@@ -161,12 +161,10 @@ class EpsilonGreedy:
        # Exploitation (taking the biggest Q-value for this state)
        else:
            # Break ties randomly
-            # If all actions are the same for this state we choose a random one
+            # Find the indices where the Q-value equals the maximum value
-            # (otherwise `np.argmax()` would always take the first one)
+            # Choose a random action from the indices where the Q-value is maximum
-            if np.all(qtable[state, :]) == qtable[state, 0]:
+            max_ids = np.where(qtable[state, :] == max(qtable[state, :]))[0]
-                action = action_space.sample()
+            action = rng.choice(max_ids)
-            else:
-                action = np.argmax(qtable[state, :])
        return action