Mirror of https://github.com/Farama-Foundation/Gymnasium.git (synced 2025-08-30 09:55:39 +00:00)
doc(FrozenLake_tuto): update policy exploitation logic to handle variable sets of maximum Q-values (#1037)
This commit is contained in:
@@ -161,12 +161,10 @@ class EpsilonGreedy:
|
|||||||
# Exploitation (taking the biggest Q-value for this state)
|
# Exploitation (taking the biggest Q-value for this state)
|
||||||
else:
|
else:
|
||||||
# Break ties randomly
|
# Break ties randomly
|
||||||
# If all actions are the same for this state we choose a random one
|
# Find the indices where the Q-value equals the maximum value
|
||||||
# (otherwise `np.argmax()` would always take the first one)
|
# Choose a random action from the indices where the Q-value is maximum
|
||||||
if np.all(qtable[state, :]) == qtable[state, 0]:
|
max_ids = np.where(qtable[state, :] == max(qtable[state, :]))[0]
|
||||||
action = action_space.sample()
|
action = rng.choice(max_ids)
|
||||||
else:
|
|
||||||
action = np.argmax(qtable[state, :])
|
|
||||||
return action
|
return action
|
||||||
|
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user