doc(FrozenLake_tuto): update policy exploitation logic to handle variable sets of maximum Q-values (#1037)

This commit is contained in:
edelauna
2024-04-29 05:25:08 -04:00
committed by GitHub
parent 93403cb4f9
commit 5bf72690e1

View File

@@ -161,12 +161,10 @@ class EpsilonGreedy:
| Before | After |
| --- | --- |
| # Exploitation (taking the biggest Q-value for this state) | # Exploitation (taking the biggest Q-value for this state) |
| else: | else: |
| # Break ties randomly | # Break ties randomly |
| # If all actions are the same for this state we choose a random one | # Find the indices where the Q-value equals the maximum value |
| # (otherwise `np.argmax()` would always take the first one) | # Choose a random action from the indices where the Q-value is maximum |
| if np.all(qtable[state, :]) == qtable[state, 0]: | max_ids = np.where(qtable[state, :] == max(qtable[state, :]))[0] |
| action = action_space.sample() | action = rng.choice(max_ids) |
| else: | |
| action = np.argmax(qtable[state, :]) | |
| return action | return action |