"""
|
||
Frozenlake benchmark
|
||
====================
|
||
|
||
"""
|
||
|
||
# %%
# In this post we'll compare several different map sizes on the
# `FrozenLake <https://gymnasium.farama.org/environments/toy_text/frozen_lake/>`__
# environment from the reinforcement learning
# `Gymnasium <https://gymnasium.farama.org/>`__ package using the
# Q-learning algorithm.
#


# %%
# Dependencies
# ------------
#


# %%
# Let's first import a few dependencies we'll need.
#

# Author: Andrea Pierré
# License: MIT License


from pathlib import Path
from typing import NamedTuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

import gymnasium as gym
from gymnasium.envs.toy_text.frozen_lake import generate_random_map


sns.set_theme()

# %load_ext lab_black


# %%
# Parameters we'll use
# --------------------
#


class Params(NamedTuple):
    total_episodes: int  # Total episodes
    learning_rate: float  # Learning rate
    gamma: float  # Discounting rate
    epsilon: float  # Exploration probability
    map_size: int  # Number of tiles on one side of the square environment
    seed: int  # Define a seed so that we get reproducible results
    is_slippery: bool  # If true, the player moves in the intended direction with probability 1/3, or in either perpendicular direction with probability 1/3 each
    n_runs: int  # Number of runs
    action_size: int  # Number of possible actions
    state_size: int  # Number of possible states
    proba_frozen: float  # Probability that a tile is frozen
    savefig_folder: Path  # Root folder where plots are saved


params = Params(
    total_episodes=2000,
    learning_rate=0.8,
    gamma=0.95,
    epsilon=0.1,
    map_size=5,
    seed=123,
    is_slippery=False,
    n_runs=20,
    action_size=None,
    state_size=None,
    proba_frozen=0.9,
    savefig_folder=Path("../../_static/img/tutorials/"),
)
params

# Set the seed
rng = np.random.default_rng(params.seed)

# Create the figure folder if it doesn't exist
params.savefig_folder.mkdir(parents=True, exist_ok=True)


# %%
# The FrozenLake environment
# --------------------------
#

env = gym.make(
    "FrozenLake-v1",
    is_slippery=params.is_slippery,
    render_mode="rgb_array",
    desc=generate_random_map(
        size=params.map_size, p=params.proba_frozen, seed=params.seed
    ),
)
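

# %%
# As a quick look at what was generated, we can print the map layout
# ("S" = start, "F" = frozen, "H" = hole, "G" = goal). This assumes the
# underlying FrozenLake environment exposes the generated map as ``desc``,
# which is the case in current Gymnasium versions.
print(env.unwrapped.desc)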


# %%
# Creating the Q-table
# ~~~~~~~~~~~~~~~~~~~~
#
# In this tutorial we'll be using Q-learning as our learning algorithm and
# :math:`\epsilon`-greedy to decide which action to pick at each step. You
# can have a look at the `References section <#References>`__ for some
# refreshers on the theory. Now, let's create our Q-table, initialized to
# zero, with one row per state and one column per action.
#
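# The update rule implemented in ``Qlearning.update()`` below is the
# standard temporal-difference Q-learning update:
#
# .. math::
#
#    Q(s_t, a_t) \leftarrow Q(s_t, a_t) + \alpha \left[ r_{t+1} + \gamma \max_a Q(s_{t+1}, a) - Q(s_t, a_t) \right]
#
# where :math:`\alpha` is the learning rate and :math:`\gamma` the
# discounting rate, while the :math:`\epsilon`-greedy policy picks a random
# action with probability :math:`\epsilon` and the action with the highest
# Q-value otherwise.
#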

params = params._replace(action_size=env.action_space.n)
params = params._replace(state_size=env.observation_space.n)
print(f"Action size: {params.action_size}")
print(f"State size: {params.state_size}")


class Qlearning:
    def __init__(self, learning_rate, gamma, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.reset_qtable()

    def update(self, state, action, reward, new_state):
        """Update Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]"""
        delta = (
            reward
            + self.gamma * np.max(self.qtable[new_state, :])
            - self.qtable[state, action]
        )
        q_update = self.qtable[state, action] + self.learning_rate * delta
        return q_update

    def reset_qtable(self):
        """Reset the Q-table."""
        self.qtable = np.zeros((self.state_size, self.action_size))


class EpsilonGreedy:
    def __init__(self, epsilon):
        self.epsilon = epsilon

    def choose_action(self, action_space, state, qtable):
        """Choose an action `a` in the current world state (s)."""
        # Draw a random number to decide between exploration and exploitation
        explor_exploit_tradeoff = rng.uniform(0, 1)

        # Exploration
        if explor_exploit_tradeoff < self.epsilon:
            action = action_space.sample()

        # Exploitation (taking the biggest Q-value for this state)
        else:
            # Break ties randomly:
            # find the indices where the Q-value equals the maximum value
            # and choose a random action among them
            max_ids = np.where(qtable[state, :] == max(qtable[state, :]))[0]
            action = rng.choice(max_ids)
        return action


# %%
# Running the environment
# ~~~~~~~~~~~~~~~~~~~~~~~
#
# Let's instantiate the learner and the explorer.
#

learner = Qlearning(
    learning_rate=params.learning_rate,
    gamma=params.gamma,
    state_size=params.state_size,
    action_size=params.action_size,
)
explorer = EpsilonGreedy(
    epsilon=params.epsilon,
)


# %%
# This will be our main function, which runs the environment for
# ``params.total_episodes`` episodes. To account for stochasticity, we will
# also repeat the whole experiment over ``params.n_runs`` runs.
#


def run_env():
    rewards = np.zeros((params.total_episodes, params.n_runs))
    steps = np.zeros((params.total_episodes, params.n_runs))
    episodes = np.arange(params.total_episodes)
    qtables = np.zeros((params.n_runs, params.state_size, params.action_size))
    all_states = []
    all_actions = []

    for run in range(params.n_runs):  # Run several times to account for stochasticity
        learner.reset_qtable()  # Reset the Q-table between runs

        for episode in tqdm(
            episodes, desc=f"Run {run}/{params.n_runs} - Episodes", leave=False
        ):
            state = env.reset(seed=params.seed)[0]  # Reset the environment
            step = 0
            done = False
            total_rewards = 0

            while not done:
                action = explorer.choose_action(
                    action_space=env.action_space, state=state, qtable=learner.qtable
                )

                # Log all states and actions
                all_states.append(state)
                all_actions.append(action)

                # Take the action (a) and observe the outcome state (s') and reward (r)
                new_state, reward, terminated, truncated, info = env.step(action)

                done = terminated or truncated

                learner.qtable[state, action] = learner.update(
                    state, action, reward, new_state
                )

                total_rewards += reward
                step += 1

                # The new state becomes the current state
                state = new_state

            # Log all rewards and steps
            rewards[episode, run] = total_rewards
            steps[episode, run] = step
        qtables[run, :, :] = learner.qtable

    return rewards, steps, episodes, qtables, all_states, all_actions


# %%
# Visualization
# ~~~~~~~~~~~~~
#


# %%
# To make it easy to plot the results with Seaborn, we'll save the main
# results of the simulation in Pandas dataframes.
#


def postprocess(episodes, params, rewards, steps, map_size):
    """Convert the results of the simulation into dataframes."""
    res = pd.DataFrame(
        data={
            "Episodes": np.tile(episodes, reps=params.n_runs),
            "Rewards": rewards.flatten(order="F"),
            "Steps": steps.flatten(order="F"),
        }
    )
    res["cum_rewards"] = rewards.cumsum(axis=0).flatten(order="F")
    res["map_size"] = np.repeat(f"{map_size}x{map_size}", res.shape[0])

    st = pd.DataFrame(data={"Episodes": episodes, "Steps": steps.mean(axis=1)})
    st["map_size"] = np.repeat(f"{map_size}x{map_size}", st.shape[0])
    return res, st


# %%
# We want to plot the policy the agent has learned in the end. To do that
# we will: 1. extract the best Q-values from the Q-table for each state,
# 2. get the corresponding best action for those Q-values, 3. map each
# action to an arrow so we can visualize it.
#


def qtable_directions_map(qtable, map_size):
    """Get the best learned action & map it to arrows."""
    qtable_val_max = qtable.max(axis=1).reshape(map_size, map_size)
    qtable_best_action = np.argmax(qtable, axis=1).reshape(map_size, map_size)
    directions = {0: "←", 1: "↓", 2: "→", 3: "↑"}
    qtable_directions = np.empty(qtable_best_action.flatten().shape, dtype=str)
    eps = np.finfo(float).eps  # Machine epsilon, the smallest representable float increment
    for idx, val in enumerate(qtable_best_action.flatten()):
        if qtable_val_max.flatten()[idx] > eps:
            # Assign an arrow only if a minimal Q-value has been learned,
            # otherwise action 0 (a valid direction) would also be mapped
            # onto tiles where the agent didn't actually learn anything
            qtable_directions[idx] = directions[val]
    qtable_directions = qtable_directions.reshape(map_size, map_size)
    return qtable_val_max, qtable_directions


# %%
# With the following function, we'll plot on the left the last frame of
# the simulation. If the agent learned a good policy to solve the task, we
# expect to see it on the tile of the treasure in the last frame of the
# video. On the right we'll plot the policy the agent has learned. Each
# arrow will represent the best action to choose for each tile/state.
#


def plot_q_values_map(qtable, env, map_size):
    """Plot the last frame of the simulation and the policy learned."""
    qtable_val_max, qtable_directions = qtable_directions_map(qtable, map_size)

    # Plot the last frame
    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
    ax[0].imshow(env.render())
    ax[0].axis("off")
    ax[0].set_title("Last frame")

    # Plot the policy
    sns.heatmap(
        qtable_val_max,
        annot=qtable_directions,
        fmt="",
        ax=ax[1],
        cmap=sns.color_palette("Blues", as_cmap=True),
        linewidths=0.7,
        linecolor="black",
        xticklabels=[],
        yticklabels=[],
        annot_kws={"fontsize": "xx-large"},
    ).set(title="Learned Q-values\nArrows represent best action")
    for _, spine in ax[1].spines.items():
        spine.set_visible(True)
        spine.set_linewidth(0.7)
        spine.set_color("black")
    img_title = f"frozenlake_q_values_{map_size}x{map_size}.png"
    fig.savefig(params.savefig_folder / img_title, bbox_inches="tight")
    plt.show()


# %%
# As a sanity check, we will plot the distributions of states and actions
# with the following function:
#


def plot_states_actions_distribution(states, actions, map_size):
    """Plot the distributions of states and actions."""
    labels = {"LEFT": 0, "DOWN": 1, "RIGHT": 2, "UP": 3}

    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
    sns.histplot(data=states, ax=ax[0], kde=True)
    ax[0].set_title("States")
    sns.histplot(data=actions, ax=ax[1])
    ax[1].set_xticks(list(labels.values()), labels=labels.keys())
    ax[1].set_title("Actions")
    fig.tight_layout()
    img_title = f"frozenlake_states_actions_distrib_{map_size}x{map_size}.png"
    fig.savefig(params.savefig_folder / img_title, bbox_inches="tight")
    plt.show()


# %%
# Now we'll be running our agent on a few increasing map sizes:
# :math:`4 \times 4`, :math:`7 \times 7`, :math:`9 \times 9` and
# :math:`11 \times 11`.
#
# Putting it all together:
#

map_sizes = [4, 7, 9, 11]
res_all = pd.DataFrame()
st_all = pd.DataFrame()

for map_size in map_sizes:
    env = gym.make(
        "FrozenLake-v1",
        is_slippery=params.is_slippery,
        render_mode="rgb_array",
        desc=generate_random_map(
            size=map_size, p=params.proba_frozen, seed=params.seed
        ),
    )

    params = params._replace(action_size=env.action_space.n)
    params = params._replace(state_size=env.observation_space.n)
    env.action_space.seed(
        params.seed
    )  # Set the seed to get reproducible results when sampling the action space
    learner = Qlearning(
        learning_rate=params.learning_rate,
        gamma=params.gamma,
        state_size=params.state_size,
        action_size=params.action_size,
    )
    explorer = EpsilonGreedy(
        epsilon=params.epsilon,
    )

    print(f"Map size: {map_size}x{map_size}")
    rewards, steps, episodes, qtables, all_states, all_actions = run_env()

    # Save the results in dataframes
    res, st = postprocess(episodes, params, rewards, steps, map_size)
    res_all = pd.concat([res_all, res])
    st_all = pd.concat([st_all, st])
    qtable = qtables.mean(axis=0)  # Average the Q-table between runs

    plot_states_actions_distribution(
        states=all_states, actions=all_actions, map_size=map_size
    )  # Sanity check
    plot_q_values_map(qtable, env, map_size)

    env.close()


# %%
# Map size: :math:`4 \times 4`
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#


# %%
# |States actions histogram 4x4 map| |Q-values 4x4 map|
#
# .. |States actions histogram 4x4 map| image:: ../../_static/img/tutorials/frozenlake_states_actions_distrib_4x4.png
# .. |Q-values 4x4 map| image:: ../../_static/img/tutorials/frozenlake_q_values_4x4.png
#
#


# %%
# Map size: :math:`7 \times 7`
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#


# %%
# |States actions histogram 7x7 map| |Q-values 7x7 map|
#
# .. |States actions histogram 7x7 map| image:: ../../_static/img/tutorials/frozenlake_states_actions_distrib_7x7.png
# .. |Q-values 7x7 map| image:: ../../_static/img/tutorials/frozenlake_q_values_7x7.png
#
#


# %%
# Map size: :math:`9 \times 9`
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#


# %%
# |States actions histogram 9x9 map| |Q-values 9x9 map|
#
# .. |States actions histogram 9x9 map| image:: ../../_static/img/tutorials/frozenlake_states_actions_distrib_9x9.png
# .. |Q-values 9x9 map| image:: ../../_static/img/tutorials/frozenlake_q_values_9x9.png
#
#


# %%
# Map size: :math:`11 \times 11`
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#


# %%
# |States actions histogram 11x11 map| |Q-values 11x11 map|
#
# .. |States actions histogram 11x11 map| image:: ../../_static/img/tutorials/frozenlake_states_actions_distrib_11x11.png
# .. |Q-values 11x11 map| image:: ../../_static/img/tutorials/frozenlake_q_values_11x11.png
#
#


# %%
# The ``DOWN`` and ``RIGHT`` actions get chosen more often, which makes
# sense as the agent starts at the top left of the map and needs to find
# its way down to the bottom right. Also, the bigger the map, the fewer
# states/tiles far away from the starting state get visited.
#


# %%
# To check if our agent is learning, we want to plot the cumulative sum of
# rewards, as well as the number of steps needed until the end of the
# episode. If our agent is learning, we expect the cumulative sum of
# rewards to increase and the number of steps needed to solve the task to
# decrease.
#


def plot_steps_and_rewards(rewards_df, steps_df):
    """Plot the steps and rewards from dataframes."""
    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
    sns.lineplot(
        data=rewards_df, x="Episodes", y="cum_rewards", hue="map_size", ax=ax[0]
    )
    ax[0].set(ylabel="Cumulative rewards")

    sns.lineplot(data=steps_df, x="Episodes", y="Steps", hue="map_size", ax=ax[1])
    ax[1].set(ylabel="Average number of steps")

    for axi in ax:
        axi.legend(title="map size")
    fig.tight_layout()
    img_title = "frozenlake_steps_and_rewards.png"
    fig.savefig(params.savefig_folder / img_title, bbox_inches="tight")
    plt.show()


plot_steps_and_rewards(res_all, st_all)


# %%
# |Steps and rewards|
#
# .. |Steps and rewards| image:: ../../_static/img/tutorials/frozenlake_steps_and_rewards.png
#
#


# %%
# On the :math:`4 \times 4` map, learning converges pretty quickly,
# whereas on the :math:`7 \times 7` map, the agent needs :math:`\sim 300`
# episodes, on the :math:`9 \times 9` map it needs :math:`\sim 800`
# episodes, and on the :math:`11 \times 11` map it needs :math:`\sim 1800`
# episodes to converge. Interestingly, the agent seems to be getting more
# rewards on the :math:`9 \times 9` map than on the :math:`7 \times 7`
# map, which could mean it didn't reach an optimal policy on the
# :math:`7 \times 7` map.
#


# %%
# In the end, if the agent doesn't get any rewards, rewards don't get
# propagated in the Q-values, and the agent doesn't learn anything. In my
# experience on this environment using :math:`\epsilon`-greedy and those
# hyperparameters and environment settings, maps larger than
# :math:`11 \times 11` tiles start to be difficult to solve. Maybe using a
# different exploration algorithm could overcome this; a rough sketch of
# one option follows below. The other parameter having a big impact is
# ``proba_frozen``, the probability of a tile being frozen. With too many
# holes, i.e. :math:`p < 0.9`, Q-learning has a hard time avoiding the
# holes and getting any reward signal.
#
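# One simple variation, sketched below but not benchmarked in this
# tutorial, is to decay :math:`\epsilon` over time so the agent explores a
# lot early on and exploits more later. The class name and the decay
# schedule are illustrative choices, not part of the original setup; to use
# it, ``update_epsilon()`` would be called at the end of each episode in
# ``run_env()``.


class DecayingEpsilonGreedy(EpsilonGreedy):
    def __init__(self, epsilon, min_epsilon=0.01, decay=0.999):
        super().__init__(epsilon)
        self.min_epsilon = min_epsilon  # Floor below which epsilon never drops
        self.decay = decay  # Multiplicative decay factor applied per episode

    def update_epsilon(self):
        """Decay epsilon multiplicatively, down to ``min_epsilon``."""
        self.epsilon = max(self.min_epsilon, self.epsilon * self.decay)

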
# %%
# References
# ----------
#
# - Code inspired by `Deep Reinforcement Learning
#   Course <https://simoninithomas.github.io/Deep_reinforcement_learning_Course/>`__
#   by Thomas Simonini (http://simoninithomas.com/)
# - `Dissecting Reinforcement
#   Learning-Part.2 <https://mpatacchiola.github.io/blog/2017/01/15/dissecting-reinforcement-learning-2.html>`__
# - `David Silver’s course <https://www.davidsilver.uk/teaching/>`__ in
#   particular lesson 4 and lesson 5
# - `Q-learning article on
#   Wikipedia <https://en.wikipedia.org/wiki/Q-learning>`__
# - `Q-Learning: Off-Policy TD
#   Control <http://incompleteideas.net/book/ebook/node65.html>`__ in
#   `Reinforcement Learning: An Introduction, by Richard S. Sutton and
#   Andrew G. Barto <http://incompleteideas.net/book/ebook/>`__
# - `Epsilon-Greedy
#   Q-learning <https://www.baeldung.com/cs/epsilon-greedy-q-learning>`__
# - `Introduction to Reinforcement
#   Learning <https://gibberblot.github.io/rl-notes/index.html>`__ by Tim
#   Miller (University of Melbourne)
#