Files
Gymnasium/v1.0.0a1/tutorials/training_agents/FrozenLake_tuto/index.html
2024-02-13 17:04:55 +00:00

1317 lines
117 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!doctype html>
<html class="no-js" lang="en" data-content_root="../../../">
<head><meta charset="utf-8"/>
<!-- Single viewport declaration: the generator emitted a second, equivalent one further down, which has been dropped. -->
<meta name="viewport" content="width=device-width,initial-scale=1"/>
<meta name="color-scheme" content="light dark">
<meta name="description" content="A standard API for reinforcement learning and a diverse set of reference environments (formerly Gym)">
<!-- Open Graph / Twitter card metadata used for link previews. -->
<meta property="og:title" content="Gymnasium Documentation" />
<meta property="og:type" content="website" />
<meta property="og:description" content="A standard API for reinforcement learning and a diverse set of reference environments (formerly Gym)" />
<meta property="og:url" content="https://gymnasium.farama.org/tutorials/training_agents/FrozenLake_tuto.html" />
<meta property="og:image" content="https://gymnasium.farama.org/_static/img/gymnasium-github.png" />
<meta name="twitter:card" content="summary_large_image">
<!-- Document relationships: index, search, and the next/previous pages in the tutorial sequence. -->
<link rel="index" title="Index" href="../../../genindex/" />
<link rel="search" title="Search" href="../../../search/" />
<link rel="next" title="Third-Party Tutorials" href="../../third-party-tutorials/" />
<link rel="prev" title="Solving Blackjack with Q-Learning" href="../blackjack_tutorial/" />
<link rel="canonical" href="https://gymnasium.farama.org/tutorials/training_agents/FrozenLake_tuto.html" />
<link rel="shortcut icon" href="../../../_static/favicon.png"/><!-- Generated with Sphinx 7.2.6 and Furo 2023.08.19.dev1 -->
<title>Frozenlake benchmark - Gymnasium Documentation</title>
<link rel="stylesheet" type="text/css" href="../../../_static/pygments.css?v=a746c00c" />
<link rel="stylesheet" type="text/css" href="../../../_static/styles/furo.css?v=3e7f4c72" />
<link rel="stylesheet" type="text/css" href="../../../_static/sg_gallery.css?v=61a4c737" />
<link rel="stylesheet" type="text/css" href="../../../_static/sg_gallery-binder.css?v=f4aeca0c" />
<link rel="stylesheet" type="text/css" href="../../../_static/sg_gallery-dataframe.css?v=2082cf3c" />
<link rel="stylesheet" type="text/css" href="../../../_static/sg_gallery-rendered-html.css?v=1277b6f3" />
<link rel="stylesheet" type="text/css" href="../../../_static/styles/furo-extensions.css?v=49cbaffd" />
<style>
/* Code-block colours: light values by default. */
body {
--color-code-background: #f8f8f8;
--color-code-foreground: black;
}
/* Dark values apply on screen only, either when body[data-theme="dark"] is set
   or when the OS prefers dark and the theme is not explicitly "light". */
@media not print {
body[data-theme="dark"] {
--color-code-background: #202020;
--color-code-foreground: #d0d0d0;
}
@media (prefers-color-scheme: dark) {
body:not([data-theme="light"]) {
--color-code-background: #202020;
--color-code-foreground: #d0d0d0;
}
}
}
</style></head>
<body>
<!-- Site-wide Farama header: mobile sidebar toggle, project logo/title, and the Farama projects menu. -->
<header class="farama-header" aria-label="Farama header">
<div class="farama-header__container">
<div class="farama-header__left--mobile">
<!-- Label for the #__navigation checkbox; the visually-hidden text is its accessible name, so the icon is hidden from assistive tech. -->
<label class="nav-overlay-icon" for="__navigation">
<div class="visually-hidden">Toggle site navigation sidebar</div>
<svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg" aria-hidden="true">
<defs></defs>
<line x1="0.5" y1="4" x2="23.5" y2="4"></line>
<line x1="0.232" y1="12" x2="23.5" y2="12"></line>
<line x1="0.232" y1="20" x2="23.5" y2="20"></line>
</svg>
</label>
</div>
<div class="farama-header__left farama-header__center--mobile">
<a href="../../../">
<img class="farama-header__logo only-light" src="../../../_static/img/gymnasium_black.svg" alt="Light Logo"/>
<img class="farama-header__logo only-dark" src="../../../_static/img/gymnasium_white.svg" alt="Dark Logo"/>
<span class="farama-header__title">Gymnasium Documentation</span>
</a>
</div>
<div class="farama-header__right">
<div class="farama-header-menu">
<!-- The button is named by aria-label, so its logo and chevron are decorative (empty alt / aria-hidden). -->
<button class="farama-header-menu__btn" type="button" aria-label="Open Farama Menu" aria-expanded="false" aria-haspopup="true" aria-controls="farama-menu">
<img class="farama-black-logo-invert" src="../../../_static/img/farama-logo-header.svg" alt="">
<svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg" aria-hidden="true">
<polyline style="stroke-linecap: round; stroke-linejoin: round; fill: none; stroke-width: 2px;" points="1 7 12 18 23 7"></polyline>
</svg>
</button>
<div class="farama-header-menu-container farama-hidden" aria-hidden="true" id="farama-menu">
<div class="farama-header-menu__header">
<a href="https://farama.org">
<img class="farama-header-menu__logo farama-white-logo-invert" src="../../../_static/img/farama_solid_white.svg" alt="Farama Foundation logo">
<span>Farama Foundation</span>
</a>
<div class="farama-header-menu-header__right">
<!-- Icon-only button: aria-label supplies the accessible name. -->
<button type="button" id="farama-close-menu" aria-label="Close Farama Menu">
<svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg" fill="none" stroke="currentColor"
stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="icon-close" aria-hidden="true">
<line x1="3" y1="21" x2="21" y2="3"></line>
<line x1="3" y1="3" x2="21" y2="21"></line>
</svg>
</button>
</div>
</div>
<div class="farama-header-menu__body">
<!-- Response from farama.org/api/projects.json -->
</div>
</div>
</div>
</div>
</div>
</header>
<script>
// Set body[data-theme] from the value stored under the "theme" key in
// localStorage, falling back to "auto" when nothing is stored. The inline
// <style> in <head> selects its dark code colours from this attribute.
document.body.dataset.theme = localStorage.getItem("theme") || "auto";
</script>
<!-- Non-rendered SVG sprite (display: none): each <symbol> defines an icon by id
     (svg-toc, svg-menu, svg-arrow-right, svg-sun, svg-moon, svg-sun-half) so that
     other markup can reference it with <use href="#id">. -->
<svg xmlns="http://www.w3.org/2000/svg" style="display: none;">
<symbol id="svg-toc" viewBox="0 0 24 24">
<title>Contents</title>
<svg stroke="currentColor" fill="currentColor" stroke-width="0" viewBox="0 0 1024 1024">
<path d="M408 442h480c4.4 0 8-3.6 8-8v-56c0-4.4-3.6-8-8-8H408c-4.4 0-8 3.6-8 8v56c0 4.4 3.6 8 8 8zm-8 204c0 4.4 3.6 8 8 8h480c4.4 0 8-3.6 8-8v-56c0-4.4-3.6-8-8-8H408c-4.4 0-8 3.6-8 8v56zm504-486H120c-4.4 0-8 3.6-8 8v56c0 4.4 3.6 8 8 8h784c4.4 0 8-3.6 8-8v-56c0-4.4-3.6-8-8-8zm0 632H120c-4.4 0-8 3.6-8 8v56c0 4.4 3.6 8 8 8h784c4.4 0 8-3.6 8-8v-56c0-4.4-3.6-8-8-8zM115.4 518.9L271.7 642c5.8 4.6 14.4.5 14.4-6.9V388.9c0-7.4-8.5-11.5-14.4-6.9L115.4 505.1a8.74 8.74 0 0 0 0 13.8z"/>
</svg>
</symbol>
<symbol id="svg-menu" viewBox="0 0 24 24">
<title>Menu</title>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather-menu">
<line x1="3" y1="12" x2="21" y2="12"></line>
<line x1="3" y1="6" x2="21" y2="6"></line>
<line x1="3" y1="18" x2="21" y2="18"></line>
</svg>
</symbol>
<symbol id="svg-arrow-right" viewBox="0 0 24 24">
<title>Expand</title>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather-chevron-right">
<polyline points="9 18 15 12 9 6"></polyline>
</svg>
</symbol>
<symbol id="svg-sun" viewBox="0 0 24 24">
<title>Light mode</title>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round" class="feather-sun">
<circle cx="12" cy="12" r="5"></circle>
<line x1="12" y1="1" x2="12" y2="3"></line>
<line x1="12" y1="21" x2="12" y2="23"></line>
<line x1="4.22" y1="4.22" x2="5.64" y2="5.64"></line>
<line x1="18.36" y1="18.36" x2="19.78" y2="19.78"></line>
<line x1="1" y1="12" x2="3" y2="12"></line>
<line x1="21" y1="12" x2="23" y2="12"></line>
<line x1="4.22" y1="19.78" x2="5.64" y2="18.36"></line>
<line x1="18.36" y1="5.64" x2="19.78" y2="4.22"></line>
</svg>
</symbol>
<symbol id="svg-moon" viewBox="0 0 24 24">
<title>Dark mode</title>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round" class="icon-tabler-moon">
<path stroke="none" d="M0 0h24v24H0z" fill="none" />
<path d="M12 3c.132 0 .263 0 .393 0a7.5 7.5 0 0 0 7.92 12.446a9 9 0 1 1 -8.313 -12.454z" />
</svg>
</symbol>
<symbol id="svg-sun-half" viewBox="0 0 24 24">
<title>Auto light/dark mode</title>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round" class="icon-tabler-shadow">
<path stroke="none" d="M0 0h24v24H0z" fill="none"/>
<circle cx="12" cy="12" r="9" />
<path d="M13 12h5" />
<path d="M13 15h4" />
<path d="M13 18h1" />
<path d="M13 9h4" />
<path d="M13 6h1" />
</svg>
</symbol>
</svg>
<!-- Checkboxes #__navigation and #__toc, each paired with an overlay <label> whose
     visually-hidden text names the "hide" action. Presumably the theme stylesheet
     shows/hides the two sidebars from the checked state; confirm against furo.css. -->
<input type="checkbox" class="sidebar-toggle" name="__navigation" id="__navigation">
<input type="checkbox" class="sidebar-toggle" name="__toc" id="__toc">
<label class="overlay sidebar-overlay" for="__navigation">
<div class="visually-hidden">Hide navigation sidebar</div>
</label>
<label class="overlay toc-overlay" for="__toc">
<div class="visually-hidden">Hide table of contents sidebar</div>
</label>
<div class="page">
<!--<header class="mobile-header">
<div class="header-left">
<label class="nav-overlay-icon" for="__navigation">
<div class="visually-hidden">Toggle site navigation sidebar</div>
<i class="icon"><svg><use href="#svg-menu"></use></svg></i>
</label>
</div>
<div class="header-center">
<a href="../../../"><div class="brand">Gymnasium Documentation</div></a>
</div>
<div class="header-right">
<div class="theme-toggle-container theme-toggle-header">
<button class="theme-toggle">
<div class="visually-hidden">Toggle Light / Dark / Auto color theme</div>
<svg class="theme-icon-when-auto"><use href="#svg-sun-half"></use></svg>
<svg class="theme-icon-when-dark"><use href="#svg-moon"></use></svg>
<svg class="theme-icon-when-light"><use href="#svg-sun"></use></svg>
</button>
</div>
<label class="toc-overlay-icon toc-header-icon" for="__toc">
<div class="visually-hidden">Toggle table of contents sidebar</div>
<i class="icon"><svg><use href="#svg-toc"></use></svg></i>
</label>
</div>
</header>-->
<aside class="sidebar-drawer">
<div class="sidebar-container">
<div class="sidebar-sticky"><a class="farama-sidebar__title" href="../../../">
<img class="farama-header__logo only-light" src="../../../_static/img/gymnasium_black.svg" alt="Light Logo"/>
<img class="farama-header__logo only-dark" src="../../../_static/img/gymnasium_white.svg" alt="Dark Logo"/>
<span class="farama-header__title">Gymnasium Documentation</span>
</a><form class="sidebar-search-container" method="get" action="../../../search/" role="search">
<input class="sidebar-search" placeholder="Search" name="q" aria-label="Search">
<input type="hidden" name="check_keywords" value="yes">
<input type="hidden" name="area" value="default">
</form>
<div id="searchbox"></div><div class="sidebar-scroll"><div class="sidebar-tree">
<p class="caption" role="heading"><span class="caption-text">Introduction</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../../introduction/basic_usage/">Basic Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../introduction/train_agent/">Training an Agent</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../introduction/create_custom_env/">Create a Custom Environment</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../introduction/record_agent/">Recording Agents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../introduction/speed_up_env/">Speeding Up Training</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../introduction/gym_compatibility/">Compatibility with Gym</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../introduction/migration_guide/">Migration Guide - v0.21 to v1.0.0</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">API</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../../api/env/">Env</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../api/registry/">Make and register</a></li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../api/spaces/">Spaces</a><input class="toctree-checkbox" id="toctree-checkbox-1" name="toctree-checkbox-1" role="switch" type="checkbox"/><label for="toctree-checkbox-1"><div class="visually-hidden">Toggle navigation of Spaces</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
<li class="toctree-l2"><a class="reference internal" href="../../../api/spaces/fundamental/">Fundamental Spaces</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../api/spaces/composite/">Composite Spaces</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../api/spaces/utils/">Spaces Utils</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../api/vector/utils/">Utility functions</a></li>
</ul>
</li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../api/wrappers/">Wrappers</a><input class="toctree-checkbox" id="toctree-checkbox-2" name="toctree-checkbox-2" role="switch" type="checkbox"/><label for="toctree-checkbox-2"><div class="visually-hidden">Toggle navigation of Wrappers</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
<li class="toctree-l2"><a class="reference internal" href="../../../api/wrappers/table/">List of Wrappers</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../api/wrappers/misc_wrappers/">Misc Wrappers</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../api/wrappers/action_wrappers/">Action Wrappers</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../api/wrappers/observation_wrappers/">Observation Wrappers</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../api/wrappers/reward_wrappers/">Reward Wrappers</a></li>
</ul>
</li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../api/vector/">Vectorize</a><input class="toctree-checkbox" id="toctree-checkbox-3" name="toctree-checkbox-3" role="switch" type="checkbox"/><label for="toctree-checkbox-3"><div class="visually-hidden">Toggle navigation of Vectorize</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
<li class="toctree-l2"><a class="reference internal" href="../../../api/vector/wrappers/">Wrappers</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../api/vector/async_vector_env/">AsyncVectorEnv</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../api/vector/sync_vector_env/">SyncVectorEnv</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../api/vector/utils/">Utility functions</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../../../api/utils/">Utility functions</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../api/functional/">Functional Env</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Environments</span></p>
<ul>
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../environments/classic_control/">Classic Control</a><input class="toctree-checkbox" id="toctree-checkbox-4" name="toctree-checkbox-4" role="switch" type="checkbox"/><label for="toctree-checkbox-4"><div class="visually-hidden">Toggle navigation of Classic Control</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/classic_control/acrobot/">Acrobot</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/classic_control/cart_pole/">Cart Pole</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/classic_control/mountain_car_continuous/">Mountain Car Continuous</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/classic_control/mountain_car/">Mountain Car</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/classic_control/pendulum/">Pendulum</a></li>
</ul>
</li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../environments/box2d/">Box2D</a><input class="toctree-checkbox" id="toctree-checkbox-5" name="toctree-checkbox-5" role="switch" type="checkbox"/><label for="toctree-checkbox-5"><div class="visually-hidden">Toggle navigation of Box2D</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/box2d/bipedal_walker/">Bipedal Walker</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/box2d/car_racing/">Car Racing</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/box2d/lunar_lander/">Lunar Lander</a></li>
</ul>
</li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../environments/toy_text/">Toy Text</a><input class="toctree-checkbox" id="toctree-checkbox-6" name="toctree-checkbox-6" role="switch" type="checkbox"/><label for="toctree-checkbox-6"><div class="visually-hidden">Toggle navigation of Toy Text</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/toy_text/blackjack/">Blackjack</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/toy_text/taxi/">Taxi</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/toy_text/cliff_walking/">Cliff Walking</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/toy_text/frozen_lake/">Frozen Lake</a></li>
</ul>
</li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../environments/mujoco/">MuJoCo</a><input class="toctree-checkbox" id="toctree-checkbox-7" name="toctree-checkbox-7" role="switch" type="checkbox"/><label for="toctree-checkbox-7"><div class="visually-hidden">Toggle navigation of MuJoCo</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/mujoco/ant/">Ant</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/mujoco/half_cheetah/">Half Cheetah</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/mujoco/hopper/">Hopper</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/mujoco/humanoid/">Humanoid</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/mujoco/humanoid_standup/">Humanoid Standup</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/mujoco/inverted_double_pendulum/">Inverted Double Pendulum</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/mujoco/inverted_pendulum/">Inverted Pendulum</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/mujoco/pusher/">Pusher</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/mujoco/reacher/">Reacher</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/mujoco/swimmer/">Swimmer</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/mujoco/walker2d/">Walker2D</a></li>
</ul>
</li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../environments/atari/">Atari</a><input class="toctree-checkbox" id="toctree-checkbox-8" name="toctree-checkbox-8" role="switch" type="checkbox"/><label for="toctree-checkbox-8"><div class="visually-hidden">Toggle navigation of Atari</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/adventure/">Adventure</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/air_raid/">AirRaid</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/alien/">Alien</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/amidar/">Amidar</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/assault/">Assault</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/asterix/">Asterix</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/asteroids/">Asteroids</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/atlantis/">Atlantis</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/atlantis2/">Atlantis2</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/backgammon/">Backgammon</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/bank_heist/">BankHeist</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/basic_math/">BasicMath</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/battle_zone/">BattleZone</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/beam_rider/">BeamRider</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/berzerk/">Berzerk</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/blackjack/">Blackjack</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/bowling/">Bowling</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/boxing/">Boxing</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/breakout/">Breakout</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/carnival/">Carnival</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/casino/">Casino</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/centipede/">Centipede</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/chopper_command/">ChopperCommand</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/crazy_climber/">CrazyClimber</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/crossbow/">Crossbow</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/darkchambers/">Darkchambers</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/defender/">Defender</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/demon_attack/">DemonAttack</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/donkey_kong/">DonkeyKong</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/double_dunk/">DoubleDunk</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/earthworld/">Earthworld</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/elevator_action/">ElevatorAction</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/enduro/">Enduro</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/entombed/">Entombed</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/et/">Et</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/fishing_derby/">FishingDerby</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/flag_capture/">FlagCapture</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/freeway/">Freeway</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/frogger/">Frogger</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/frostbite/">Frostbite</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/galaxian/">Galaxian</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/gopher/">Gopher</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/gravitar/">Gravitar</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/hangman/">Hangman</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/haunted_house/">HauntedHouse</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/hero/">Hero</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/human_cannonball/">HumanCannonball</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/ice_hockey/">IceHockey</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/jamesbond/">Jamesbond</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/journey_escape/">JourneyEscape</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/kaboom/">Kaboom</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/kangaroo/">Kangaroo</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/keystone_kapers/">KeystoneKapers</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/king_kong/">KingKong</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/klax/">Klax</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/koolaid/">Koolaid</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/krull/">Krull</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/kung_fu_master/">KungFuMaster</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/laser_gates/">LaserGates</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/lost_luggage/">LostLuggage</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/mario_bros/">MarioBros</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/miniature_golf/">MiniatureGolf</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/montezuma_revenge/">MontezumaRevenge</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/mr_do/">MrDo</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/ms_pacman/">MsPacman</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/name_this_game/">NameThisGame</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/othello/">Othello</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/pacman/">Pacman</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/phoenix/">Phoenix</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/pitfall/">Pitfall</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/pitfall2/">Pitfall2</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/pong/">Pong</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/pooyan/">Pooyan</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/private_eye/">PrivateEye</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/qbert/">Qbert</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/riverraid/">Riverraid</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/road_runner/">RoadRunner</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/robotank/">Robotank</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/seaquest/">Seaquest</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/sir_lancelot/">SirLancelot</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/skiing/">Skiing</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/solaris/">Solaris</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/space_invaders/">SpaceInvaders</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/space_war/">SpaceWar</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/star_gunner/">StarGunner</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/superman/">Superman</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/surround/">Surround</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/tennis/">Tennis</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/tetris/">Tetris</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/tic_tac_toe_3d/">TicTacToe3D</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/time_pilot/">TimePilot</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/trondead/">Trondead</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/turmoil/">Turmoil</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/tutankham/">Tutankham</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/up_n_down/">UpNDown</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/venture/">Venture</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/video_checkers/">VideoCheckers</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/video_chess/">VideoChess</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/video_cube/">VideoCube</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/video_pinball/">VideoPinball</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/wizard_of_wor/">WizardOfWor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/word_zapper/">WordZapper</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/yars_revenge/">YarsRevenge</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/atari/zaxxon/">Zaxxon</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../../../environments/third_party_environments/">Third-Party Environments</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Tutorials</span></p>
<ul class="current">
<li class="toctree-l1 has-children"><a class="reference internal" href="../../gymnasium_basics/">Gymnasium Basics</a><input class="toctree-checkbox" id="toctree-checkbox-9" name="toctree-checkbox-9" role="switch" type="checkbox"/><label for="toctree-checkbox-9"><div class="visually-hidden">Toggle navigation of Gymnasium Basics</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
<li class="toctree-l2"><a class="reference internal" href="../../gymnasium_basics/handling_time_limits/">Handling Time Limits</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../gymnasium_basics/implementing_custom_wrappers/">Implementing Custom Wrappers</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../gymnasium_basics/environment_creation/">Make your own custom environment</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../gymnasium_basics/vector_envs_tutorial/">Training A2C with Vector Envs and Domain Randomization</a></li>
</ul>
</li>
<li class="toctree-l1 current has-children"><a class="reference internal" href="../">Training Agents</a><input checked="" class="toctree-checkbox" id="toctree-checkbox-10" name="toctree-checkbox-10" role="switch" type="checkbox"/><label for="toctree-checkbox-10"><div class="visually-hidden">Toggle navigation of Training Agents</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="../reinforce_invpend_gym_v26/">Training using REINFORCE for Mujoco</a></li>
<li class="toctree-l2"><a class="reference internal" href="../blackjack_tutorial/">Solving Blackjack with Q-Learning</a></li>
<li class="toctree-l2 current current-page"><a class="current reference internal" href="#">Frozenlake benchmark</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../../third-party-tutorials/">Third-Party Tutorials</a></li>
<li class="toctree-l1"><a class="reference external" href="https://www.comet.com/docs/v2/integrations/ml-frameworks/gymnasium/?utm_source=gymnasium&amp;utm_medium=partner&amp;utm_campaign=partner_gymnasium_2023&amp;utm_content=docs_gymnasium">Comet Tutorial</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Development</span></p>
<ul>
<li class="toctree-l1"><a class="reference external" href="https://github.com/Farama-Foundation/Gymnasium">Github</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../gymnasium_release_notes/">Gymnasium Release Notes</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../gym_release_notes/">Gym Release Notes</a></li>
<li class="toctree-l1"><a class="reference external" href="https://github.com/Farama-Foundation/Gymnasium/blob/main/docs/README.md">Contribute to the Docs</a></li>
</ul>
</div>
</div>
</div>
</div>
</aside>
<div class="main-container">
<div class="main">
<div class="content">
<div class="article-container">
<a href="#" class="back-to-top muted-link">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24">
<path d="M13 20h-2V8l-5.5 5.5-1.42-1.42L12 4.16l7.92 7.92-1.42 1.42L13 8v12z"></path>
</svg>
<span>Back to top</span>
</a>
<div class="content-icon-container">
<div class="edit-this-page">
<a class="muted-link" href="https://github.com/Farama-Foundation/Gymnasium/edit/main/docs/tutorials/training_agents/FrozenLake_tuto.py" title="Edit this page">
<svg aria-hidden="true" viewBox="0 0 24 24" stroke-width="1.5" stroke="currentColor" fill="none" stroke-linecap="round" stroke-linejoin="round">
<path stroke="none" d="M0 0h24v24H0z" fill="none"/>
<path d="M4 20h4l10.5 -10.5a1.5 1.5 0 0 0 -4 -4l-10.5 10.5v4" />
<line x1="13.5" y1="6.5" x2="17.5" y2="10.5" />
</svg>
<span class="visually-hidden">Edit this page</span>
</a>
</div><div class="theme-toggle-container theme-toggle-content">
<button class="theme-toggle" title="Toggle color theme">
<div class="visually-hidden">Toggle Light / Dark / Auto color theme</div>
<svg class="theme-icon-when-auto">
<use href="#svg-sun-half"></use>
</svg>
<svg class="theme-icon-when-dark">
<use href="#svg-moon"></use>
</svg>
<svg class="theme-icon-when-light">
<use href="#svg-sun"></use>
</svg>
</button>
</div>
<label class="toc-overlay-icon toc-content-icon" for="__toc">
<div class="visually-hidden">Toggle table of contents sidebar</div>
<i class="icon"><svg>
<use href="#svg-toc"></use>
</svg></i>
</label>
</div>
<article role="main">
<section class="sphx-glr-example-title" id="frozenlake-benchmark">
<span id="sphx-glr-tutorials-training-agents-frozenlake-tuto-py"></span><h1>Frozenlake benchmark<a class="headerlink" href="#frozenlake-benchmark" title="Link to this heading">#</a></h1>
<p>In this post we'll compare a bunch of different map sizes on the
<a class="reference external" href="https://gymnasium.farama.org/environments/toy_text/frozen_lake/">FrozenLake</a>
environment from the reinforcement learning
<a class="reference external" href="https://gymnasium.farama.org/">Gymnasium</a> package using the
Q-learning algorithm.</p>
<section id="dependencies">
<h2>Dependencies<a class="headerlink" href="#dependencies" title="Link to this heading">#</a></h2>
<p>Let's first import a few dependencies we'll need.</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="c1"># Author: Andrea Pierré</span>
<span class="c1"># License: MIT License</span>
<span class="kn">from</span> <span class="nn">pathlib</span> <span class="kn">import</span> <span class="n">Path</span>
<span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">NamedTuple</span>
<span class="kn">import</span> <span class="nn">matplotlib.pyplot</span> <span class="k">as</span> <span class="nn">plt</span>
<span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span>
<span class="kn">import</span> <span class="nn">seaborn</span> <span class="k">as</span> <span class="nn">sns</span>
<span class="kn">from</span> <span class="nn">tqdm</span> <span class="kn">import</span> <span class="n">tqdm</span>
<span class="kn">import</span> <span class="nn">gymnasium</span> <span class="k">as</span> <span class="nn">gym</span>
<span class="kn">from</span> <span class="nn">gymnasium.envs.toy_text.frozen_lake</span> <span class="kn">import</span> <span class="n">generate_random_map</span>
<span class="n">sns</span><span class="o">.</span><span class="n">set_theme</span><span class="p">()</span>
<span class="c1"># %load_ext lab_black</span>
</pre></div>
</div>
</section>
<section id="parameters-we-ll-use">
<h2>Parameters we'll use<a class="headerlink" href="#parameters-we-ll-use" title="Link to this heading">#</a></h2>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="k">class</span> <span class="nc">Params</span><span class="p">(</span><span class="n">NamedTuple</span><span class="p">):</span>
<span class="n">total_episodes</span><span class="p">:</span> <span class="nb">int</span> <span class="c1"># Total episodes</span>
<span class="n">learning_rate</span><span class="p">:</span> <span class="nb">float</span> <span class="c1"># Learning rate</span>
<span class="n">gamma</span><span class="p">:</span> <span class="nb">float</span> <span class="c1"># Discounting rate</span>
<span class="n">epsilon</span><span class="p">:</span> <span class="nb">float</span> <span class="c1"># Exploration probability</span>
<span class="n">map_size</span><span class="p">:</span> <span class="nb">int</span> <span class="c1"># Number of tiles of one side of the squared environment</span>
<span class="n">seed</span><span class="p">:</span> <span class="nb">int</span> <span class="c1"># Define a seed so that we get reproducible results</span>
<span class="n">is_slippery</span><span class="p">:</span> <span class="nb">bool</span> <span class="c1"># If true the player will move in intended direction with probability of 1/3 else will move in either perpendicular direction with equal probability of 1/3 in both directions</span>
<span class="n">n_runs</span><span class="p">:</span> <span class="nb">int</span> <span class="c1"># Number of runs</span>
<span class="n">action_size</span><span class="p">:</span> <span class="nb">int</span> <span class="c1"># Number of possible actions</span>
<span class="n">state_size</span><span class="p">:</span> <span class="nb">int</span> <span class="c1"># Number of possible states</span>
<span class="n">proba_frozen</span><span class="p">:</span> <span class="nb">float</span> <span class="c1"># Probability that a tile is frozen</span>
<span class="n">savefig_folder</span><span class="p">:</span> <span class="n">Path</span> <span class="c1"># Root folder where plots are saved</span>
<span class="n">params</span> <span class="o">=</span> <span class="n">Params</span><span class="p">(</span>
<span class="n">total_episodes</span><span class="o">=</span><span class="mi">2000</span><span class="p">,</span>
<span class="n">learning_rate</span><span class="o">=</span><span class="mf">0.8</span><span class="p">,</span>
<span class="n">gamma</span><span class="o">=</span><span class="mf">0.95</span><span class="p">,</span>
<span class="n">epsilon</span><span class="o">=</span><span class="mf">0.1</span><span class="p">,</span>
<span class="n">map_size</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span>
<span class="n">seed</span><span class="o">=</span><span class="mi">123</span><span class="p">,</span>
<span class="n">is_slippery</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="n">n_runs</span><span class="o">=</span><span class="mi">20</span><span class="p">,</span>
<span class="n">action_size</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">state_size</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">proba_frozen</span><span class="o">=</span><span class="mf">0.9</span><span class="p">,</span>
<span class="n">savefig_folder</span><span class="o">=</span><span class="n">Path</span><span class="p">(</span><span class="s2">&quot;../../_static/img/tutorials/&quot;</span><span class="p">),</span>
<span class="p">)</span>
<span class="n">params</span>
<span class="c1"># Set the seed</span>
<span class="n">rng</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">default_rng</span><span class="p">(</span><span class="n">params</span><span class="o">.</span><span class="n">seed</span><span class="p">)</span>
<span class="c1"># Create the figure folder if it doesn&#39;t exist</span>
<span class="n">params</span><span class="o">.</span><span class="n">savefig_folder</span><span class="o">.</span><span class="n">mkdir</span><span class="p">(</span><span class="n">parents</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">exist_ok</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
</pre></div>
</div>
</section>
<section id="the-frozenlake-environment">
<h2>The FrozenLake environment<a class="headerlink" href="#the-frozenlake-environment" title="Link to this heading">#</a></h2>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">env</span> <span class="o">=</span> <span class="n">gym</span><span class="o">.</span><span class="n">make</span><span class="p">(</span>
<span class="s2">&quot;FrozenLake-v1&quot;</span><span class="p">,</span>
<span class="n">is_slippery</span><span class="o">=</span><span class="n">params</span><span class="o">.</span><span class="n">is_slippery</span><span class="p">,</span>
<span class="n">render_mode</span><span class="o">=</span><span class="s2">&quot;rgb_array&quot;</span><span class="p">,</span>
<span class="n">desc</span><span class="o">=</span><span class="n">generate_random_map</span><span class="p">(</span>
<span class="n">size</span><span class="o">=</span><span class="n">params</span><span class="o">.</span><span class="n">map_size</span><span class="p">,</span> <span class="n">p</span><span class="o">=</span><span class="n">params</span><span class="o">.</span><span class="n">proba_frozen</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="n">params</span><span class="o">.</span><span class="n">seed</span>
<span class="p">),</span>
<span class="p">)</span>
</pre></div>
</div>
<section id="creating-the-q-table">
<h3>Creating the Q-table<a class="headerlink" href="#creating-the-q-table" title="Link to this heading">#</a></h3>
<p>In this tutorial we'll be using Q-learning as our learning algorithm and
<span class="math notranslate nohighlight">\(\epsilon\)</span>-greedy to decide which action to pick at each step. You
can have a look at the <a class="reference external" href="#References">References section</a> for some
refreshers on the theory. Now, let's create our Q-table, initialized at
zero, with one row per state and one column per action.</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">params</span> <span class="o">=</span> <span class="n">params</span><span class="o">.</span><span class="n">_replace</span><span class="p">(</span><span class="n">action_size</span><span class="o">=</span><span class="n">env</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">n</span><span class="p">)</span>
<span class="n">params</span> <span class="o">=</span> <span class="n">params</span><span class="o">.</span><span class="n">_replace</span><span class="p">(</span><span class="n">state_size</span><span class="o">=</span><span class="n">env</span><span class="o">.</span><span class="n">observation_space</span><span class="o">.</span><span class="n">n</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Action size: </span><span class="si">{</span><span class="n">params</span><span class="o">.</span><span class="n">action_size</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;State size: </span><span class="si">{</span><span class="n">params</span><span class="o">.</span><span class="n">state_size</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="k">class</span> <span class="nc">Qlearning</span><span class="p">:</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">learning_rate</span><span class="p">,</span> <span class="n">gamma</span><span class="p">,</span> <span class="n">state_size</span><span class="p">,</span> <span class="n">action_size</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">state_size</span> <span class="o">=</span> <span class="n">state_size</span>
<span class="bp">self</span><span class="o">.</span><span class="n">action_size</span> <span class="o">=</span> <span class="n">action_size</span>
<span class="bp">self</span><span class="o">.</span><span class="n">learning_rate</span> <span class="o">=</span> <span class="n">learning_rate</span>
<span class="bp">self</span><span class="o">.</span><span class="n">gamma</span> <span class="o">=</span> <span class="n">gamma</span>
<span class="bp">self</span><span class="o">.</span><span class="n">reset_qtable</span><span class="p">()</span>
<span class="k">def</span> <span class="nf">update</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">state</span><span class="p">,</span> <span class="n">action</span><span class="p">,</span> <span class="n">reward</span><span class="p">,</span> <span class="n">new_state</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Update Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s&#39;,a&#39;) - Q(s,a)]&quot;&quot;&quot;</span>
<span class="n">delta</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">reward</span>
<span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">gamma</span> <span class="o">*</span> <span class="n">np</span><span class="o">.</span><span class="n">max</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">qtable</span><span class="p">[</span><span class="n">new_state</span><span class="p">,</span> <span class="p">:])</span>
<span class="o">-</span> <span class="bp">self</span><span class="o">.</span><span class="n">qtable</span><span class="p">[</span><span class="n">state</span><span class="p">,</span> <span class="n">action</span><span class="p">]</span>
<span class="p">)</span>
<span class="n">q_update</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">qtable</span><span class="p">[</span><span class="n">state</span><span class="p">,</span> <span class="n">action</span><span class="p">]</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">learning_rate</span> <span class="o">*</span> <span class="n">delta</span>
<span class="k">return</span> <span class="n">q_update</span>
<span class="k">def</span> <span class="nf">reset_qtable</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Reset the Q-table.&quot;&quot;&quot;</span>
<span class="bp">self</span><span class="o">.</span><span class="n">qtable</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">zeros</span><span class="p">((</span><span class="bp">self</span><span class="o">.</span><span class="n">state_size</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">action_size</span><span class="p">))</span>
<span class="k">class</span> <span class="nc">EpsilonGreedy</span><span class="p">:</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">epsilon</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">epsilon</span> <span class="o">=</span> <span class="n">epsilon</span>
<span class="k">def</span> <span class="nf">choose_action</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">action_space</span><span class="p">,</span> <span class="n">state</span><span class="p">,</span> <span class="n">qtable</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Choose an action `a` in the current world state (s).&quot;&quot;&quot;</span>
<span class="c1"># First we randomize a number</span>
<span class="n">explor_exploit_tradeoff</span> <span class="o">=</span> <span class="n">rng</span><span class="o">.</span><span class="n">uniform</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
<span class="c1"># Exploration</span>
<span class="k">if</span> <span class="n">explor_exploit_tradeoff</span> <span class="o">&lt;</span> <span class="bp">self</span><span class="o">.</span><span class="n">epsilon</span><span class="p">:</span>
<span class="n">action</span> <span class="o">=</span> <span class="n">action_space</span><span class="o">.</span><span class="n">sample</span><span class="p">()</span>
<span class="c1"># Exploitation (taking the biggest Q-value for this state)</span>
<span class="k">else</span><span class="p">:</span>
<span class="c1"># Break ties randomly</span>
<span class="c1"># If all actions are the same for this state we choose a random one</span>
<span class="c1"># (otherwise `np.argmax()` would always take the first one)</span>
<span class="k">if</span> <span class="n">np</span><span class="o">.</span><span class="n">all</span><span class="p">(</span><span class="n">qtable</span><span class="p">[</span><span class="n">state</span><span class="p">,</span> <span class="p">:]</span> <span class="o">==</span> <span class="n">qtable</span><span class="p">[</span><span class="n">state</span><span class="p">,</span> <span class="mi">0</span><span class="p">]):</span>
<span class="n">action</span> <span class="o">=</span> <span class="n">action_space</span><span class="o">.</span><span class="n">sample</span><span class="p">()</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">action</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">argmax</span><span class="p">(</span><span class="n">qtable</span><span class="p">[</span><span class="n">state</span><span class="p">,</span> <span class="p">:])</span>
<span class="k">return</span> <span class="n">action</span>
</pre></div>
</div>
</section>
<section id="running-the-environment">
<h3>Running the environment<a class="headerlink" href="#running-the-environment" title="Link to this heading">#</a></h3>
<p>Let's instantiate the learner and the explorer.</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">learner</span> <span class="o">=</span> <span class="n">Qlearning</span><span class="p">(</span>
<span class="n">learning_rate</span><span class="o">=</span><span class="n">params</span><span class="o">.</span><span class="n">learning_rate</span><span class="p">,</span>
<span class="n">gamma</span><span class="o">=</span><span class="n">params</span><span class="o">.</span><span class="n">gamma</span><span class="p">,</span>
<span class="n">state_size</span><span class="o">=</span><span class="n">params</span><span class="o">.</span><span class="n">state_size</span><span class="p">,</span>
<span class="n">action_size</span><span class="o">=</span><span class="n">params</span><span class="o">.</span><span class="n">action_size</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">explorer</span> <span class="o">=</span> <span class="n">EpsilonGreedy</span><span class="p">(</span>
<span class="n">epsilon</span><span class="o">=</span><span class="n">params</span><span class="o">.</span><span class="n">epsilon</span><span class="p">,</span>
<span class="p">)</span>
</pre></div>
</div>
<p>This will be our main function to run our environment until the maximum
number of episodes <code class="docutils literal notranslate"><span class="pre">params.total_episodes</span></code>. To account for
stochasticity, we will also run our environment a few times.</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">run_env</span><span class="p">():</span>
<span class="n">rewards</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">zeros</span><span class="p">((</span><span class="n">params</span><span class="o">.</span><span class="n">total_episodes</span><span class="p">,</span> <span class="n">params</span><span class="o">.</span><span class="n">n_runs</span><span class="p">))</span>
<span class="n">steps</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">zeros</span><span class="p">((</span><span class="n">params</span><span class="o">.</span><span class="n">total_episodes</span><span class="p">,</span> <span class="n">params</span><span class="o">.</span><span class="n">n_runs</span><span class="p">))</span>
<span class="n">episodes</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="n">params</span><span class="o">.</span><span class="n">total_episodes</span><span class="p">)</span>
<span class="n">qtables</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">zeros</span><span class="p">((</span><span class="n">params</span><span class="o">.</span><span class="n">n_runs</span><span class="p">,</span> <span class="n">params</span><span class="o">.</span><span class="n">state_size</span><span class="p">,</span> <span class="n">params</span><span class="o">.</span><span class="n">action_size</span><span class="p">))</span>
<span class="n">all_states</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">all_actions</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">run</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">params</span><span class="o">.</span><span class="n">n_runs</span><span class="p">):</span> <span class="c1"># Run several times to account for stochasticity</span>
<span class="n">learner</span><span class="o">.</span><span class="n">reset_qtable</span><span class="p">()</span> <span class="c1"># Reset the Q-table between runs</span>
<span class="k">for</span> <span class="n">episode</span> <span class="ow">in</span> <span class="n">tqdm</span><span class="p">(</span>
<span class="n">episodes</span><span class="p">,</span> <span class="n">desc</span><span class="o">=</span><span class="sa">f</span><span class="s2">&quot;Run </span><span class="si">{</span><span class="n">run</span><span class="si">}</span><span class="s2">/</span><span class="si">{</span><span class="n">params</span><span class="o">.</span><span class="n">n_runs</span><span class="si">}</span><span class="s2"> - Episodes&quot;</span><span class="p">,</span> <span class="n">leave</span><span class="o">=</span><span class="kc">False</span>
<span class="p">):</span>
<span class="n">state</span> <span class="o">=</span> <span class="n">env</span><span class="o">.</span><span class="n">reset</span><span class="p">(</span><span class="n">seed</span><span class="o">=</span><span class="n">params</span><span class="o">.</span><span class="n">seed</span><span class="p">)[</span><span class="mi">0</span><span class="p">]</span> <span class="c1"># Reset the environment</span>
<span class="n">step</span> <span class="o">=</span> <span class="mi">0</span>
<span class="n">done</span> <span class="o">=</span> <span class="kc">False</span>
<span class="n">total_rewards</span> <span class="o">=</span> <span class="mi">0</span>
<span class="k">while</span> <span class="ow">not</span> <span class="n">done</span><span class="p">:</span>
<span class="n">action</span> <span class="o">=</span> <span class="n">explorer</span><span class="o">.</span><span class="n">choose_action</span><span class="p">(</span>
<span class="n">action_space</span><span class="o">=</span><span class="n">env</span><span class="o">.</span><span class="n">action_space</span><span class="p">,</span> <span class="n">state</span><span class="o">=</span><span class="n">state</span><span class="p">,</span> <span class="n">qtable</span><span class="o">=</span><span class="n">learner</span><span class="o">.</span><span class="n">qtable</span>
<span class="p">)</span>
<span class="c1"># Log all states and actions</span>
<span class="n">all_states</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">state</span><span class="p">)</span>
<span class="n">all_actions</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">action</span><span class="p">)</span>
<span class="c1"># Take the action (a) and observe the outcome state(s&#39;) and reward (r)</span>
<span class="n">new_state</span><span class="p">,</span> <span class="n">reward</span><span class="p">,</span> <span class="n">terminated</span><span class="p">,</span> <span class="n">truncated</span><span class="p">,</span> <span class="n">info</span> <span class="o">=</span> <span class="n">env</span><span class="o">.</span><span class="n">step</span><span class="p">(</span><span class="n">action</span><span class="p">)</span>
<span class="n">done</span> <span class="o">=</span> <span class="n">terminated</span> <span class="ow">or</span> <span class="n">truncated</span>
<span class="n">learner</span><span class="o">.</span><span class="n">qtable</span><span class="p">[</span><span class="n">state</span><span class="p">,</span> <span class="n">action</span><span class="p">]</span> <span class="o">=</span> <span class="n">learner</span><span class="o">.</span><span class="n">update</span><span class="p">(</span>
<span class="n">state</span><span class="p">,</span> <span class="n">action</span><span class="p">,</span> <span class="n">reward</span><span class="p">,</span> <span class="n">new_state</span>
<span class="p">)</span>
<span class="n">total_rewards</span> <span class="o">+=</span> <span class="n">reward</span>
<span class="n">step</span> <span class="o">+=</span> <span class="mi">1</span>
<span class="c1"># Our new state is state</span>
<span class="n">state</span> <span class="o">=</span> <span class="n">new_state</span>
<span class="c1"># Log all rewards and steps</span>
<span class="n">rewards</span><span class="p">[</span><span class="n">episode</span><span class="p">,</span> <span class="n">run</span><span class="p">]</span> <span class="o">=</span> <span class="n">total_rewards</span>
<span class="n">steps</span><span class="p">[</span><span class="n">episode</span><span class="p">,</span> <span class="n">run</span><span class="p">]</span> <span class="o">=</span> <span class="n">step</span>
<span class="n">qtables</span><span class="p">[</span><span class="n">run</span><span class="p">,</span> <span class="p">:,</span> <span class="p">:]</span> <span class="o">=</span> <span class="n">learner</span><span class="o">.</span><span class="n">qtable</span>
<span class="k">return</span> <span class="n">rewards</span><span class="p">,</span> <span class="n">steps</span><span class="p">,</span> <span class="n">episodes</span><span class="p">,</span> <span class="n">qtables</span><span class="p">,</span> <span class="n">all_states</span><span class="p">,</span> <span class="n">all_actions</span>
</pre></div>
</div>
</section>
<section id="visualization">
<h3>Visualization<a class="headerlink" href="#visualization" title="Link to this heading">#</a></h3>
<p>To make it easy to plot the results with Seaborn, we'll save the main
results of the simulation in Pandas dataframes.</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">postprocess</span><span class="p">(</span><span class="n">episodes</span><span class="p">,</span> <span class="n">params</span><span class="p">,</span> <span class="n">rewards</span><span class="p">,</span> <span class="n">steps</span><span class="p">,</span> <span class="n">map_size</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Convert the results of the simulation in dataframes.&quot;&quot;&quot;</span>
<span class="n">res</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span>
<span class="n">data</span><span class="o">=</span><span class="p">{</span>
<span class="s2">&quot;Episodes&quot;</span><span class="p">:</span> <span class="n">np</span><span class="o">.</span><span class="n">tile</span><span class="p">(</span><span class="n">episodes</span><span class="p">,</span> <span class="n">reps</span><span class="o">=</span><span class="n">params</span><span class="o">.</span><span class="n">n_runs</span><span class="p">),</span>
<span class="s2">&quot;Rewards&quot;</span><span class="p">:</span> <span class="n">rewards</span><span class="o">.</span><span class="n">flatten</span><span class="p">(</span><span class="n">order</span><span class="o">=</span><span class="s2">&quot;F&quot;</span><span class="p">),</span>
<span class="s2">&quot;Steps&quot;</span><span class="p">:</span> <span class="n">steps</span><span class="o">.</span><span class="n">flatten</span><span class="p">(</span><span class="n">order</span><span class="o">=</span><span class="s2">&quot;F&quot;</span><span class="p">),</span>
<span class="p">}</span>
<span class="p">)</span>
<span class="n">res</span><span class="p">[</span><span class="s2">&quot;cum_rewards&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">rewards</span><span class="o">.</span><span class="n">cumsum</span><span class="p">(</span><span class="n">axis</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">flatten</span><span class="p">(</span><span class="n">order</span><span class="o">=</span><span class="s2">&quot;F&quot;</span><span class="p">)</span>
<span class="n">res</span><span class="p">[</span><span class="s2">&quot;map_size&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">repeat</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;</span><span class="si">{</span><span class="n">map_size</span><span class="si">}</span><span class="s2">x</span><span class="si">{</span><span class="n">map_size</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">,</span> <span class="n">res</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
<span class="n">st</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">data</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;Episodes&quot;</span><span class="p">:</span> <span class="n">episodes</span><span class="p">,</span> <span class="s2">&quot;Steps&quot;</span><span class="p">:</span> <span class="n">steps</span><span class="o">.</span><span class="n">mean</span><span class="p">(</span><span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">)})</span>
<span class="n">st</span><span class="p">[</span><span class="s2">&quot;map_size&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">repeat</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;</span><span class="si">{</span><span class="n">map_size</span><span class="si">}</span><span class="s2">x</span><span class="si">{</span><span class="n">map_size</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">,</span> <span class="n">st</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
<span class="k">return</span> <span class="n">res</span><span class="p">,</span> <span class="n">st</span>
</pre></div>
</div>
<p>We want to plot the policy the agent has learned in the end. To do that
we will:</p>
<ol class="arabic simple">
<li><p>extract the best Q-values from the Q-table for each state,</p></li>
<li><p>get the corresponding best action for those Q-values,</p></li>
<li><p>map each action to an arrow so we can visualize it.</p></li>
</ol>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">qtable_directions_map</span><span class="p">(</span><span class="n">qtable</span><span class="p">,</span> <span class="n">map_size</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Get the best learned action &amp; map it to arrows.&quot;&quot;&quot;</span>
<span class="n">qtable_val_max</span> <span class="o">=</span> <span class="n">qtable</span><span class="o">.</span><span class="n">max</span><span class="p">(</span><span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="n">map_size</span><span class="p">,</span> <span class="n">map_size</span><span class="p">)</span>
<span class="n">qtable_best_action</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">argmax</span><span class="p">(</span><span class="n">qtable</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="n">map_size</span><span class="p">,</span> <span class="n">map_size</span><span class="p">)</span>
<span class="n">directions</span> <span class="o">=</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="s2">&quot;←&quot;</span><span class="p">,</span> <span class="mi">1</span><span class="p">:</span> <span class="s2">&quot;↓&quot;</span><span class="p">,</span> <span class="mi">2</span><span class="p">:</span> <span class="s2">&quot;→&quot;</span><span class="p">,</span> <span class="mi">3</span><span class="p">:</span> <span class="s2">&quot;↑&quot;</span><span class="p">}</span>
<span class="n">qtable_directions</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">empty</span><span class="p">(</span><span class="n">qtable_best_action</span><span class="o">.</span><span class="n">flatten</span><span class="p">()</span><span class="o">.</span><span class="n">shape</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="nb">str</span><span class="p">)</span>
<span class="n">eps</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">finfo</span><span class="p">(</span><span class="nb">float</span><span class="p">)</span><span class="o">.</span><span class="n">eps</span> <span class="c1"># Machine epsilon: gap between 1.0 and the next representable float</span>
<span class="k">for</span> <span class="n">idx</span><span class="p">,</span> <span class="n">val</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">qtable_best_action</span><span class="o">.</span><span class="n">flatten</span><span class="p">()):</span>
<span class="k">if</span> <span class="n">qtable_val_max</span><span class="o">.</span><span class="n">flatten</span><span class="p">()[</span><span class="n">idx</span><span class="p">]</span> <span class="o">&gt;</span> <span class="n">eps</span><span class="p">:</span>
<span class="c1"># Assign an arrow only if a minimal Q-value has been learned as best action</span>
<span class="c1"># otherwise since 0 is a direction, it also gets mapped on the tiles where</span>
<span class="c1"># it didn&#39;t actually learn anything</span>
<span class="n">qtable_directions</span><span class="p">[</span><span class="n">idx</span><span class="p">]</span> <span class="o">=</span> <span class="n">directions</span><span class="p">[</span><span class="n">val</span><span class="p">]</span>
<span class="n">qtable_directions</span> <span class="o">=</span> <span class="n">qtable_directions</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="n">map_size</span><span class="p">,</span> <span class="n">map_size</span><span class="p">)</span>
<span class="k">return</span> <span class="n">qtable_val_max</span><span class="p">,</span> <span class="n">qtable_directions</span>
</pre></div>
</div>
<p>With the following function, we'll plot on the left the last frame of
the simulation. If the agent learned a good policy to solve the task, we
expect to see it on the tile of the treasure in the last frame of the
video. On the right we'll plot the policy the agent has learned. Each
arrow will represent the best action to choose for each tile/state.</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">plot_q_values_map</span><span class="p">(</span><span class="n">qtable</span><span class="p">,</span> <span class="n">env</span><span class="p">,</span> <span class="n">map_size</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Plot the last frame of the simulation and the policy learned.&quot;&quot;&quot;</span>
<span class="n">qtable_val_max</span><span class="p">,</span> <span class="n">qtable_directions</span> <span class="o">=</span> <span class="n">qtable_directions_map</span><span class="p">(</span><span class="n">qtable</span><span class="p">,</span> <span class="n">map_size</span><span class="p">)</span>
<span class="c1"># Plot the last frame</span>
<span class="n">fig</span><span class="p">,</span> <span class="n">ax</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">subplots</span><span class="p">(</span><span class="n">nrows</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">ncols</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">figsize</span><span class="o">=</span><span class="p">(</span><span class="mi">15</span><span class="p">,</span> <span class="mi">5</span><span class="p">))</span>
<span class="n">ax</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">imshow</span><span class="p">(</span><span class="n">env</span><span class="o">.</span><span class="n">render</span><span class="p">())</span>
<span class="n">ax</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">axis</span><span class="p">(</span><span class="s2">&quot;off&quot;</span><span class="p">)</span>
<span class="n">ax</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">set_title</span><span class="p">(</span><span class="s2">&quot;Last frame&quot;</span><span class="p">)</span>
<span class="c1"># Plot the policy</span>
<span class="n">sns</span><span class="o">.</span><span class="n">heatmap</span><span class="p">(</span>
<span class="n">qtable_val_max</span><span class="p">,</span>
<span class="n">annot</span><span class="o">=</span><span class="n">qtable_directions</span><span class="p">,</span>
<span class="n">fmt</span><span class="o">=</span><span class="s2">&quot;&quot;</span><span class="p">,</span>
<span class="n">ax</span><span class="o">=</span><span class="n">ax</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span>
<span class="n">cmap</span><span class="o">=</span><span class="n">sns</span><span class="o">.</span><span class="n">color_palette</span><span class="p">(</span><span class="s2">&quot;Blues&quot;</span><span class="p">,</span> <span class="n">as_cmap</span><span class="o">=</span><span class="kc">True</span><span class="p">),</span>
<span class="n">linewidths</span><span class="o">=</span><span class="mf">0.7</span><span class="p">,</span>
<span class="n">linecolor</span><span class="o">=</span><span class="s2">&quot;black&quot;</span><span class="p">,</span>
<span class="n">xticklabels</span><span class="o">=</span><span class="p">[],</span>
<span class="n">yticklabels</span><span class="o">=</span><span class="p">[],</span>
<span class="n">annot_kws</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;fontsize&quot;</span><span class="p">:</span> <span class="s2">&quot;xx-large&quot;</span><span class="p">},</span>
<span class="p">)</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="n">title</span><span class="o">=</span><span class="s2">&quot;Learned Q-values</span><span class="se">\n</span><span class="s2">Arrows represent best action&quot;</span><span class="p">)</span>
<span class="k">for</span> <span class="n">_</span><span class="p">,</span> <span class="n">spine</span> <span class="ow">in</span> <span class="n">ax</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">spines</span><span class="o">.</span><span class="n">items</span><span class="p">():</span>
<span class="n">spine</span><span class="o">.</span><span class="n">set_visible</span><span class="p">(</span><span class="kc">True</span><span class="p">)</span>
<span class="n">spine</span><span class="o">.</span><span class="n">set_linewidth</span><span class="p">(</span><span class="mf">0.7</span><span class="p">)</span>
<span class="n">spine</span><span class="o">.</span><span class="n">set_color</span><span class="p">(</span><span class="s2">&quot;black&quot;</span><span class="p">)</span>
<span class="n">img_title</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">&quot;frozenlake_q_values_</span><span class="si">{</span><span class="n">map_size</span><span class="si">}</span><span class="s2">x</span><span class="si">{</span><span class="n">map_size</span><span class="si">}</span><span class="s2">.png&quot;</span>
<span class="n">fig</span><span class="o">.</span><span class="n">savefig</span><span class="p">(</span><span class="n">params</span><span class="o">.</span><span class="n">savefig_folder</span> <span class="o">/</span> <span class="n">img_title</span><span class="p">,</span> <span class="n">bbox_inches</span><span class="o">=</span><span class="s2">&quot;tight&quot;</span><span class="p">)</span>
<span class="n">plt</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
<p>As a sanity check, we will plot the distributions of states and actions
with the following function:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">plot_states_actions_distribution</span><span class="p">(</span><span class="n">states</span><span class="p">,</span> <span class="n">actions</span><span class="p">,</span> <span class="n">map_size</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Plot the distributions of states and actions.&quot;&quot;&quot;</span>
<span class="n">labels</span> <span class="o">=</span> <span class="p">{</span><span class="s2">&quot;LEFT&quot;</span><span class="p">:</span> <span class="mi">0</span><span class="p">,</span> <span class="s2">&quot;DOWN&quot;</span><span class="p">:</span> <span class="mi">1</span><span class="p">,</span> <span class="s2">&quot;RIGHT&quot;</span><span class="p">:</span> <span class="mi">2</span><span class="p">,</span> <span class="s2">&quot;UP&quot;</span><span class="p">:</span> <span class="mi">3</span><span class="p">}</span>
<span class="n">fig</span><span class="p">,</span> <span class="n">ax</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">subplots</span><span class="p">(</span><span class="n">nrows</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">ncols</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">figsize</span><span class="o">=</span><span class="p">(</span><span class="mi">15</span><span class="p">,</span> <span class="mi">5</span><span class="p">))</span>
<span class="n">sns</span><span class="o">.</span><span class="n">histplot</span><span class="p">(</span><span class="n">data</span><span class="o">=</span><span class="n">states</span><span class="p">,</span> <span class="n">ax</span><span class="o">=</span><span class="n">ax</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">kde</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="n">ax</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">set_title</span><span class="p">(</span><span class="s2">&quot;States&quot;</span><span class="p">)</span>
<span class="n">sns</span><span class="o">.</span><span class="n">histplot</span><span class="p">(</span><span class="n">data</span><span class="o">=</span><span class="n">actions</span><span class="p">,</span> <span class="n">ax</span><span class="o">=</span><span class="n">ax</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span>
<span class="n">ax</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">set_xticks</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">labels</span><span class="o">.</span><span class="n">values</span><span class="p">()),</span> <span class="n">labels</span><span class="o">=</span><span class="n">labels</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span>
<span class="n">ax</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">set_title</span><span class="p">(</span><span class="s2">&quot;Actions&quot;</span><span class="p">)</span>
<span class="n">fig</span><span class="o">.</span><span class="n">tight_layout</span><span class="p">()</span>
<span class="n">img_title</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">&quot;frozenlake_states_actions_distrib_</span><span class="si">{</span><span class="n">map_size</span><span class="si">}</span><span class="s2">x</span><span class="si">{</span><span class="n">map_size</span><span class="si">}</span><span class="s2">.png&quot;</span>
<span class="n">fig</span><span class="o">.</span><span class="n">savefig</span><span class="p">(</span><span class="n">params</span><span class="o">.</span><span class="n">savefig_folder</span> <span class="o">/</span> <span class="n">img_title</span><span class="p">,</span> <span class="n">bbox_inches</span><span class="o">=</span><span class="s2">&quot;tight&quot;</span><span class="p">)</span>
<span class="n">plt</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
<p>Now we'll be running our agent on a few increasing map sizes:</p>
<ul class="simple">
<li><p><span class="math notranslate nohighlight">\(4 \times 4\)</span>,</p></li>
<li><p><span class="math notranslate nohighlight">\(7 \times 7\)</span>,</p></li>
<li><p><span class="math notranslate nohighlight">\(9 \times 9\)</span>,</p></li>
<li><p><span class="math notranslate nohighlight">\(11 \times 11\)</span>.</p></li>
</ul>
<p>Putting it all together:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">map_sizes</span> <span class="o">=</span> <span class="p">[</span><span class="mi">4</span><span class="p">,</span> <span class="mi">7</span><span class="p">,</span> <span class="mi">9</span><span class="p">,</span> <span class="mi">11</span><span class="p">]</span>
<span class="n">res_all</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">()</span>
<span class="n">st_all</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">()</span>
<span class="k">for</span> <span class="n">map_size</span> <span class="ow">in</span> <span class="n">map_sizes</span><span class="p">:</span>
<span class="n">env</span> <span class="o">=</span> <span class="n">gym</span><span class="o">.</span><span class="n">make</span><span class="p">(</span>
<span class="s2">&quot;FrozenLake-v1&quot;</span><span class="p">,</span>
<span class="n">is_slippery</span><span class="o">=</span><span class="n">params</span><span class="o">.</span><span class="n">is_slippery</span><span class="p">,</span>
<span class="n">render_mode</span><span class="o">=</span><span class="s2">&quot;rgb_array&quot;</span><span class="p">,</span>
<span class="n">desc</span><span class="o">=</span><span class="n">generate_random_map</span><span class="p">(</span>
<span class="n">size</span><span class="o">=</span><span class="n">map_size</span><span class="p">,</span> <span class="n">p</span><span class="o">=</span><span class="n">params</span><span class="o">.</span><span class="n">proba_frozen</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="n">params</span><span class="o">.</span><span class="n">seed</span>
<span class="p">),</span>
<span class="p">)</span>
<span class="n">params</span> <span class="o">=</span> <span class="n">params</span><span class="o">.</span><span class="n">_replace</span><span class="p">(</span><span class="n">action_size</span><span class="o">=</span><span class="n">env</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">n</span><span class="p">)</span>
<span class="n">params</span> <span class="o">=</span> <span class="n">params</span><span class="o">.</span><span class="n">_replace</span><span class="p">(</span><span class="n">state_size</span><span class="o">=</span><span class="n">env</span><span class="o">.</span><span class="n">observation_space</span><span class="o">.</span><span class="n">n</span><span class="p">)</span>
<span class="n">env</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">seed</span><span class="p">(</span>
<span class="n">params</span><span class="o">.</span><span class="n">seed</span>
<span class="p">)</span> <span class="c1"># Set the seed to get reproducible results when sampling the action space</span>
<span class="n">learner</span> <span class="o">=</span> <span class="n">Qlearning</span><span class="p">(</span>
<span class="n">learning_rate</span><span class="o">=</span><span class="n">params</span><span class="o">.</span><span class="n">learning_rate</span><span class="p">,</span>
<span class="n">gamma</span><span class="o">=</span><span class="n">params</span><span class="o">.</span><span class="n">gamma</span><span class="p">,</span>
<span class="n">state_size</span><span class="o">=</span><span class="n">params</span><span class="o">.</span><span class="n">state_size</span><span class="p">,</span>
<span class="n">action_size</span><span class="o">=</span><span class="n">params</span><span class="o">.</span><span class="n">action_size</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">explorer</span> <span class="o">=</span> <span class="n">EpsilonGreedy</span><span class="p">(</span>
<span class="n">epsilon</span><span class="o">=</span><span class="n">params</span><span class="o">.</span><span class="n">epsilon</span><span class="p">,</span>
<span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Map size: </span><span class="si">{</span><span class="n">map_size</span><span class="si">}</span><span class="s2">x</span><span class="si">{</span><span class="n">map_size</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="n">rewards</span><span class="p">,</span> <span class="n">steps</span><span class="p">,</span> <span class="n">episodes</span><span class="p">,</span> <span class="n">qtables</span><span class="p">,</span> <span class="n">all_states</span><span class="p">,</span> <span class="n">all_actions</span> <span class="o">=</span> <span class="n">run_env</span><span class="p">()</span>
<span class="c1"># Save the results in dataframes</span>
<span class="n">res</span><span class="p">,</span> <span class="n">st</span> <span class="o">=</span> <span class="n">postprocess</span><span class="p">(</span><span class="n">episodes</span><span class="p">,</span> <span class="n">params</span><span class="p">,</span> <span class="n">rewards</span><span class="p">,</span> <span class="n">steps</span><span class="p">,</span> <span class="n">map_size</span><span class="p">)</span>
<span class="n">res_all</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">concat</span><span class="p">([</span><span class="n">res_all</span><span class="p">,</span> <span class="n">res</span><span class="p">])</span>
<span class="n">st_all</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">concat</span><span class="p">([</span><span class="n">st_all</span><span class="p">,</span> <span class="n">st</span><span class="p">])</span>
<span class="n">qtable</span> <span class="o">=</span> <span class="n">qtables</span><span class="o">.</span><span class="n">mean</span><span class="p">(</span><span class="n">axis</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span> <span class="c1"># Average the Q-table between runs</span>
<span class="n">plot_states_actions_distribution</span><span class="p">(</span>
<span class="n">states</span><span class="o">=</span><span class="n">all_states</span><span class="p">,</span> <span class="n">actions</span><span class="o">=</span><span class="n">all_actions</span><span class="p">,</span> <span class="n">map_size</span><span class="o">=</span><span class="n">map_size</span>
<span class="p">)</span> <span class="c1"># Sanity check</span>
<span class="n">plot_q_values_map</span><span class="p">(</span><span class="n">qtable</span><span class="p">,</span> <span class="n">env</span><span class="p">,</span> <span class="n">map_size</span><span class="p">)</span>
<span class="n">env</span><span class="o">.</span><span class="n">close</span><span class="p">()</span>
</pre></div>
</div>
<section id="map-size-4-times-4">
<h4>Map size: <span class="math notranslate nohighlight">\(4 \times 4\)</span><a class="headerlink" href="#map-size-4-times-4" title="Link to this heading">#</a></h4>
<p><img alt="States actions histogram 4x4 map" src="../../../_images/frozenlake_states_actions_distrib_4x4.png" /> <img alt="Q-values 4x4 map" src="../../../_images/frozenlake_q_values_4x4.png" /></p>
</section>
<section id="map-size-7-times-7">
<h4>Map size: <span class="math notranslate nohighlight">\(7 \times 7\)</span><a class="headerlink" href="#map-size-7-times-7" title="Link to this heading">#</a></h4>
<p><img alt="States actions histogram 7x7 map" src="../../../_images/frozenlake_states_actions_distrib_7x7.png" /> <img alt="Q-values 7x7 map" src="../../../_images/frozenlake_q_values_7x7.png" /></p>
</section>
<section id="map-size-9-times-9">
<h4>Map size: <span class="math notranslate nohighlight">\(9 \times 9\)</span><a class="headerlink" href="#map-size-9-times-9" title="Link to this heading">#</a></h4>
<p><img alt="States actions histogram 9x9 map" src="../../../_images/frozenlake_states_actions_distrib_9x9.png" /> <img alt="Q-values 9x9 map" src="../../../_images/frozenlake_q_values_9x9.png" /></p>
</section>
<section id="map-size-11-times-11">
<h4>Map size: <span class="math notranslate nohighlight">\(11 \times 11\)</span><a class="headerlink" href="#map-size-11-times-11" title="Link to this heading">#</a></h4>
<p><img alt="States actions histogram 11x11 map" src="../../../_images/frozenlake_states_actions_distrib_11x11.png" /> <img alt="Q-values 11x11 map" src="../../../_images/frozenlake_q_values_11x11.png" /></p>
<p>The <code class="docutils literal notranslate"><span class="pre">DOWN</span></code> and <code class="docutils literal notranslate"><span class="pre">RIGHT</span></code> actions get chosen more often, which makes
sense as the agent starts at the top left of the map and needs to find
its way down to the bottom right. Also, the bigger the map, the fewer of
the states/tiles far away from the starting state get visited.</p>
<p>To check if our agent is learning, we want to plot the cumulative sum of
rewards, as well as the number of steps needed until the end of the
episode. If our agent is learning, we expect the cumulative sum of
rewards to increase and the number of steps needed to solve the task to
decrease.</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">plot_steps_and_rewards</span><span class="p">(</span><span class="n">rewards_df</span><span class="p">,</span> <span class="n">steps_df</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Plot the steps and rewards from dataframes.&quot;&quot;&quot;</span>
<span class="n">fig</span><span class="p">,</span> <span class="n">ax</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">subplots</span><span class="p">(</span><span class="n">nrows</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">ncols</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">figsize</span><span class="o">=</span><span class="p">(</span><span class="mi">15</span><span class="p">,</span> <span class="mi">5</span><span class="p">))</span>
<span class="n">sns</span><span class="o">.</span><span class="n">lineplot</span><span class="p">(</span>
<span class="n">data</span><span class="o">=</span><span class="n">rewards_df</span><span class="p">,</span> <span class="n">x</span><span class="o">=</span><span class="s2">&quot;Episodes&quot;</span><span class="p">,</span> <span class="n">y</span><span class="o">=</span><span class="s2">&quot;cum_rewards&quot;</span><span class="p">,</span> <span class="n">hue</span><span class="o">=</span><span class="s2">&quot;map_size&quot;</span><span class="p">,</span> <span class="n">ax</span><span class="o">=</span><span class="n">ax</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="p">)</span>
<span class="n">ax</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="n">ylabel</span><span class="o">=</span><span class="s2">&quot;Cumulated rewards&quot;</span><span class="p">)</span>
<span class="n">sns</span><span class="o">.</span><span class="n">lineplot</span><span class="p">(</span><span class="n">data</span><span class="o">=</span><span class="n">steps_df</span><span class="p">,</span> <span class="n">x</span><span class="o">=</span><span class="s2">&quot;Episodes&quot;</span><span class="p">,</span> <span class="n">y</span><span class="o">=</span><span class="s2">&quot;Steps&quot;</span><span class="p">,</span> <span class="n">hue</span><span class="o">=</span><span class="s2">&quot;map_size&quot;</span><span class="p">,</span> <span class="n">ax</span><span class="o">=</span><span class="n">ax</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span>
<span class="n">ax</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="n">ylabel</span><span class="o">=</span><span class="s2">&quot;Averaged steps number&quot;</span><span class="p">)</span>
<span class="k">for</span> <span class="n">axi</span> <span class="ow">in</span> <span class="n">ax</span><span class="p">:</span>
<span class="n">axi</span><span class="o">.</span><span class="n">legend</span><span class="p">(</span><span class="n">title</span><span class="o">=</span><span class="s2">&quot;map size&quot;</span><span class="p">)</span>
<span class="n">fig</span><span class="o">.</span><span class="n">tight_layout</span><span class="p">()</span>
<span class="n">img_title</span> <span class="o">=</span> <span class="s2">&quot;frozenlake_steps_and_rewards.png&quot;</span>
<span class="n">fig</span><span class="o">.</span><span class="n">savefig</span><span class="p">(</span><span class="n">params</span><span class="o">.</span><span class="n">savefig_folder</span> <span class="o">/</span> <span class="n">img_title</span><span class="p">,</span> <span class="n">bbox_inches</span><span class="o">=</span><span class="s2">&quot;tight&quot;</span><span class="p">)</span>
<span class="n">plt</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="n">plot_steps_and_rewards</span><span class="p">(</span><span class="n">res_all</span><span class="p">,</span> <span class="n">st_all</span><span class="p">)</span>
</pre></div>
</div>
<p><img alt="Steps and rewards" src="../../../_images/frozenlake_steps_and_rewards.png" /></p>
<p>On the <span class="math notranslate nohighlight">\(4 \times 4\)</span> map, learning converges pretty quickly,
whereas on the <span class="math notranslate nohighlight">\(7 \times 7\)</span> map, the agent needs <span class="math notranslate nohighlight">\(\sim 300\)</span>
episodes, on the <span class="math notranslate nohighlight">\(9 \times 9\)</span> map it needs <span class="math notranslate nohighlight">\(\sim 800\)</span>
episodes, and on the <span class="math notranslate nohighlight">\(11 \times 11\)</span> map it needs <span class="math notranslate nohighlight">\(\sim 1800\)</span>
episodes to converge. Interestingly, the agent seems to be getting more
rewards on the <span class="math notranslate nohighlight">\(9 \times 9\)</span> map than on the <span class="math notranslate nohighlight">\(7 \times 7\)</span>
map, which could mean it didn't reach an optimal policy on the
<span class="math notranslate nohighlight">\(7 \times 7\)</span> map.</p>
<p>In the end, if the agent doesn't get any rewards, rewards don't get
propagated in the Q-values, and the agent doesn't learn anything. In my
experience on this environment using <span class="math notranslate nohighlight">\(\epsilon\)</span>-greedy and those
hyperparameters and environment settings, maps having more than
<span class="math notranslate nohighlight">\(11 \times 11\)</span> tiles start to be difficult to solve. Maybe using a
different exploration algorithm could overcome this. The other parameter
having a big impact is <code class="docutils literal notranslate"><span class="pre">proba_frozen</span></code>, the probability of a tile
being frozen. With too many holes, i.e. <span class="math notranslate nohighlight">\(p&lt;0.9\)</span>, Q-learning
has a hard time avoiding the holes and obtaining a reward
signal.</p>
</section>
</section>
</section>
<section id="references">
<h2>References<a class="headerlink" href="#references" title="Link to this heading">#</a></h2>
<ul class="simple">
<li><p>Code inspired by <a class="reference external" href="https://simoninithomas.github.io/Deep_reinforcement_learning_Course/">Deep Reinforcement Learning
Course</a>
by Thomas Simonini (<a class="reference external" href="http://simoninithomas.com/">http://simoninithomas.com/</a>)</p></li>
<li><p><a class="reference external" href="https://mpatacchiola.github.io/blog/2017/01/15/dissecting-reinforcement-learning-2.html">Dissecting Reinforcement
Learning-Part.2</a></p></li>
<li><p><a class="reference external" href="https://www.davidsilver.uk/teaching/">David Silver's course</a>, in
particular lesson 4 and lesson 5</p></li>
<li><p><a class="reference external" href="https://en.wikipedia.org/wiki/Q-learning">Q-learning article on
Wikipedia</a></p></li>
<li><p><a class="reference external" href="http://incompleteideas.net/book/ebook/node65.html">Q-Learning: Off-Policy TD
Control</a> in
<a class="reference external" href="http://incompleteideas.net/book/ebook/">Reinforcement Learning: An Introduction, by Richard S. Sutton and
Andrew G. Barto</a></p></li>
<li><p><a class="reference external" href="https://www.baeldung.com/cs/epsilon-greedy-q-learning">Epsilon-Greedy
Q-learning</a></p></li>
<li><p><a class="reference external" href="https://gibberblot.github.io/rl-notes/index.html">Introduction to Reinforcement
Learning</a> by Tim
Miller (University of Melbourne)</p></li>
</ul>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-tutorials-training-agents-frozenlake-tuto-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../../_downloads/7720e20eb2c7f37162a67699a7b0ba23/FrozenLake_tuto.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">FrozenLake_tuto.py</span></code></a></p>
</div>
<div class="sphx-glr-download sphx-glr-download-jupyter docutils container">
<p><a class="reference download internal" download="" href="../../../_downloads/6ae5209ec80e231987d889c490d131a9/FrozenLake_tuto.ipynb"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Jupyter</span> <span class="pre">notebook:</span> <span class="pre">FrozenLake_tuto.ipynb</span></code></a></p>
</div>
</div>
</section>
</section>
</article>
</div>
<footer>
<div class="related-pages">
<a class="next-page" href="../../third-party-tutorials/">
<div class="page-info">
<div class="context">
<span>Next</span>
</div>
<div class="title">Third-Party Tutorials</div>
</div>
<svg class="furo-related-icon">
<use href="#svg-arrow-right"></use>
</svg>
</a>
<a class="prev-page" href="../blackjack_tutorial/">
<svg class="furo-related-icon">
<use href="#svg-arrow-right"></use>
</svg>
<div class="page-info">
<div class="context">
<span>Previous</span>
</div>
<div class="title">Solving Blackjack with Q-Learning</div>
</div>
</a>
</div>
<div class="bottom-of-page">
<div class="left-details">
<div class="copyright">
Copyright &#169; 2023 Farama Foundation
</div>
<!--
Made with <a href="https://www.sphinx-doc.org/">Sphinx</a> and <a class="muted-link" href="https://pradyunsg.me">@pradyunsg</a>'s
<a href="https://github.com/pradyunsg/furo">Furo</a>
-->
</div>
<div class="right-details">
<div class="icons">
<a class="muted-link" href="https://github.com/Farama-Foundation/Gymnasium/"
aria-label="On GitHub">
<svg stroke="currentColor" fill="currentColor" stroke-width="0" viewBox="0 0 16 16">
<path fill-rule="evenodd"
d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0 0 16 8c0-4.42-3.58-8-8-8z">
</path>
</svg>
</a>
</div>
</div>
</div>
</footer>
</div>
<aside class="toc-drawer">
<div class="toc-sticky toc-scroll">
<div class="toc-title-container">
<span class="toc-title">
On this page
</span>
</div>
<div class="toc-tree-container">
<div class="toc-tree">
<ul>
<li><a class="reference internal" href="#">Frozenlake benchmark</a><ul>
<li><a class="reference internal" href="#dependencies">Dependencies</a></li>
<li><a class="reference internal" href="#parameters-we-ll-use">Parameters we'll use</a></li>
<li><a class="reference internal" href="#the-frozenlake-environment">The FrozenLake environment</a><ul>
<li><a class="reference internal" href="#creating-the-q-table">Creating the Q-table</a></li>
<li><a class="reference internal" href="#running-the-environment">Running the environment</a></li>
<li><a class="reference internal" href="#visualization">Visualization</a><ul>
<li><a class="reference internal" href="#map-size-4-times-4">Map size: <span class="math notranslate nohighlight">\(4 \times 4\)</span></a></li>
<li><a class="reference internal" href="#map-size-7-times-7">Map size: <span class="math notranslate nohighlight">\(7 \times 7\)</span></a></li>
<li><a class="reference internal" href="#map-size-9-times-9">Map size: <span class="math notranslate nohighlight">\(9 \times 9\)</span></a></li>
<li><a class="reference internal" href="#map-size-11-times-11">Map size: <span class="math notranslate nohighlight">\(11 \times 11\)</span></a></li>
</ul>
</li>
</ul>
</li>
<li><a class="reference internal" href="#references">References</a></li>
</ul>
</li>
</ul>
</div>
</div>
</div>
</aside>
</div>
</div>
</div>
<script>
// Open or close the Farama header menu. The "active" class on
// .farama-header-menu is the source of truth; the ARIA attributes of the
// trigger button and of the menu container are updated to match the new state.
const toggleMenu = () => {
  const trigger = document.querySelector(".farama-header-menu__btn");
  const panel = document.querySelector(".farama-header-menu-container");
  const menu = document.querySelector(".farama-header-menu");
  // Read the state *before* flipping it: "active" means the menu is open now.
  const wasOpen = menu.classList.contains("active");
  trigger.setAttribute("aria-expanded", wasOpen ? "false" : "true");
  panel.setAttribute("aria-hidden", wasOpen ? "true" : "false");
  menu.classList.toggle("active");
};
// Both the header button and the dedicated close control toggle the menu.
for (const control of [
  document.querySelector(".farama-header-menu__btn"),
  document.getElementById("farama-close-menu"),
]) {
  control.addEventListener("click", toggleMenu);
}
</script>
<script async src="https://www.googletagmanager.com/gtag/js?id=G-6H9C8TWXZ8"></script>
<script>
// Queue the two Google Analytics start-up commands ('js' with the current
// date, then 'config' for property G-6H9C8TWXZ8) on window.dataLayer,
// creating the queue first if it does not exist yet.
const enableGtag = () => {
  if (!window.dataLayer) {
    window.dataLayer = [];
  }
  // A regular function is required here: the `arguments` object itself is
  // pushed onto the queue, and arrow functions have no `arguments`.
  const gtag = function () {
    dataLayer.push(arguments);
  };
  gtag('js', new Date());
  gtag('config', 'G-6H9C8TWXZ8');
};
// Cookie-consent gate for Google Analytics, run once at page load.
// - No stored choice: show a banner with "Deny" / "Allow" buttons; the click
//   stores the choice under "acceptedCookieAlert" and removes the banner, and
//   "Allow" additionally starts analytics.
// - Stored choice "true": start analytics immediately.
// - Any other stored value: do nothing.
(() => {
  const STORAGE_KEY = "acceptedCookieAlert";
  const storedChoice = localStorage.getItem(STORAGE_KEY);
  if (storedChoice) {
    if (storedChoice === "true") {
      enableGtag();
    }
    return;
  }

  const banner = document.createElement("div");
  banner.classList.add("cookie-alert");
  const content = document.createElement("div");
  content.classList.add("cookie-alert__container");
  const message = document.createElement("p");
  message.innerHTML = `This page uses <a href="https://analytics.google.com/">
Google Analytics</a> to collect statistics.`;
  content.appendChild(message);

  // Build one banner button. Clicking it records `accepted`, dismisses the
  // banner and then runs `afterDismiss` when one is supplied.
  const makeChoiceButton = (label, id, accepted, afterDismiss) => {
    const button = Object.assign(document.createElement("button"), {
      innerText: label,
      className: "farama-btn cookie-alert__button",
      id: id,
    });
    button.addEventListener("click", () => {
      localStorage.setItem(STORAGE_KEY, accepted);
      banner.remove();
      if (afterDismiss) {
        afterDismiss();
      }
    });
    return button;
  };

  const declineBtn = makeChoiceButton("Deny", "cookie-alert__decline", false);
  const acceptBtn = makeChoiceButton("Allow", "cookie-alert__accept", true, enableGtag);
  content.appendChild(declineBtn);
  content.appendChild(acceptBtn);
  banner.appendChild(content);
  document.body.appendChild(banner);
})();
</script>
<script src="../../../_static/documentation_options.js?v=ed34540e"></script>
<script src="../../../_static/doctools.js?v=888ff710"></script>
<script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../../../_static/scripts/furo.js?v=7660844c"></script>
<script>window.MathJax = {"options": {"processHtmlClass": "tex2jax_process|mathjax_process|math|output_area"}}</script>
<script defer="defer" src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
<script>
// Build a <ul class="farama-header-menu-list"> with one linked <li> per project.
//   projects      - iterable of objects; `link` and `name` are read from each,
//                   plus the optional `image` when logos are displayed
//   displayImages - when truthy, a logo <img> is placed before the project name
// Returns the populated <ul>; the caller is responsible for attaching it.
const createProjectsList = (projects, displayImages) => {
  const list = document.createElement('ul');
  list.className = 'farama-header-menu-list';
  for (const project of projects) {
    const anchor = document.createElement("a");
    anchor.href = project.link;
    if (displayImages) {
      // Projects without their own image fall back to the generic Farama logo.
      const logo = document.createElement("img");
      logo.src = imagesBasepath + (project.image || "/farama_black.svg");
      logo.alt = `${project.name} logo`;
      logo.className = "farama-black-logo-invert";
      anchor.appendChild(logo);
    }
    anchor.appendChild(document.createTextNode(project.name));
    const item = document.createElement("li");
    item.appendChild(anchor);
    list.appendChild(item);
  }
  return list;
};
// Create menu with Farama projects by using the API at farama.org/api/projects.json
// Open (but do not send) a cross-origin request for `url` using `method`.
// Returns an XMLHttpRequest with responseType 'json' when the browser's XHR
// exposes `withCredentials`, an XDomainRequest when only that constructor
// exists (IE8 & IE9), or null when neither transport is available.
const createCORSRequest = (method, url) => {
  const request = new XMLHttpRequest();
  request.responseType = 'json';
  if ("withCredentials" in request) {
    request.open(method, url, true);
    return request;
  }
  if (typeof XDomainRequest === "undefined") {
    // CORS not supported.
    return null;
  }
  // IE8 & IE9
  const legacyRequest = new XDomainRequest();
  legacyRequest.open(method, url);
  return legacyRequest;
};
const url = 'https://farama.org/api/projects.json';
const imagesBasepath = "https://farama.org/assets/images"
const method = 'GET';
let xhr = createCORSRequest(method, url);
if (xhr === null) {
  // createCORSRequest returns null when no CORS-capable transport exists;
  // report it instead of failing on `null.onload`.
  console.error("Unable to load projects");
} else {
  // Build the header menu from the downloaded project list: group the
  // projects into sections, then render one block per section into
  // .farama-header-menu__body.
  xhr.onload = () => {
    const jsonResponse = xhr.response;
    // `response` is not a usable object when the request failed or the body
    // was not valid JSON; bail out rather than let Object.keys() throw.
    if (!jsonResponse || typeof jsonResponse !== "object") {
      console.error("Unable to load projects");
      return;
    }
    // Flat arrays are rendered as a single list; nested objects are rendered
    // as side-by-side subsections.
    const sections = {
      "Core Projects": [],
      "Mature Projects": {
        "Documentation": [],
        "Repositories": [],
      },
      "Incubating Projects": {
        "Documentation": [],
        "Repositories": [],
      },
      "Foundation": [
        {
          name: "About",
          link: "https://farama.org/about"
        },
        {
          name: "Standards",
          link: "https://farama.org/project_standards",
        },
        {
          name: "Donate",
          link: "https://farama.org/donations"
        }
      ]
    };
    // Categorize projects
    Object.keys(jsonResponse).forEach(key => {
      // Declared locally: the original assignment created an implicit global.
      const projectJson = jsonResponse[key];
      // Link to the project website when there is one, else to its GitHub page.
      const hasWebsite = projectJson.website !== null;
      projectJson.link = hasWebsite ? projectJson.website : projectJson.github;
      if (projectJson.type === "core") {
        sections["Core Projects"].push(projectJson);
        return;
      }
      // Every non-core, non-mature type is listed as incubating.
      const group = projectJson.type === "mature" ? "Mature Projects" : "Incubating Projects";
      const bucket = hasWebsite ? "Documentation" : "Repositories";
      sections[group][bucket].push(projectJson);
    });
    const menuContainer = document.querySelector(".farama-header-menu__body");
    Object.keys(sections).forEach(key => {
      const sectionElem = Object.assign(
        document.createElement('div'), {
          className: 'farama-header-menu__section',
        }
      );
      sectionElem.appendChild(Object.assign(document.createElement('span'),
        {
          className: 'farama-header-menu__section-title',
          innerText: key
        }
      ));
      if (Array.isArray(sections[key])) {
        const projects = sections[key];
        const ulElem = createProjectsList(projects, true);
        sectionElem.appendChild(ulElem);
      } else {
        // Section made of named subsections, laid out next to each other.
        const subSections = sections[key];
        const subSectionContainerElem = Object.assign(
          document.createElement('div'), {
            className: 'farama-header-menu__subsections-container',
            style: 'display: flex'
          }
        );
        Object.keys(subSections).forEach(subKey => {
          const subSectionElem = Object.assign(
            document.createElement('div'), {
              className: 'farama-header-menu__subsection',
            }
          );
          subSectionElem.appendChild(Object.assign(document.createElement('span'),
            {
              className: 'farama-header-menu__subsection-title',
              innerText: subKey
            }
          ));
          // NOTE(review): "Foundation" is a flat list and never reaches this
          // branch, so this condition is always true here; confirm whether
          // its logos were meant to be hidden.
          const ulElem = createProjectsList(subSections[subKey], key !== 'Foundation');
          subSectionElem.appendChild(ulElem);
          subSectionContainerElem.appendChild(subSectionElem);
        });
        sectionElem.appendChild(subSectionContainerElem);
      }
      menuContainer.appendChild(sectionElem);
    });
  };
  xhr.onerror = function() {
    console.error("Unable to load projects");
  };
  xhr.send();
}
</script>
<script>
// Repository coordinates. Not read in this script; presumably consumed by
// the scripts of the injected versioning menu (TODO confirm).
const versioningConfig = {
  githubUser: 'Farama-Foundation',
  githubRepo: 'Gymnasium',
};
// Download the versioning menu fragment (root-relative URL, so it is fetched
// from this page's own origin), append it to <body> and activate its scripts.
// A non-200 status or any failure along the way is logged as a warning; the
// page keeps working without the menu.
fetch('/main/_static/versioning/versioning_menu.html').then(response => {
  if (response.status !== 200) {
    console.warn("Unable to load versioning menu", response);
    return undefined;
  }
  // Returned so that failures while reading or injecting the fragment reach
  // the catch handler below.
  return response.text().then(text => {
    const container = document.createElement("div");
    container.innerHTML = text;
    document.querySelector("body").appendChild(container);
    // innerHTML doesn't evaluate scripts, we need to add them dynamically
    Array.from(container.querySelectorAll("script")).forEach(oldScript => {
      const newScript = document.createElement("script");
      Array.from(oldScript.attributes).forEach(attr => newScript.setAttribute(attr.name, attr.value));
      newScript.appendChild(document.createTextNode(oldScript.innerHTML));
      oldScript.parentNode.replaceChild(newScript, oldScript);
    });
  });
}).catch(error => {
  // fetch() rejects on network failure; without this handler that would
  // surface as an unhandled promise rejection.
  console.warn("Unable to load versioning menu", error);
});
</script>
</body>
</html>