Files
Gymnasium/v1.1.0/tutorials/training_agents/blackjack_tutorial/index.html
2025-02-26 11:56:16 +00:00

1144 lines
90 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!doctype html>
<html class="no-js" lang="en" data-content_root="../../../">
<head><meta charset="utf-8"/>
<meta name="viewport" content="width=device-width,initial-scale=1"/>
<meta name="color-scheme" content="light dark">
<meta name="description" content="A standard API for reinforcement learning and a diverse set of reference environments (formerly Gym)">
<meta property="og:title" content="Gymnasium Documentation" />
<meta property="og:type" content="website" />
<meta property="og:description" content="A standard API for reinforcement learning and a diverse set of reference environments (formerly Gym)" />
<!-- Open Graph / Twitter card metadata. The viewport is declared once, right after the charset meta. -->
<meta property="og:url" content="https://gymnasium.farama.org/tutorials/training_agents/blackjack_tutorial.html" /><meta property="og:image" content="https://gymnasium.farama.org/_static/img/gymnasium-github.png" /><meta name="twitter:card" content="summary_large_image">
<link rel="index" title="Index" href="../../../genindex/" /><link rel="search" title="Search" href="../../../search/" /><link rel="next" title="Frozenlake benchmark" href="../FrozenLake_tuto/" /><link rel="prev" title="Training using REINFORCE for Mujoco" href="../reinforce_invpend_gym_v26/" />
<link rel="canonical" href="https://gymnasium.farama.org/tutorials/training_agents/blackjack_tutorial.html" />
<link rel="shortcut icon" href="../../../_static/favicon.png"/><!-- Generated with Sphinx 7.4.7 and Furo 2023.08.19.dev1 -->
<title>Solving Blackjack with Q-Learning - Gymnasium Documentation</title>
<link rel="stylesheet" type="text/css" href="../../../_static/pygments.css?v=8f2a1f02" />
<link rel="stylesheet" type="text/css" href="../../../_static/styles/furo.css?v=3e7f4c72" />
<link rel="stylesheet" type="text/css" href="../../../_static/sg_gallery.css?v=61a4c737" />
<link rel="stylesheet" type="text/css" href="../../../_static/sg_gallery-binder.css?v=f4aeca0c" />
<link rel="stylesheet" type="text/css" href="../../../_static/sg_gallery-dataframe.css?v=2082cf3c" />
<link rel="stylesheet" type="text/css" href="../../../_static/sg_gallery-rendered-html.css?v=1277b6f3" />
<link rel="stylesheet" type="text/css" href="../../../_static/styles/furo-extensions.css?v=82c8b628" />
<style>
/* Default values for the code colour custom properties. */
body {
--color-code-background: #f8f8f8;
--color-code-foreground: black;
}
/* The dark overrides below apply to every medium except print. */
@media not print {
/* Dark values when body carries data-theme="dark". */
body[data-theme="dark"] {
--color-code-background: #202020;
--color-code-foreground: #d0d0d0;
}
/* Same dark values when the user agent prefers a dark scheme, unless body carries data-theme="light". */
@media (prefers-color-scheme: dark) {
body:not([data-theme="light"]) {
--color-code-background: #202020;
--color-code-foreground: #d0d0d0;
}
}
}
</style></head>
<body>
<header class="farama-header" aria-label="Farama header">
<div class="farama-header__container">
<div class="farama-header__left--mobile">
<label class="nav-overlay-icon" for="__navigation">
<div class="visually-hidden">Toggle site navigation sidebar</div>
<svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg">
<defs></defs>
<line x1="0.5" y1="4" x2="23.5" y2="4"></line>
<line x1="0.232" y1="12" x2="23.5" y2="12"></line>
<line x1="0.232" y1="20" x2="23.5" y2="20"></line>
</svg>
</label>
</div>
<div class="farama-header__left farama-header__center--mobile">
<a href="../../../">
<img class="farama-header__logo only-light" src="../../../_static/img/gymnasium_black.svg" alt="Light Logo"/>
<img class="farama-header__logo only-dark" src="../../../_static/img/gymnasium_white.svg" alt="Dark Logo"/>
<span class="farama-header__title">Gymnasium Documentation</span>
</a>
</div>
<div class="farama-header__right">
<div class="farama-header-menu">
<!-- Toggle for the Farama menu (#farama-menu). The aria-label names the button, so the logo and chevron are decorative. -->
<button class="farama-header-menu__btn" aria-label="Open Farama Menu" aria-expanded="false" aria-haspopup="true" aria-controls="farama-menu">
<img class="farama-black-logo-invert" src="../../../_static/img/farama-logo-header.svg" alt="">
<svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg" aria-hidden="true">
<polyline style="stroke-linecap: round; stroke-linejoin: round; fill: none; stroke-width: 2px;" points="1 7 12 18 23 7"></polyline>
</svg>
</button>
<div class="farama-header-menu-container farama-hidden" aria-hidden="true" id="farama-menu">
<div class="farama-header-menu__header">
<a href="https://farama.org">
<img class="farama-header-menu__logo farama-white-logo-invert" src="../../../_static/img/farama_solid_white.svg" alt="Farama Foundation logo">
<span>Farama Foundation</span>
</a>
<div class="farama-header-menu-header__right">
<!-- Icon-only close control: the aria-label gives it an accessible name; the cross icon is decorative. -->
<button id="farama-close-menu" aria-label="Close Farama Menu">
<svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg" fill="none" stroke="currentColor"
stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="icon-close" aria-hidden="true">
<line x1="3" y1="21" x2="21" y2="3"></line>
<line x1="3" y1="3" x2="21" y2="21"></line>
</svg>
</button>
</div>
</div>
<div class="farama-header-menu__body">
<!-- Response from farama.org/api/projects.json -->
</div>
</div>
</div>
</div>
</div>
</header>
<script>
// Set body's data-theme from the "theme" entry in localStorage; fall back to "auto" when the entry is missing or empty.
document.body.dataset.theme = localStorage.getItem("theme") || "auto";
</script>
<svg xmlns="http://www.w3.org/2000/svg" style="display: none;">
<symbol id="svg-toc" viewBox="0 0 24 24">
<title>Contents</title>
<svg stroke="currentColor" fill="currentColor" stroke-width="0" viewBox="0 0 1024 1024">
<path d="M408 442h480c4.4 0 8-3.6 8-8v-56c0-4.4-3.6-8-8-8H408c-4.4 0-8 3.6-8 8v56c0 4.4 3.6 8 8 8zm-8 204c0 4.4 3.6 8 8 8h480c4.4 0 8-3.6 8-8v-56c0-4.4-3.6-8-8-8H408c-4.4 0-8 3.6-8 8v56zm504-486H120c-4.4 0-8 3.6-8 8v56c0 4.4 3.6 8 8 8h784c4.4 0 8-3.6 8-8v-56c0-4.4-3.6-8-8-8zm0 632H120c-4.4 0-8 3.6-8 8v56c0 4.4 3.6 8 8 8h784c4.4 0 8-3.6 8-8v-56c0-4.4-3.6-8-8-8zM115.4 518.9L271.7 642c5.8 4.6 14.4.5 14.4-6.9V388.9c0-7.4-8.5-11.5-14.4-6.9L115.4 505.1a8.74 8.74 0 0 0 0 13.8z"/>
</svg>
</symbol>
<symbol id="svg-menu" viewBox="0 0 24 24">
<title>Menu</title>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather-menu">
<line x1="3" y1="12" x2="21" y2="12"></line>
<line x1="3" y1="6" x2="21" y2="6"></line>
<line x1="3" y1="18" x2="21" y2="18"></line>
</svg>
</symbol>
<symbol id="svg-arrow-right" viewBox="0 0 24 24">
<title>Expand</title>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather-chevron-right">
<polyline points="9 18 15 12 9 6"></polyline>
</svg>
</symbol>
<symbol id="svg-sun" viewBox="0 0 24 24">
<title>Light mode</title>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round" class="feather-sun">
<circle cx="12" cy="12" r="5"></circle>
<line x1="12" y1="1" x2="12" y2="3"></line>
<line x1="12" y1="21" x2="12" y2="23"></line>
<line x1="4.22" y1="4.22" x2="5.64" y2="5.64"></line>
<line x1="18.36" y1="18.36" x2="19.78" y2="19.78"></line>
<line x1="1" y1="12" x2="3" y2="12"></line>
<line x1="21" y1="12" x2="23" y2="12"></line>
<line x1="4.22" y1="19.78" x2="5.64" y2="18.36"></line>
<line x1="18.36" y1="5.64" x2="19.78" y2="4.22"></line>
</svg>
</symbol>
<symbol id="svg-moon" viewBox="0 0 24 24">
<title>Dark mode</title>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round" class="icon-tabler-moon">
<path stroke="none" d="M0 0h24v24H0z" fill="none" />
<path d="M12 3c.132 0 .263 0 .393 0a7.5 7.5 0 0 0 7.92 12.446a9 9 0 1 1 -8.313 -12.454z" />
</svg>
</symbol>
<symbol id="svg-sun-half" viewBox="0 0 24 24">
<title>Auto light/dark mode</title>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round" class="icon-tabler-shadow">
<path stroke="none" d="M0 0h24v24H0z" fill="none"/>
<circle cx="12" cy="12" r="9" />
<path d="M13 12h5" />
<path d="M13 15h4" />
<path d="M13 18h1" />
<path d="M13 9h4" />
<path d="M13 6h1" />
</svg>
</symbol>
</svg>
<input type="checkbox" class="sidebar-toggle" name="__navigation" id="__navigation">
<input type="checkbox" class="sidebar-toggle" name="__toc" id="__toc">
<label class="overlay sidebar-overlay" for="__navigation">
<div class="visually-hidden">Hide navigation sidebar</div>
</label>
<label class="overlay toc-overlay" for="__toc">
<div class="visually-hidden">Hide table of contents sidebar</div>
</label>
<div class="page">
<!--<header class="mobile-header">
<div class="header-left">
<label class="nav-overlay-icon" for="__navigation">
<div class="visually-hidden">Toggle site navigation sidebar</div>
<i class="icon"><svg><use href="#svg-menu"></use></svg></i>
</label>
</div>
<div class="header-center">
<a href="../../../"><div class="brand">Gymnasium Documentation</div></a>
</div>
<div class="header-right">
<div class="theme-toggle-container theme-toggle-header">
<button class="theme-toggle">
<div class="visually-hidden">Toggle Light / Dark / Auto color theme</div>
<svg class="theme-icon-when-auto"><use href="#svg-sun-half"></use></svg>
<svg class="theme-icon-when-dark"><use href="#svg-moon"></use></svg>
<svg class="theme-icon-when-light"><use href="#svg-sun"></use></svg>
</button>
</div>
<label class="toc-overlay-icon toc-header-icon" for="__toc">
<div class="visually-hidden">Toggle table of contents sidebar</div>
<i class="icon"><svg><use href="#svg-toc"></use></svg></i>
</label>
</div>
</header>-->
<aside class="sidebar-drawer">
<div class="sidebar-container">
<div class="sidebar-sticky"><a class="farama-sidebar__title" href="../../../">
<img class="farama-header__logo only-light" src="../../../_static/img/gymnasium_black.svg" alt="Light Logo"/>
<img class="farama-header__logo only-dark" src="../../../_static/img/gymnasium_white.svg" alt="Dark Logo"/>
<span class="farama-header__title">Gymnasium Documentation</span>
</a><form class="sidebar-search-container" method="get" action="../../../search/" role="search">
<input class="sidebar-search" placeholder="Search" name="q" aria-label="Search">
<input type="hidden" name="check_keywords" value="yes">
<input type="hidden" name="area" value="default">
</form>
<div id="searchbox"></div><div class="sidebar-scroll"><div class="sidebar-tree">
<p class="caption" role="heading"><span class="caption-text">Introduction</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../../introduction/basic_usage/">Basic Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../introduction/train_agent/">Training an Agent</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../introduction/create_custom_env/">Create a Custom Environment</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../introduction/record_agent/">Recording Agents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../introduction/speed_up_env/">Speeding Up Training</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../introduction/gym_compatibility/">Compatibility with Gym</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../introduction/migration_guide/">Migration Guide - v0.21 to v1.0.0</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">API</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../../api/env/">Env</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../api/registry/">Make and register</a></li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../api/spaces/">Spaces</a><input class="toctree-checkbox" id="toctree-checkbox-1" name="toctree-checkbox-1" role="switch" type="checkbox"/><label for="toctree-checkbox-1"><div class="visually-hidden">Toggle navigation of Spaces</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
<li class="toctree-l2"><a class="reference internal" href="../../../api/spaces/fundamental/">Fundamental Spaces</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../api/spaces/composite/">Composite Spaces</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../api/spaces/utils/">Spaces Utils</a></li>
</ul>
</li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../api/wrappers/">Wrappers</a><input class="toctree-checkbox" id="toctree-checkbox-2" name="toctree-checkbox-2" role="switch" type="checkbox"/><label for="toctree-checkbox-2"><div class="visually-hidden">Toggle navigation of Wrappers</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
<li class="toctree-l2"><a class="reference internal" href="../../../api/wrappers/table/">List of Wrappers</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../api/wrappers/misc_wrappers/">Misc Wrappers</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../api/wrappers/action_wrappers/">Action Wrappers</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../api/wrappers/observation_wrappers/">Observation Wrappers</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../api/wrappers/reward_wrappers/">Reward Wrappers</a></li>
</ul>
</li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../api/vector/">Vectorize</a><input class="toctree-checkbox" id="toctree-checkbox-3" name="toctree-checkbox-3" role="switch" type="checkbox"/><label for="toctree-checkbox-3"><div class="visually-hidden">Toggle navigation of Vectorize</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
<li class="toctree-l2"><a class="reference internal" href="../../../api/vector/wrappers/">Wrappers</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../api/vector/async_vector_env/">AsyncVectorEnv</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../api/vector/sync_vector_env/">SyncVectorEnv</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../api/vector/utils/">Utility functions</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../../../api/utils/">Utility functions</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../api/functional/">Functional Env</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Environments</span></p>
<ul>
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../environments/classic_control/">Classic Control</a><input class="toctree-checkbox" id="toctree-checkbox-4" name="toctree-checkbox-4" role="switch" type="checkbox"/><label for="toctree-checkbox-4"><div class="visually-hidden">Toggle navigation of Classic Control</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/classic_control/acrobot/">Acrobot</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/classic_control/cart_pole/">Cart Pole</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/classic_control/mountain_car_continuous/">Mountain Car Continuous</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/classic_control/mountain_car/">Mountain Car</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/classic_control/pendulum/">Pendulum</a></li>
</ul>
</li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../environments/box2d/">Box2D</a><input class="toctree-checkbox" id="toctree-checkbox-5" name="toctree-checkbox-5" role="switch" type="checkbox"/><label for="toctree-checkbox-5"><div class="visually-hidden">Toggle navigation of Box2D</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/box2d/bipedal_walker/">Bipedal Walker</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/box2d/car_racing/">Car Racing</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/box2d/lunar_lander/">Lunar Lander</a></li>
</ul>
</li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../environments/toy_text/">Toy Text</a><input class="toctree-checkbox" id="toctree-checkbox-6" name="toctree-checkbox-6" role="switch" type="checkbox"/><label for="toctree-checkbox-6"><div class="visually-hidden">Toggle navigation of Toy Text</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/toy_text/blackjack/">Blackjack</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/toy_text/taxi/">Taxi</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/toy_text/cliff_walking/">Cliff Walking</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/toy_text/frozen_lake/">Frozen Lake</a></li>
</ul>
</li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../environments/mujoco/">MuJoCo</a><input class="toctree-checkbox" id="toctree-checkbox-7" name="toctree-checkbox-7" role="switch" type="checkbox"/><label for="toctree-checkbox-7"><div class="visually-hidden">Toggle navigation of MuJoCo</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/mujoco/ant/">Ant</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/mujoco/half_cheetah/">Half Cheetah</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/mujoco/hopper/">Hopper</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/mujoco/humanoid/">Humanoid</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/mujoco/humanoid_standup/">Humanoid Standup</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/mujoco/inverted_double_pendulum/">Inverted Double Pendulum</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/mujoco/inverted_pendulum/">Inverted Pendulum</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/mujoco/pusher/">Pusher</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/mujoco/reacher/">Reacher</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/mujoco/swimmer/">Swimmer</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../environments/mujoco/walker2d/">Walker2D</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../../../environments/atari/">Atari</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../environments/third_party_environments/">External Environments</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Tutorials</span></p>
<ul class="current">
<li class="toctree-l1 has-children"><a class="reference internal" href="../../gymnasium_basics/">Gymnasium Basics Documentation Links</a><input class="toctree-checkbox" id="toctree-checkbox-8" name="toctree-checkbox-8" role="switch" type="checkbox"/><label for="toctree-checkbox-8"><div class="visually-hidden">Toggle navigation of Gymnasium Basics Documentation Links</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
<li class="toctree-l2"><a class="reference internal" href="../../gymnasium_basics/load_quadruped_model/">Load custom quadruped robot environments</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../gymnasium_basics/handling_time_limits/">Handling Time Limits</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../gymnasium_basics/implementing_custom_wrappers/">Implementing Custom Wrappers</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../gymnasium_basics/environment_creation/">Make your own custom environment</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../gymnasium_basics/vector_envs_tutorial/">Training A2C with Vector Envs and Domain Randomization</a></li>
</ul>
</li>
<li class="toctree-l1 current has-children"><a class="reference internal" href="../">Training Agents links in the Gymnasium Documentation</a><input checked="" class="toctree-checkbox" id="toctree-checkbox-9" name="toctree-checkbox-9" role="switch" type="checkbox"/><label for="toctree-checkbox-9"><div class="visually-hidden">Toggle navigation of Training Agents links in the Gymnasium Documentation</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="../reinforce_invpend_gym_v26/">Training using REINFORCE for Mujoco</a></li>
<li class="toctree-l2 current current-page"><a class="current reference internal" href="#">Solving Blackjack with Q-Learning</a></li>
<li class="toctree-l2"><a class="reference internal" href="../FrozenLake_tuto/">Frozenlake benchmark</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../../third-party-tutorials/">Third-Party Tutorials</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Development</span></p>
<ul>
<li class="toctree-l1"><a class="reference external" href="https://github.com/Farama-Foundation/Gymnasium">Github</a></li>
<li class="toctree-l1"><a class="reference external" href="https://arxiv.org/abs/2407.17032">Paper</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../gymnasium_release_notes/">Gymnasium Release Notes</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../gym_release_notes/">Gym Release Notes</a></li>
<li class="toctree-l1"><a class="reference external" href="https://github.com/Farama-Foundation/Gymnasium/blob/main/docs/README.md">Contribute to the Docs</a></li>
</ul>
</div>
</div>
</div>
</div>
</aside>
<div class="main-container">
<div class="main">
<div class="content">
<div class="article-container">
<a href="#" class="back-to-top muted-link">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24">
<path d="M13 20h-2V8l-5.5 5.5-1.42-1.42L12 4.16l7.92 7.92-1.42 1.42L13 8v12z"></path>
</svg>
<span>Back to top</span>
</a>
<div class="content-icon-container">
<div class="edit-this-page">
<a class="muted-link" href="https://github.com/Farama-Foundation/Gymnasium/edit/main/docs/tutorials/training_agents/blackjack_tutorial.py" title="Edit this page">
<svg aria-hidden="true" viewBox="0 0 24 24" stroke-width="1.5" stroke="currentColor" fill="none" stroke-linecap="round" stroke-linejoin="round">
<path stroke="none" d="M0 0h24v24H0z" fill="none"/>
<path d="M4 20h4l10.5 -10.5a1.5 1.5 0 0 0 -4 -4l-10.5 10.5v4" />
<line x1="13.5" y1="6.5" x2="17.5" y2="10.5" />
</svg>
<span class="visually-hidden">Edit this page</span>
</a>
</div><div class="theme-toggle-container theme-toggle-content">
<button class="theme-toggle" title="Toggle color theme">
<div class="visually-hidden">Toggle Light / Dark / Auto color theme</div>
<svg class="theme-icon-when-auto">
<use href="#svg-sun-half"></use>
</svg>
<svg class="theme-icon-when-dark">
<use href="#svg-moon"></use>
</svg>
<svg class="theme-icon-when-light">
<use href="#svg-sun"></use>
</svg>
</button>
</div>
<label class="toc-overlay-icon toc-content-icon" for="__toc">
<div class="visually-hidden">Toggle table of contents sidebar</div>
<i class="icon"><svg>
<use href="#svg-toc"></use>
</svg></i>
</label>
</div>
<article role="main">
<section class="sphx-glr-example-title" id="solving-blackjack-with-q-learning">
<span id="sphx-glr-tutorials-training-agents-blackjack-tutorial-py"></span><h1>Solving Blackjack with Q-Learning<a class="headerlink" href="#solving-blackjack-with-q-learning" title="Link to this heading"></a></h1>
<a class="only-light reference internal image-reference" href="../../../_images/blackjack_AE_loop.jpg"><img alt="agent-environment-diagram" class="only-light" src="../../../_images/blackjack_AE_loop.jpg" style="width: 650px;" />
</a>
<a class="only-dark reference internal image-reference" href="../../../_images/blackjack_AE_loop_dark.png"><img alt="agent-environment-diagram" class="only-dark" src="../../../_images/blackjack_AE_loop_dark.png" style="width: 650px;" />
</a>
<p>In this tutorial, we'll explore and solve the <em>Blackjack-v1</em>
environment.</p>
<p><strong>Blackjack</strong> is one of the most popular casino card games that is also
infamous for being beatable under certain conditions. This version of
the game uses an infinite deck (we draw the cards with replacement), so
counting cards won't be a viable strategy in our simulated game.
Full documentation can be found at <a class="reference external" href="https://gymnasium.farama.org/environments/toy_text/blackjack">https://gymnasium.farama.org/environments/toy_text/blackjack</a></p>
<p><strong>Objective</strong>: To win, your card sum should be greater than the
dealer's without exceeding 21.</p>
<dl class="simple">
<dt><strong>Actions</strong>: Agents can pick between two actions:</dt><dd><ul class="simple">
<li><p>stand (0): the player takes no more cards</p></li>
<li><p>hit (1): the player will be given another card, however the player could get over 21 and bust</p></li>
</ul>
</dd>
</dl>
<p><strong>Approach</strong>: To solve this environment by yourself, you can pick your
favorite discrete RL algorithm. The presented solution uses <em>Q-learning</em>
(a model-free RL algorithm).</p>
<section id="imports-and-environment-setup">
<h2>Imports and Environment Setup<a class="headerlink" href="#imports-and-environment-setup" title="Link to this heading"></a></h2>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="c1"># Author: Till Zemann</span>
<span class="c1"># License: MIT License</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">__future__</span><span class="w"> </span><span class="kn">import</span> <span class="n">annotations</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">collections</span><span class="w"> </span><span class="kn">import</span> <span class="n">defaultdict</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">matplotlib.pyplot</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">plt</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">numpy</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">np</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">seaborn</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">sns</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">matplotlib.patches</span><span class="w"> </span><span class="kn">import</span> <span class="n">Patch</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">tqdm</span><span class="w"> </span><span class="kn">import</span> <span class="n">tqdm</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">gymnasium</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">gym</span>
<span class="c1"># Let&#39;s start by creating the blackjack environment.</span>
<span class="c1"># Note: We are going to follow the rules from Sutton &amp; Barto.</span>
<span class="c1"># Other versions of the game can be found below for you to experiment.</span>
<span class="n">env</span> <span class="o">=</span> <span class="n">gym</span><span class="o">.</span><span class="n">make</span><span class="p">(</span><span class="s2">&quot;Blackjack-v1&quot;</span><span class="p">,</span> <span class="n">sab</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
</pre></div>
</div>
<div class="highlight-py notranslate"><div class="highlight"><pre><span></span><span class="c1"># Other possible environment configurations are:</span>
<span class="n">env</span> <span class="o">=</span> <span class="n">gym</span><span class="o">.</span><span class="n">make</span><span class="p">(</span><span class="s1">&#39;Blackjack-v1&#39;</span><span class="p">,</span> <span class="n">natural</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">sab</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
<span class="c1"># Whether to give an additional reward for starting with a natural blackjack, i.e. starting with an ace and ten (sum is 21).</span>
<span class="n">env</span> <span class="o">=</span> <span class="n">gym</span><span class="o">.</span><span class="n">make</span><span class="p">(</span><span class="s1">&#39;Blackjack-v1&#39;</span><span class="p">,</span> <span class="n">natural</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">sab</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
<span class="c1"># Whether to follow the exact rules outlined in the book by Sutton and Barto. If `sab` is `True`, the keyword argument `natural` will be ignored.</span>
</pre></div>
</div>
</section>
<section id="observing-the-environment">
<h2>Observing the environment<a class="headerlink" href="#observing-the-environment" title="Link to this heading"></a></h2>
<p>First of all, we call <code class="docutils literal notranslate"><span class="pre">env.reset()</span></code> to start an episode. This function
resets the environment to a starting position and returns an initial
<code class="docutils literal notranslate"><span class="pre">observation</span></code>. We usually also set <code class="docutils literal notranslate"><span class="pre">done</span> <span class="pre">=</span> <span class="pre">False</span></code>. This variable
will be useful later to check if a game is terminated (i.e., the player wins or loses).</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="c1"># reset the environment to get the first observation</span>
<span class="n">done</span> <span class="o">=</span> <span class="kc">False</span>
<span class="n">observation</span><span class="p">,</span> <span class="n">info</span> <span class="o">=</span> <span class="n">env</span><span class="o">.</span><span class="n">reset</span><span class="p">()</span>
<span class="c1"># observation = (16, 9, False)</span>
</pre></div>
</div>
<p>Note that our observation is a 3-tuple consisting of 3 values:</p>
<ul class="simple">
<li><p>The player's current sum</p></li>
<li><p>Value of the dealer's face-up card</p></li>
<li><p>Boolean whether the player holds a usable ace (An ace is usable if it
counts as 11 without busting)</p></li>
</ul>
</section>
<section id="executing-an-action">
<h2>Executing an action<a class="headerlink" href="#executing-an-action" title="Link to this heading"></a></h2>
<p>After receiving our first observation, we are only going to use the
<code class="docutils literal notranslate"><span class="pre">env.step(action)</span></code> function to interact with the environment. This
function takes an action as input and executes it in the environment.
Because that action changes the state of the environment, it returns
five useful variables to us. These are:</p>
<ul class="simple">
<li><p><code class="docutils literal notranslate"><span class="pre">next_state</span></code>: This is the observation that the agent will receive
after taking the action.</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">reward</span></code>: This is the reward that the agent will receive after
taking the action.</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">terminated</span></code>: This is a boolean variable that indicates whether or
not the environment has terminated.</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">truncated</span></code>: This is a boolean variable that also indicates whether
the episode ended by early truncation, i.e., a time limit is reached.</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">info</span></code>: This is a dictionary that might contain additional
information about the environment.</p></li>
</ul>
<p>The <code class="docutils literal notranslate"><span class="pre">next_state</span></code>, <code class="docutils literal notranslate"><span class="pre">reward</span></code>, <code class="docutils literal notranslate"><span class="pre">terminated</span></code> and <code class="docutils literal notranslate"><span class="pre">truncated</span></code> variables are
self-explanatory, but the <code class="docutils literal notranslate"><span class="pre">info</span></code> variable requires some additional
explanation. This variable contains a dictionary that might have some
extra information about the environment, but in the Blackjack-v1
environment you can ignore it. For example, in Atari environments the
info dictionary has an <code class="docutils literal notranslate"><span class="pre">ale.lives</span></code> key that tells us how many lives the
agent has left. If the agent has 0 lives, then the episode is over.</p>
<p>Note that it is not a good idea to call <code class="docutils literal notranslate"><span class="pre">env.render()</span></code> in your training
loop because rendering slows down training by a lot. Rather try to build
an extra loop to evaluate and showcase the agent after training.</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="c1"># sample a random action from all valid actions</span>
<span class="n">action</span> <span class="o">=</span> <span class="n">env</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">sample</span><span class="p">()</span>
<span class="c1"># action=1</span>
<span class="c1"># execute the action in our environment and receive infos from the environment</span>
<span class="n">observation</span><span class="p">,</span> <span class="n">reward</span><span class="p">,</span> <span class="n">terminated</span><span class="p">,</span> <span class="n">truncated</span><span class="p">,</span> <span class="n">info</span> <span class="o">=</span> <span class="n">env</span><span class="o">.</span><span class="n">step</span><span class="p">(</span><span class="n">action</span><span class="p">)</span>
<span class="c1"># observation=(24, 10, False)</span>
<span class="c1"># reward=-1.0</span>
<span class="c1"># terminated=True</span>
<span class="c1"># truncated=False</span>
<span class="c1"># info={}</span>
</pre></div>
</div>
<p>Once <code class="docutils literal notranslate"><span class="pre">terminated</span> <span class="pre">=</span> <span class="pre">True</span></code> or <code class="docutils literal notranslate"><span class="pre">truncated=True</span></code>, we should stop the
current episode and begin a new one with <code class="docutils literal notranslate"><span class="pre">env.reset()</span></code>. If you
continue executing actions without resetting the environment, it still
responds but the output won't be useful for training (it might even be
harmful if the agent learns on invalid data).</p>
</section>
<section id="building-an-agent">
<h2>Building an agent<a class="headerlink" href="#building-an-agent" title="Link to this heading"></a></h2>
<p>Let's build a <code class="docutils literal notranslate"><span class="pre">Q-learning</span> <span class="pre">agent</span></code> to solve <em>Blackjack-v1</em>! We'll need
some functions for picking an action and updating the agent's action
values. To ensure that the agent explores the environment, one possible
solution is the <code class="docutils literal notranslate"><span class="pre">epsilon-greedy</span></code> strategy, where we pick a random
action with probability <code class="docutils literal notranslate"><span class="pre">epsilon</span></code> and the greedy action (currently
valued as the best) with probability <code class="docutils literal notranslate"><span class="pre">1</span> <span class="pre">-</span> <span class="pre">epsilon</span></code>.</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="nc">BlackjackAgent</span><span class="p">:</span>
<span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">env</span><span class="p">,</span>
<span class="n">learning_rate</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span>
<span class="n">initial_epsilon</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span>
<span class="n">epsilon_decay</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span>
<span class="n">final_epsilon</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span>
<span class="n">discount_factor</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.95</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Initialize a Reinforcement Learning agent with an empty dictionary</span>
<span class="sd"> of state-action values (q_values), a learning rate and an epsilon.</span>
<span class="sd"> Args:</span>
<span class="sd"> learning_rate: The learning rate</span>
<span class="sd"> initial_epsilon: The initial epsilon value</span>
<span class="sd"> epsilon_decay: The decay for epsilon</span>
<span class="sd"> final_epsilon: The final epsilon value</span>
<span class="sd"> discount_factor: The discount factor for computing the Q-value</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="bp">self</span><span class="o">.</span><span class="n">q_values</span> <span class="o">=</span> <span class="n">defaultdict</span><span class="p">(</span><span class="k">lambda</span><span class="p">:</span> <span class="n">np</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="n">env</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">n</span><span class="p">))</span>
<span class="bp">self</span><span class="o">.</span><span class="n">lr</span> <span class="o">=</span> <span class="n">learning_rate</span>
<span class="bp">self</span><span class="o">.</span><span class="n">discount_factor</span> <span class="o">=</span> <span class="n">discount_factor</span>
<span class="bp">self</span><span class="o">.</span><span class="n">epsilon</span> <span class="o">=</span> <span class="n">initial_epsilon</span>
<span class="bp">self</span><span class="o">.</span><span class="n">epsilon_decay</span> <span class="o">=</span> <span class="n">epsilon_decay</span>
<span class="bp">self</span><span class="o">.</span><span class="n">final_epsilon</span> <span class="o">=</span> <span class="n">final_epsilon</span>
<span class="bp">self</span><span class="o">.</span><span class="n">training_error</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">def</span><span class="w"> </span><span class="nf">get_action</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">env</span><span class="p">,</span> <span class="n">obs</span><span class="p">:</span> <span class="nb">tuple</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">int</span><span class="p">,</span> <span class="nb">bool</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the best action with probability (1 - epsilon)</span>
<span class="sd"> otherwise a random action with probability epsilon to ensure exploration.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="c1"># with probability epsilon return a random action to explore the environment</span>
<span class="k">if</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">random</span><span class="p">()</span> <span class="o">&lt;</span> <span class="bp">self</span><span class="o">.</span><span class="n">epsilon</span><span class="p">:</span>
<span class="k">return</span> <span class="n">env</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">sample</span><span class="p">()</span>
<span class="c1"># with probability (1 - epsilon) act greedily (exploit)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="nb">int</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">argmax</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">q_values</span><span class="p">[</span><span class="n">obs</span><span class="p">]))</span>
<span class="k">def</span><span class="w"> </span><span class="nf">update</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">obs</span><span class="p">:</span> <span class="nb">tuple</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">int</span><span class="p">,</span> <span class="nb">bool</span><span class="p">],</span>
<span class="n">action</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span>
<span class="n">reward</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span>
<span class="n">terminated</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span>
<span class="n">next_obs</span><span class="p">:</span> <span class="nb">tuple</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">int</span><span class="p">,</span> <span class="nb">bool</span><span class="p">],</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Updates the Q-value of an action.&quot;&quot;&quot;</span>
<span class="n">future_q_value</span> <span class="o">=</span> <span class="p">(</span><span class="ow">not</span> <span class="n">terminated</span><span class="p">)</span> <span class="o">*</span> <span class="n">np</span><span class="o">.</span><span class="n">max</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">q_values</span><span class="p">[</span><span class="n">next_obs</span><span class="p">])</span>
<span class="n">temporal_difference</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">reward</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">discount_factor</span> <span class="o">*</span> <span class="n">future_q_value</span> <span class="o">-</span> <span class="bp">self</span><span class="o">.</span><span class="n">q_values</span><span class="p">[</span><span class="n">obs</span><span class="p">][</span><span class="n">action</span><span class="p">]</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">q_values</span><span class="p">[</span><span class="n">obs</span><span class="p">][</span><span class="n">action</span><span class="p">]</span> <span class="o">=</span> <span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">q_values</span><span class="p">[</span><span class="n">obs</span><span class="p">][</span><span class="n">action</span><span class="p">]</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">lr</span> <span class="o">*</span> <span class="n">temporal_difference</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">training_error</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">temporal_difference</span><span class="p">)</span>
<span class="k">def</span><span class="w"> </span><span class="nf">decay_epsilon</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">epsilon</span> <span class="o">=</span> <span class="nb">max</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">final_epsilon</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">epsilon</span> <span class="o">-</span> <span class="bp">self</span><span class="o">.</span><span class="n">epsilon_decay</span><span class="p">)</span>
</pre></div>
</div>
<p>To train the agent, we will let the agent play one episode (one complete
game is called an episode) at a time and then update its Q-values after
each step (one single action in a game is called a step).</p>
<p>The agent will have to experience a lot of episodes to explore the
environment sufficiently.</p>
<p>Now we should be ready to build the training loop.</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="c1"># hyperparameters</span>
<span class="n">learning_rate</span> <span class="o">=</span> <span class="mf">0.01</span>
<span class="n">n_episodes</span> <span class="o">=</span> <span class="mi">100_000</span>
<span class="n">start_epsilon</span> <span class="o">=</span> <span class="mf">1.0</span>
<span class="n">epsilon_decay</span> <span class="o">=</span> <span class="n">start_epsilon</span> <span class="o">/</span> <span class="p">(</span><span class="n">n_episodes</span> <span class="o">/</span> <span class="mi">2</span><span class="p">)</span> <span class="c1"># reduce the exploration over time</span>
<span class="n">final_epsilon</span> <span class="o">=</span> <span class="mf">0.1</span>
<span class="n">agent</span> <span class="o">=</span> <span class="n">BlackjackAgent</span><span class="p">(</span>
<span class="n">env</span><span class="o">=</span><span class="n">env</span><span class="p">,</span>
<span class="n">learning_rate</span><span class="o">=</span><span class="n">learning_rate</span><span class="p">,</span>
<span class="n">initial_epsilon</span><span class="o">=</span><span class="n">start_epsilon</span><span class="p">,</span>
<span class="n">epsilon_decay</span><span class="o">=</span><span class="n">epsilon_decay</span><span class="p">,</span>
<span class="n">final_epsilon</span><span class="o">=</span><span class="n">final_epsilon</span><span class="p">,</span>
<span class="p">)</span>
</pre></div>
</div>
<p>Great, let's train!</p>
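<p>Info: The current hyperparameters are set to quickly train a decent agent.
If you want to converge to the optimal policy, try increasing
<code class="docutils literal notranslate"><span class="pre">n_episodes</span></code> by 10x and lowering the <code class="docutils literal notranslate"><span class="pre">learning_rate</span></code> (e.g. to 0.001).</p>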
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">env</span> <span class="o">=</span> <span class="n">gym</span><span class="o">.</span><span class="n">wrappers</span><span class="o">.</span><span class="n">RecordEpisodeStatistics</span><span class="p">(</span><span class="n">env</span><span class="p">,</span> <span class="n">deque_size</span><span class="o">=</span><span class="n">n_episodes</span><span class="p">)</span>
<span class="k">for</span> <span class="n">episode</span> <span class="ow">in</span> <span class="n">tqdm</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="n">n_episodes</span><span class="p">)):</span>
<span class="n">obs</span><span class="p">,</span> <span class="n">info</span> <span class="o">=</span> <span class="n">env</span><span class="o">.</span><span class="n">reset</span><span class="p">()</span>
<span class="n">done</span> <span class="o">=</span> <span class="kc">False</span>
<span class="c1"># play one episode</span>
<span class="k">while</span> <span class="ow">not</span> <span class="n">done</span><span class="p">:</span>
<span class="n">action</span> <span class="o">=</span> <span class="n">agent</span><span class="o">.</span><span class="n">get_action</span><span class="p">(</span><span class="n">env</span><span class="p">,</span> <span class="n">obs</span><span class="p">)</span>
<span class="n">next_obs</span><span class="p">,</span> <span class="n">reward</span><span class="p">,</span> <span class="n">terminated</span><span class="p">,</span> <span class="n">truncated</span><span class="p">,</span> <span class="n">info</span> <span class="o">=</span> <span class="n">env</span><span class="o">.</span><span class="n">step</span><span class="p">(</span><span class="n">action</span><span class="p">)</span>
<span class="c1"># update the agent</span>
<span class="n">agent</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="n">obs</span><span class="p">,</span> <span class="n">action</span><span class="p">,</span> <span class="n">reward</span><span class="p">,</span> <span class="n">terminated</span><span class="p">,</span> <span class="n">next_obs</span><span class="p">)</span>
<span class="c1"># update if the environment is done and the current obs</span>
<span class="n">done</span> <span class="o">=</span> <span class="n">terminated</span> <span class="ow">or</span> <span class="n">truncated</span>
<span class="n">obs</span> <span class="o">=</span> <span class="n">next_obs</span>
<span class="n">agent</span><span class="o">.</span><span class="n">decay_epsilon</span><span class="p">()</span>
</pre></div>
</div>
</section>
<section id="visualizing-the-training">
<h2>Visualizing the training<a class="headerlink" href="#visualizing-the-training" title="Link to this heading"></a></h2>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">rolling_length</span> <span class="o">=</span> <span class="mi">500</span>
<span class="n">fig</span><span class="p">,</span> <span class="n">axs</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">subplots</span><span class="p">(</span><span class="n">ncols</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">figsize</span><span class="o">=</span><span class="p">(</span><span class="mi">12</span><span class="p">,</span> <span class="mi">5</span><span class="p">))</span>
<span class="n">axs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">set_title</span><span class="p">(</span><span class="s2">&quot;Episode rewards&quot;</span><span class="p">)</span>
<span class="c1"># compute and assign a rolling average of the data to provide a smoother graph</span>
<span class="n">reward_moving_average</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">np</span><span class="o">.</span><span class="n">convolve</span><span class="p">(</span>
<span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">env</span><span class="o">.</span><span class="n">return_queue</span><span class="p">)</span><span class="o">.</span><span class="n">flatten</span><span class="p">(),</span> <span class="n">np</span><span class="o">.</span><span class="n">ones</span><span class="p">(</span><span class="n">rolling_length</span><span class="p">),</span> <span class="n">mode</span><span class="o">=</span><span class="s2">&quot;valid&quot;</span>
<span class="p">)</span>
<span class="o">/</span> <span class="n">rolling_length</span>
<span class="p">)</span>
<span class="n">axs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">reward_moving_average</span><span class="p">)),</span> <span class="n">reward_moving_average</span><span class="p">)</span>
<span class="n">axs</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">set_title</span><span class="p">(</span><span class="s2">&quot;Episode lengths&quot;</span><span class="p">)</span>
<span class="n">length_moving_average</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">np</span><span class="o">.</span><span class="n">convolve</span><span class="p">(</span>
<span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">env</span><span class="o">.</span><span class="n">length_queue</span><span class="p">)</span><span class="o">.</span><span class="n">flatten</span><span class="p">(),</span> <span class="n">np</span><span class="o">.</span><span class="n">ones</span><span class="p">(</span><span class="n">rolling_length</span><span class="p">),</span> <span class="n">mode</span><span class="o">=</span><span class="s2">&quot;same&quot;</span>
<span class="p">)</span>
<span class="o">/</span> <span class="n">rolling_length</span>
<span class="p">)</span>
<span class="n">axs</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">length_moving_average</span><span class="p">)),</span> <span class="n">length_moving_average</span><span class="p">)</span>
<span class="n">axs</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span><span class="o">.</span><span class="n">set_title</span><span class="p">(</span><span class="s2">&quot;Training Error&quot;</span><span class="p">)</span>
<span class="n">training_error_moving_average</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">np</span><span class="o">.</span><span class="n">convolve</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">agent</span><span class="o">.</span><span class="n">training_error</span><span class="p">),</span> <span class="n">np</span><span class="o">.</span><span class="n">ones</span><span class="p">(</span><span class="n">rolling_length</span><span class="p">),</span> <span class="n">mode</span><span class="o">=</span><span class="s2">&quot;same&quot;</span><span class="p">)</span>
<span class="o">/</span> <span class="n">rolling_length</span>
<span class="p">)</span>
<span class="n">axs</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">training_error_moving_average</span><span class="p">)),</span> <span class="n">training_error_moving_average</span><span class="p">)</span>
<span class="n">plt</span><span class="o">.</span><span class="n">tight_layout</span><span class="p">()</span>
<span class="n">plt</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
<img alt="Three training plots showing rolling averages of episode rewards, episode lengths and training error" src="../../../_images/blackjack_training_plots.png" />
</section>
<section id="visualising-the-policy">
<h2>Visualising the policy<a class="headerlink" href="#visualising-the-policy" title="Link to this heading"></a></h2>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="k">def</span><span class="w"> </span><span class="nf">create_grids</span><span class="p">(</span><span class="n">agent</span><span class="p">,</span> <span class="n">usable_ace</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Create value and policy grid given an agent.&quot;&quot;&quot;</span>
<span class="c1"># convert our state-action values to state values</span>
<span class="c1"># and build a policy dictionary that maps observations to actions</span>
<span class="n">state_value</span> <span class="o">=</span> <span class="n">defaultdict</span><span class="p">(</span><span class="nb">float</span><span class="p">)</span>
<span class="n">policy</span> <span class="o">=</span> <span class="n">defaultdict</span><span class="p">(</span><span class="nb">int</span><span class="p">)</span>
<span class="k">for</span> <span class="n">obs</span><span class="p">,</span> <span class="n">action_values</span> <span class="ow">in</span> <span class="n">agent</span><span class="o">.</span><span class="n">q_values</span><span class="o">.</span><span class="n">items</span><span class="p">():</span>
<span class="n">state_value</span><span class="p">[</span><span class="n">obs</span><span class="p">]</span> <span class="o">=</span> <span class="nb">float</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">max</span><span class="p">(</span><span class="n">action_values</span><span class="p">))</span>
<span class="n">policy</span><span class="p">[</span><span class="n">obs</span><span class="p">]</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">argmax</span><span class="p">(</span><span class="n">action_values</span><span class="p">))</span>
<span class="n">player_count</span><span class="p">,</span> <span class="n">dealer_count</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">meshgrid</span><span class="p">(</span>
<span class="c1"># players count, dealers face-up card</span>
<span class="n">np</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="mi">12</span><span class="p">,</span> <span class="mi">22</span><span class="p">),</span>
<span class="n">np</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">11</span><span class="p">),</span>
<span class="p">)</span>
<span class="c1"># create the value grid for plotting</span>
<span class="n">value</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">apply_along_axis</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">obs</span><span class="p">:</span> <span class="n">state_value</span><span class="p">[(</span><span class="n">obs</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">obs</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="n">usable_ace</span><span class="p">)],</span>
<span class="n">axis</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span>
<span class="n">arr</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">dstack</span><span class="p">([</span><span class="n">player_count</span><span class="p">,</span> <span class="n">dealer_count</span><span class="p">]),</span>
<span class="p">)</span>
<span class="n">value_grid</span> <span class="o">=</span> <span class="n">player_count</span><span class="p">,</span> <span class="n">dealer_count</span><span class="p">,</span> <span class="n">value</span>
<span class="c1"># create the policy grid for plotting</span>
<span class="n">policy_grid</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">apply_along_axis</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">obs</span><span class="p">:</span> <span class="n">policy</span><span class="p">[(</span><span class="n">obs</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">obs</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="n">usable_ace</span><span class="p">)],</span>
<span class="n">axis</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span>
<span class="n">arr</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">dstack</span><span class="p">([</span><span class="n">player_count</span><span class="p">,</span> <span class="n">dealer_count</span><span class="p">]),</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">value_grid</span><span class="p">,</span> <span class="n">policy_grid</span>
<span class="k">def</span><span class="w"> </span><span class="nf">create_plots</span><span class="p">(</span><span class="n">value_grid</span><span class="p">,</span> <span class="n">policy_grid</span><span class="p">,</span> <span class="n">title</span><span class="p">:</span> <span class="nb">str</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Creates a plot using a value and policy grid.&quot;&quot;&quot;</span>
<span class="c1"># create a new figure with 2 subplots (left: state values, right: policy)</span>
<span class="n">player_count</span><span class="p">,</span> <span class="n">dealer_count</span><span class="p">,</span> <span class="n">value</span> <span class="o">=</span> <span class="n">value_grid</span>
<span class="n">fig</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">figure</span><span class="p">(</span><span class="n">figsize</span><span class="o">=</span><span class="n">plt</span><span class="o">.</span><span class="n">figaspect</span><span class="p">(</span><span class="mf">0.4</span><span class="p">))</span>
<span class="n">fig</span><span class="o">.</span><span class="n">suptitle</span><span class="p">(</span><span class="n">title</span><span class="p">,</span> <span class="n">fontsize</span><span class="o">=</span><span class="mi">16</span><span class="p">)</span>
<span class="c1"># plot the state values</span>
<span class="n">ax1</span> <span class="o">=</span> <span class="n">fig</span><span class="o">.</span><span class="n">add_subplot</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="n">projection</span><span class="o">=</span><span class="s2">&quot;3d&quot;</span><span class="p">)</span>
<span class="n">ax1</span><span class="o">.</span><span class="n">plot_surface</span><span class="p">(</span>
<span class="n">player_count</span><span class="p">,</span>
<span class="n">dealer_count</span><span class="p">,</span>
<span class="n">value</span><span class="p">,</span>
<span class="n">rstride</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="n">cstride</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="n">cmap</span><span class="o">=</span><span class="s2">&quot;viridis&quot;</span><span class="p">,</span>
<span class="n">edgecolor</span><span class="o">=</span><span class="s2">&quot;none&quot;</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">plt</span><span class="o">.</span><span class="n">xticks</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">12</span><span class="p">,</span> <span class="mi">22</span><span class="p">),</span> <span class="nb">range</span><span class="p">(</span><span class="mi">12</span><span class="p">,</span> <span class="mi">22</span><span class="p">))</span>
<span class="n">plt</span><span class="o">.</span><span class="n">yticks</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">11</span><span class="p">),</span> <span class="p">[</span><span class="s2">&quot;A&quot;</span><span class="p">]</span> <span class="o">+</span> <span class="nb">list</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">11</span><span class="p">)))</span>
<span class="n">ax1</span><span class="o">.</span><span class="n">set_title</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;State values: </span><span class="si">{</span><span class="n">title</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="n">ax1</span><span class="o">.</span><span class="n">set_xlabel</span><span class="p">(</span><span class="s2">&quot;Player sum&quot;</span><span class="p">)</span>
<span class="n">ax1</span><span class="o">.</span><span class="n">set_ylabel</span><span class="p">(</span><span class="s2">&quot;Dealer showing&quot;</span><span class="p">)</span>
<span class="n">ax1</span><span class="o">.</span><span class="n">zaxis</span><span class="o">.</span><span class="n">set_rotate_label</span><span class="p">(</span><span class="kc">False</span><span class="p">)</span>
<span class="n">ax1</span><span class="o">.</span><span class="n">set_zlabel</span><span class="p">(</span><span class="s2">&quot;Value&quot;</span><span class="p">,</span> <span class="n">fontsize</span><span class="o">=</span><span class="mi">14</span><span class="p">,</span> <span class="n">rotation</span><span class="o">=</span><span class="mi">90</span><span class="p">)</span>
<span class="n">ax1</span><span class="o">.</span><span class="n">view_init</span><span class="p">(</span><span class="mi">20</span><span class="p">,</span> <span class="mi">220</span><span class="p">)</span>
<span class="c1"># plot the policy</span>
<span class="n">fig</span><span class="o">.</span><span class="n">add_subplot</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
<span class="n">ax2</span> <span class="o">=</span> <span class="n">sns</span><span class="o">.</span><span class="n">heatmap</span><span class="p">(</span><span class="n">policy_grid</span><span class="p">,</span> <span class="n">linewidth</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> <span class="n">annot</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">cmap</span><span class="o">=</span><span class="s2">&quot;Accent_r&quot;</span><span class="p">,</span> <span class="n">cbar</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
<span class="n">ax2</span><span class="o">.</span><span class="n">set_title</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Policy: </span><span class="si">{</span><span class="n">title</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="n">ax2</span><span class="o">.</span><span class="n">set_xlabel</span><span class="p">(</span><span class="s2">&quot;Player sum&quot;</span><span class="p">)</span>
<span class="n">ax2</span><span class="o">.</span><span class="n">set_ylabel</span><span class="p">(</span><span class="s2">&quot;Dealer showing&quot;</span><span class="p">)</span>
<span class="n">ax2</span><span class="o">.</span><span class="n">set_xticklabels</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">12</span><span class="p">,</span> <span class="mi">22</span><span class="p">))</span>
<span class="n">ax2</span><span class="o">.</span><span class="n">set_yticklabels</span><span class="p">([</span><span class="s2">&quot;A&quot;</span><span class="p">]</span> <span class="o">+</span> <span class="nb">list</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">11</span><span class="p">)),</span> <span class="n">fontsize</span><span class="o">=</span><span class="mi">12</span><span class="p">)</span>
<span class="c1"># add a legend</span>
<span class="n">legend_elements</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">Patch</span><span class="p">(</span><span class="n">facecolor</span><span class="o">=</span><span class="s2">&quot;lightgreen&quot;</span><span class="p">,</span> <span class="n">edgecolor</span><span class="o">=</span><span class="s2">&quot;black&quot;</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s2">&quot;Hit&quot;</span><span class="p">),</span>
<span class="n">Patch</span><span class="p">(</span><span class="n">facecolor</span><span class="o">=</span><span class="s2">&quot;grey&quot;</span><span class="p">,</span> <span class="n">edgecolor</span><span class="o">=</span><span class="s2">&quot;black&quot;</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s2">&quot;Stick&quot;</span><span class="p">),</span>
<span class="p">]</span>
<span class="n">ax2</span><span class="o">.</span><span class="n">legend</span><span class="p">(</span><span class="n">handles</span><span class="o">=</span><span class="n">legend_elements</span><span class="p">,</span> <span class="n">bbox_to_anchor</span><span class="o">=</span><span class="p">(</span><span class="mf">1.3</span><span class="p">,</span> <span class="mi">1</span><span class="p">))</span>
<span class="k">return</span> <span class="n">fig</span>
<span class="c1"># state values &amp; policy with usable ace (ace counts as 11)</span>
<span class="n">value_grid</span><span class="p">,</span> <span class="n">policy_grid</span> <span class="o">=</span> <span class="n">create_grids</span><span class="p">(</span><span class="n">agent</span><span class="p">,</span> <span class="n">usable_ace</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="n">fig1</span> <span class="o">=</span> <span class="n">create_plots</span><span class="p">(</span><span class="n">value_grid</span><span class="p">,</span> <span class="n">policy_grid</span><span class="p">,</span> <span class="n">title</span><span class="o">=</span><span class="s2">&quot;With usable ace&quot;</span><span class="p">)</span>
<span class="n">plt</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
<img alt="State-value surface and policy heatmap learned by the agent with a usable ace" src="../../../_images/blackjack_with_usable_ace.png" />
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="c1"># state values &amp; policy without usable ace (ace counts as 1)</span>
<span class="n">value_grid</span><span class="p">,</span> <span class="n">policy_grid</span> <span class="o">=</span> <span class="n">create_grids</span><span class="p">(</span><span class="n">agent</span><span class="p">,</span> <span class="n">usable_ace</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
<span class="n">fig2</span> <span class="o">=</span> <span class="n">create_plots</span><span class="p">(</span><span class="n">value_grid</span><span class="p">,</span> <span class="n">policy_grid</span><span class="p">,</span> <span class="n">title</span><span class="o">=</span><span class="s2">&quot;Without usable ace&quot;</span><span class="p">)</span>
<span class="n">plt</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
<img alt="State-value surface and policy heatmap learned by the agent without a usable ace" src="../../../_images/blackjack_without_usable_ace.png" />
<p>It's good practice to call env.close() at the end of your script,
so that any resources used by the environment are released.</p>
</section>
<section id="think-you-can-do-better">
<h2>Think you can do better?<a class="headerlink" href="#think-you-can-do-better" title="Link to this heading"></a></h2>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="c1"># You can visualize the environment using the play function</span>
<span class="c1"># and try to win a few games.</span>
</pre></div>
</div>
<p>Hopefully this tutorial helped you get a grip on how to interact with
Gymnasium environments and set you on a journey to solve many more RL
challenges.</p>
<p>It is recommended that you solve this environment by yourself (project
based learning is really effective!). You can apply your favorite
discrete RL algorithm or give Monte Carlo ES a try (covered in <a class="reference external" href="http://incompleteideas.net/book/the-book-2nd.html">Sutton &amp;
Barto</a>, section
5.3) - this way you can compare your results directly to the book.</p>
<p>Best of fun!</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-tutorials-training-agents-blackjack-tutorial-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../../_downloads/e1249c888e952c938d27855c3210a4bb/blackjack_tutorial.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">blackjack_tutorial.py</span></code></a></p>
</div>
<div class="sphx-glr-download sphx-glr-download-jupyter docutils container">
<p><a class="reference download internal" download="" href="../../../_downloads/d1980709c80836b9d7e8f9131878afbb/blackjack_tutorial.ipynb"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Jupyter</span> <span class="pre">notebook:</span> <span class="pre">blackjack_tutorial.ipynb</span></code></a></p>
</div>
</div>
</section>
</section>
</article>
</div>
<footer>
<div class="related-pages">
<a class="next-page" href="../FrozenLake_tuto/">
<div class="page-info">
<div class="context">
<span>Next</span>
</div>
<div class="title">Frozenlake benchmark</div>
</div>
<svg class="furo-related-icon">
<use href="#svg-arrow-right"></use>
</svg>
</a>
<a class="prev-page" href="../reinforce_invpend_gym_v26/">
<svg class="furo-related-icon">
<use href="#svg-arrow-right"></use>
</svg>
<div class="page-info">
<div class="context">
<span>Previous</span>
</div>
<div class="title">Training using REINFORCE for Mujoco</div>
</div>
</a>
</div>
<div class="bottom-of-page">
<div class="left-details">
<div class="copyright">
Copyright &#169; 2025 Farama Foundation
</div>
<!--
Made with <a href="https://www.sphinx-doc.org/">Sphinx</a> and <a class="muted-link" href="https://pradyunsg.me">@pradyunsg</a>'s
<a href="https://github.com/pradyunsg/furo">Furo</a>
-->
</div>
<div class="right-details">
<div class="icons">
<a class="muted-link" href="https://github.com/Farama-Foundation/Gymnasium/"
aria-label="On GitHub">
<svg stroke="currentColor" fill="currentColor" stroke-width="0" viewBox="0 0 16 16">
<path fill-rule="evenodd"
d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0 0 16 8c0-4.42-3.58-8-8-8z">
</path>
</svg>
</a>
</div>
</div>
</div>
</footer>
</div>
<aside class="toc-drawer">
<div class="toc-sticky toc-scroll">
<div class="toc-title-container">
<span class="toc-title">
On this page
</span>
</div>
<div class="toc-tree-container">
<div class="toc-tree">
<ul>
<li><a class="reference internal" href="#">Solving Blackjack with Q-Learning</a><ul>
<li><a class="reference internal" href="#imports-and-environment-setup">Imports and Environment Setup</a></li>
<li><a class="reference internal" href="#observing-the-environment">Observing the environment</a></li>
<li><a class="reference internal" href="#executing-an-action">Executing an action</a></li>
<li><a class="reference internal" href="#building-an-agent">Building an agent</a></li>
<li><a class="reference internal" href="#visualizing-the-training">Visualizing the training</a></li>
<li><a class="reference internal" href="#visualising-the-policy">Visualising the policy</a></li>
<li><a class="reference internal" href="#think-you-can-do-better">Think you can do better?</a></li>
</ul>
</li>
</ul>
</div>
</div>
</div>
</aside>
</div>
</div>
</div>
<script>
// Open or close the Farama header menu and mirror the new state in the
// ARIA attributes of the toggle button and of the menu container.
const toggleMenu = () => {
  const toggleButton = document.querySelector(".farama-header-menu__btn");
  const panel = document.querySelector(".farama-header-menu-container");
  const menu = document.querySelector(".farama-header-menu");

  // State *before* this click: an open menu is about to be closed.
  const wasOpen = menu.classList.contains("active");

  toggleButton.setAttribute("aria-expanded", wasOpen ? "false" : "true");
  panel.setAttribute("aria-hidden", wasOpen ? "true" : "false");
  menu.classList.toggle("active");
};

// Both the header button and the in-menu close control drive the same toggle.
document.querySelector(".farama-header-menu__btn").addEventListener("click", toggleMenu);
document.getElementById("farama-close-menu").addEventListener("click", toggleMenu);
</script>
<script async src="https://www.googletagmanager.com/gtag/js?id=G-6H9C8TWXZ8"></script>
<script>
// Queue the Google Analytics bootstrap commands on window.dataLayer.
const enableGtag = () => {
  if (!window.dataLayer) {
    window.dataLayer = [];
  }

  // Pushes the `arguments` object itself (not an array copy) onto the queue;
  // presumably the shape gtag.js expects - verify before changing.
  function gtag() {
    window.dataLayer.push(arguments);
  }

  gtag('js', new Date());
  gtag('config', 'G-6H9C8TWXZ8');
};
(() => {
  // Show the analytics consent banner until a choice has been stored in
  // localStorage; afterwards only a stored "true" enables analytics.
  const consentKey = "acceptedCookieAlert";
  const savedChoice = localStorage.getItem(consentKey);

  if (savedChoice) {
    if (savedChoice === "true") {
      enableGtag();
    }
    return;
  }

  const banner = document.createElement("div");
  banner.classList.add("cookie-alert");

  const bannerBody = document.createElement("div");
  bannerBody.classList.add("cookie-alert__container");

  const notice = document.createElement("p");
  notice.innerHTML = "This page uses <a href=\"https://analytics.google.com/\">\nGoogle Analytics</a> to collect statistics.";
  bannerBody.appendChild(notice);

  // Build one banner button: clicking it records the choice, dismisses the
  // banner and, when the visitor accepted, starts analytics.
  const makeChoiceButton = (label, id, accepted) => {
    const button = document.createElement("button");
    button.innerText = label;
    button.className = "farama-btn cookie-alert__button";
    button.id = id;
    button.addEventListener("click", () => {
      // localStorage stringifies the boolean, so "true"/"false" is stored.
      localStorage.setItem(consentKey, accepted);
      banner.remove();
      if (accepted) {
        enableGtag();
      }
    });
    return button;
  };

  bannerBody.appendChild(makeChoiceButton("Deny", "cookie-alert__decline", false));
  bannerBody.appendChild(makeChoiceButton("Allow", "cookie-alert__accept", true));
  banner.appendChild(bannerBody);
  document.body.appendChild(banner);
})();
</script>
<script src="../../../_static/documentation_options.js?v=25d39d6f"></script>
<script src="../../../_static/doctools.js?v=9a2dae69"></script>
<script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../../../_static/scripts/furo.js?v=7660844c"></script>
<script>
// Build a <ul> with one linked entry per project.
//   projects      - iterable of objects read for `name`, `link` and `image`
//   displayImages - when truthy, each link is prefixed with the project logo
// Returns the detached <ul> element.
const createProjectsList = (projects, displayImages) => {
  const list = document.createElement('ul');
  list.className = 'farama-header-menu-list';

  for (const entry of projects) {
    const item = document.createElement("li");
    const anchor = document.createElement("a");
    anchor.href = entry.link;
    item.appendChild(anchor);

    if (displayImages) {
      const logo = document.createElement("img");
      // Projects without their own image fall back to the Farama logo.
      logo.src = imagesBasepath + (entry.image || "/farama_black.svg");
      logo.alt = `${entry.name} logo`;
      logo.className = "farama-black-logo-invert";
      anchor.appendChild(logo);
    }

    anchor.appendChild(document.createTextNode(entry.name));
    list.appendChild(item);
  }

  return list;
};
// Create menu with Farama projects by using the API at farama.org/api/projects.json
// Open (but do not send) an asynchronous cross-origin request.
//   method - HTTP method passed to open()
//   url    - request URL passed to open()
// Returns the opened request object, or null when neither a CORS-capable
// XMLHttpRequest nor XDomainRequest is available.
const createCORSRequest = (method, url) => {
  let xhr = new XMLHttpRequest();
  if ("withCredentials" in xhr) {
    xhr.open(method, url, true);
    // Assigned after open(): some legacy engines (IE 10/11) throw
    // InvalidStateError when responseType is set on an unopened request.
    xhr.responseType = 'json';
  } else if (typeof XDomainRequest != "undefined") {
    // IE8 & IE9
    // NOTE(review): no responseType is set on this object, so callers reading
    // `response` will presumably not get parsed JSON here - confirm if this
    // path still matters.
    xhr = new XDomainRequest();
    xhr.open(method, url);
  } else {
    // CORS not supported.
    xhr = null;
  }
  return xhr;
};
// Endpoint listing the Farama projects, and the base URL of their logos.
const url = 'https://farama.org/api/projects.json';
const imagesBasepath = "https://farama.org/assets/images";
const method = 'GET';
let xhr = createCORSRequest(method, url);

if (xhr === null) {
  // createCORSRequest returns null when no CORS-capable transport exists.
  console.error("Unable to load projects");
} else {
  // Fill the header menu once the project list has been downloaded.
  xhr.onload = () => {
    const jsonResponse = xhr.response;
    // With responseType "json" a body that fails to parse yields null; bail
    // out instead of crashing in Object.keys below.
    if (jsonResponse === null || typeof jsonResponse !== "object") {
      console.error("Unable to load projects");
      return;
    }

    // Menu layout: an array is rendered as a flat list, a plain object as a
    // row of titled sub-lists.
    const sections = {
      "Core Projects": [],
      "Mature Projects": {
        "Documentation": [],
        "Repositories": [],
      },
      "Incubating Projects": {
        "Documentation": [],
        "Repositories": [],
      },
      "Foundation": [
        {
          name: "About",
          link: "https://farama.org/about"
        },
        {
          name: "Standards",
          link: "https://farama.org/project_standards",
        },
        {
          name: "Donate",
          link: "https://farama.org/donations"
        }
      ]
    };

    // Categorize projects
    Object.keys(jsonResponse).forEach(key => {
      const projectJson = jsonResponse[key];
      // Link to the project website when it has one, otherwise to its repo.
      const hasWebsite = projectJson.website !== null;
      projectJson.link = hasWebsite ? projectJson.website : projectJson.github;

      if (projectJson.type === "core") {
        sections["Core Projects"].push(projectJson);
        return;
      }
      // Every non-core, non-mature type is listed as incubating.
      const group = projectJson.type === "mature" ? "Mature Projects" : "Incubating Projects";
      const bucket = hasWebsite ? "Documentation" : "Repositories";
      sections[group][bucket].push(projectJson);
    });

    const menuContainer = document.querySelector(".farama-header-menu__body");
    Object.keys(sections).forEach(key => {
      const sectionElem = Object.assign(
        document.createElement('div'), {
          className: 'farama-header-menu__section',
        }
      );
      sectionElem.appendChild(Object.assign(document.createElement('span'),
        {
          className: 'farama-header-menu__section-title',
          innerText: key
        }
      ));

      if (!Array.isArray(sections[key])) {
        // Object of sub-sections: render each one as a titled list.
        const subSections = sections[key];
        const subSectionContainerElem = Object.assign(
          document.createElement('div'), {
            className: 'farama-header-menu__subsections-container',
            style: 'display: flex'
          }
        );
        Object.keys(subSections).forEach(subKey => {
          const subSectionElem = Object.assign(
            document.createElement('div'), {
              className: 'farama-header-menu__subsection',
            }
          );
          subSectionElem.appendChild(Object.assign(document.createElement('span'),
            {
              className: 'farama-header-menu__subsection-title',
              innerText: subKey
            }
          ));
          const ulElem = createProjectsList(subSections[subKey], key !== 'Foundation');
          subSectionElem.appendChild(ulElem);
          subSectionContainerElem.appendChild(subSectionElem);
        });
        sectionElem.appendChild(subSectionContainerElem);
      } else {
        const projects = sections[key];
        const ulElem = createProjectsList(projects, true);
        sectionElem.appendChild(ulElem);
      }
      menuContainer.appendChild(sectionElem);
    });
  };

  xhr.onerror = function() {
    console.error("Unable to load projects");
  };

  xhr.send();
}
</script>
<script>
// Repository coordinates for the versioning menu.
// NOTE(review): not read in this script; presumably consumed by the scripts
// inside the fetched versioning_menu.html - confirm before renaming.
const versioningConfig = {
  githubUser: 'Farama-Foundation',
  githubRepo: 'Gymnasium',
};

// Fetch the versioning menu markup and append it to the page.
fetch('/main/_static/versioning/versioning_menu.html').then(response => {
  if (response.status !== 200) {
    console.warn("Unable to load versioning menu", response);
    return undefined;
  }
  // Return the inner promise so a failure while reading or injecting the
  // markup reaches the catch() below instead of going unhandled.
  return response.text().then(text => {
    const container = document.createElement("div");
    container.innerHTML = text;
    document.querySelector("body").appendChild(container);
    // innerHTML doesn't evaluate scripts, so each one is re-created as a fresh
    // <script> element with the same attributes and source text.
    Array.from(container.querySelectorAll("script")).forEach(oldScript => {
      const newScript = document.createElement("script");
      Array.from(oldScript.attributes).forEach(attr => newScript.setAttribute(attr.name, attr.value));
      newScript.appendChild(document.createTextNode(oldScript.innerHTML));
      oldScript.parentNode.replaceChild(newScript, oldScript);
    });
  });
}).catch(error => {
  // fetch() rejects on network failure; report it the same way as a bad status.
  console.warn("Unable to load versioning menu", error);
});
</script>
</body>
</html>