mirror of
https://github.com/Farama-Foundation/Gymnasium.git
synced 2025-08-20 05:52:03 +00:00
1144 lines
90 KiB
HTML
1144 lines
90 KiB
HTML
<!doctype html>
|
||
<html class="no-js" lang="en" data-content_root="../../../">
|
||
<head><meta charset="utf-8"/>
|
||
<meta name="viewport" content="width=device-width,initial-scale=1"/>
|
||
<meta name="color-scheme" content="light dark">
|
||
<meta name="description" content="A standard API for reinforcement learning and a diverse set of reference environments (formerly Gym)">
|
||
<meta property="og:title" content="Gymnasium Documentation" />
|
||
<meta property="og:type" content="website" />
|
||
<meta property="og:description" content="A standard API for reinforcement learning and a diverse set of reference environments (formerly Gym)" />
|
||
<meta property="og:url" content="https://gymnasium.farama.org/tutorials/training_agents/blackjack_tutorial.html" /><meta property="og:image" content="https://gymnasium.farama.org/_static/img/gymnasium-github.png" /><meta name="twitter:card" content="summary_large_image">
|
||
<link rel="index" title="Index" href="../../../genindex/" /><link rel="search" title="Search" href="../../../search/" /><link rel="next" title="Frozenlake benchmark" href="../FrozenLake_tuto/" /><link rel="prev" title="Training using REINFORCE for Mujoco" href="../reinforce_invpend_gym_v26/" />
|
||
<link rel="canonical" href="https://gymnasium.farama.org/tutorials/training_agents/blackjack_tutorial.html" />
|
||
|
||
<link rel="shortcut icon" href="../../../_static/favicon.png"/><!-- Generated with Sphinx 7.4.7 and Furo 2023.08.19.dev1 -->
|
||
<title>Solving Blackjack with Q-Learning - Gymnasium Documentation</title>
|
||
<link rel="stylesheet" type="text/css" href="../../../_static/pygments.css?v=8f2a1f02" />
|
||
<link rel="stylesheet" type="text/css" href="../../../_static/styles/furo.css?v=3e7f4c72" />
|
||
<link rel="stylesheet" type="text/css" href="../../../_static/sg_gallery.css?v=61a4c737" />
|
||
<link rel="stylesheet" type="text/css" href="../../../_static/sg_gallery-binder.css?v=f4aeca0c" />
|
||
<link rel="stylesheet" type="text/css" href="../../../_static/sg_gallery-dataframe.css?v=2082cf3c" />
|
||
<link rel="stylesheet" type="text/css" href="../../../_static/sg_gallery-rendered-html.css?v=1277b6f3" />
|
||
<link rel="stylesheet" type="text/css" href="../../../_static/styles/furo-extensions.css?v=82c8b628" />
|
||
|
||
|
||
|
||
|
||
<style>
|
||
body {
|
||
--color-code-background: #f8f8f8;
|
||
--color-code-foreground: black;
|
||
|
||
}
|
||
@media not print {
|
||
body[data-theme="dark"] {
|
||
--color-code-background: #202020;
|
||
--color-code-foreground: #d0d0d0;
|
||
|
||
}
|
||
@media (prefers-color-scheme: dark) {
|
||
body:not([data-theme="light"]) {
|
||
--color-code-background: #202020;
|
||
--color-code-foreground: #d0d0d0;
|
||
|
||
}
|
||
}
|
||
}
|
||
</style></head>
|
||
<body>
|
||
<header class="farama-header" aria-label="Farama header">
|
||
<div class="farama-header__container">
|
||
<div class="farama-header__left--mobile">
|
||
<label class="nav-overlay-icon" for="__navigation">
|
||
<div class="visually-hidden">Toggle site navigation sidebar</div>
|
||
<svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg">
|
||
<defs></defs>
|
||
<line x1="0.5" y1="4" x2="23.5" y2="4"></line>
|
||
<line x1="0.232" y1="12" x2="23.5" y2="12"></line>
|
||
<line x1="0.232" y1="20" x2="23.5" y2="20"></line>
|
||
</svg>
|
||
</label>
|
||
</div>
|
||
<div class="farama-header__left farama-header__center--mobile">
|
||
<a href="../../../">
|
||
<img class="farama-header__logo only-light" src="../../../_static/img/gymnasium_black.svg" alt="Light Logo"/>
|
||
<img class="farama-header__logo only-dark" src="../../../_static/img/gymnasium_white.svg" alt="Dark Logo"/>
|
||
<span class="farama-header__title">Gymnasium Documentation</span>
|
||
</a>
|
||
</div>
|
||
<div class="farama-header__right">
|
||
<div class="farama-header-menu">
|
||
<button class="farama-header-menu__btn" aria-label="Open Farama Menu" aria-expanded="false" aria-haspopup="true" aria-controls="farama-menu">
|
||
<img class="farama-black-logo-invert" src="../../../_static/img/farama-logo-header.svg" alt="">
|
||
<svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg">
|
||
<polyline style="stroke-linecap: round; stroke-linejoin: round; fill: none; stroke-width: 2px;" points="1 7 12 18 23 7"></polyline>
|
||
</svg>
|
||
</button>
|
||
<div class="farama-header-menu-container farama-hidden" aria-hidden="true" id="farama-menu">
|
||
<div class="farama-header-menu__header">
|
||
<a href="https://farama.org">
|
||
<img class="farama-header-menu__logo farama-white-logo-invert" src="../../../_static/img/farama_solid_white.svg" alt="Farama Foundation logo">
|
||
<span>Farama Foundation</span>
|
||
</a>
|
||
<div class="farama-header-menu-header__right">
|
||
<button id="farama-close-menu">
|
||
<svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg" fill="none" stroke="currentColor"
|
||
stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="icon-close">
|
||
<line x1="3" y1="21" x2="21" y2="3"></line>
|
||
<line x1="3" y1="3" x2="21" y2="21"></line>
|
||
</svg>
|
||
</button>
|
||
</div>
|
||
</div>
|
||
<div class="farama-header-menu__body">
|
||
<!-- Response from farama.org/api/projects.json -->
|
||
</div>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
</header>
|
||
|
||
|
||
<script>
|
||
document.body.dataset.theme = localStorage.getItem("theme") || "auto";
|
||
</script>
|
||
|
||
|
||
<svg xmlns="http://www.w3.org/2000/svg" style="display: none;">
|
||
<symbol id="svg-toc" viewBox="0 0 24 24">
|
||
<title>Contents</title>
|
||
<svg stroke="currentColor" fill="currentColor" stroke-width="0" viewBox="0 0 1024 1024">
|
||
<path d="M408 442h480c4.4 0 8-3.6 8-8v-56c0-4.4-3.6-8-8-8H408c-4.4 0-8 3.6-8 8v56c0 4.4 3.6 8 8 8zm-8 204c0 4.4 3.6 8 8 8h480c4.4 0 8-3.6 8-8v-56c0-4.4-3.6-8-8-8H408c-4.4 0-8 3.6-8 8v56zm504-486H120c-4.4 0-8 3.6-8 8v56c0 4.4 3.6 8 8 8h784c4.4 0 8-3.6 8-8v-56c0-4.4-3.6-8-8-8zm0 632H120c-4.4 0-8 3.6-8 8v56c0 4.4 3.6 8 8 8h784c4.4 0 8-3.6 8-8v-56c0-4.4-3.6-8-8-8zM115.4 518.9L271.7 642c5.8 4.6 14.4.5 14.4-6.9V388.9c0-7.4-8.5-11.5-14.4-6.9L115.4 505.1a8.74 8.74 0 0 0 0 13.8z"/>
|
||
</svg>
|
||
</symbol>
|
||
<symbol id="svg-menu" viewBox="0 0 24 24">
|
||
<title>Menu</title>
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
|
||
stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather-menu">
|
||
<line x1="3" y1="12" x2="21" y2="12"></line>
|
||
<line x1="3" y1="6" x2="21" y2="6"></line>
|
||
<line x1="3" y1="18" x2="21" y2="18"></line>
|
||
</svg>
|
||
</symbol>
|
||
<symbol id="svg-arrow-right" viewBox="0 0 24 24">
|
||
<title>Expand</title>
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
|
||
stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather-chevron-right">
|
||
<polyline points="9 18 15 12 9 6"></polyline>
|
||
</svg>
|
||
</symbol>
|
||
<symbol id="svg-sun" viewBox="0 0 24 24">
|
||
<title>Light mode</title>
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
|
||
stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round" class="feather-sun">
|
||
<circle cx="12" cy="12" r="5"></circle>
|
||
<line x1="12" y1="1" x2="12" y2="3"></line>
|
||
<line x1="12" y1="21" x2="12" y2="23"></line>
|
||
<line x1="4.22" y1="4.22" x2="5.64" y2="5.64"></line>
|
||
<line x1="18.36" y1="18.36" x2="19.78" y2="19.78"></line>
|
||
<line x1="1" y1="12" x2="3" y2="12"></line>
|
||
<line x1="21" y1="12" x2="23" y2="12"></line>
|
||
<line x1="4.22" y1="19.78" x2="5.64" y2="18.36"></line>
|
||
<line x1="18.36" y1="5.64" x2="19.78" y2="4.22"></line>
|
||
</svg>
|
||
</symbol>
|
||
<symbol id="svg-moon" viewBox="0 0 24 24">
|
||
<title>Dark mode</title>
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
|
||
stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round" class="icon-tabler-moon">
|
||
<path stroke="none" d="M0 0h24v24H0z" fill="none" />
|
||
<path d="M12 3c.132 0 .263 0 .393 0a7.5 7.5 0 0 0 7.92 12.446a9 9 0 1 1 -8.313 -12.454z" />
|
||
</svg>
|
||
</symbol>
|
||
<symbol id="svg-sun-half" viewBox="0 0 24 24">
|
||
<title>Auto light/dark mode</title>
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
|
||
stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round" class="icon-tabler-shadow">
|
||
<path stroke="none" d="M0 0h24v24H0z" fill="none"/>
|
||
<circle cx="12" cy="12" r="9" />
|
||
<path d="M13 12h5" />
|
||
<path d="M13 15h4" />
|
||
<path d="M13 18h1" />
|
||
<path d="M13 9h4" />
|
||
<path d="M13 6h1" />
|
||
</svg>
|
||
</symbol>
|
||
</svg>
|
||
|
||
<input type="checkbox" class="sidebar-toggle" name="__navigation" id="__navigation">
|
||
<input type="checkbox" class="sidebar-toggle" name="__toc" id="__toc">
|
||
<label class="overlay sidebar-overlay" for="__navigation">
|
||
<div class="visually-hidden">Hide navigation sidebar</div>
|
||
</label>
|
||
<label class="overlay toc-overlay" for="__toc">
|
||
<div class="visually-hidden">Hide table of contents sidebar</div>
|
||
</label>
|
||
|
||
<div class="page">
|
||
<!--<header class="mobile-header">
|
||
<div class="header-left">
|
||
<label class="nav-overlay-icon" for="__navigation">
|
||
<div class="visually-hidden">Toggle site navigation sidebar</div>
|
||
<i class="icon"><svg><use href="#svg-menu"></use></svg></i>
|
||
</label>
|
||
</div>
|
||
<div class="header-center">
|
||
<a href="../../../"><div class="brand">Gymnasium Documentation</div></a>
|
||
</div>
|
||
<div class="header-right">
|
||
<div class="theme-toggle-container theme-toggle-header">
|
||
<button class="theme-toggle">
|
||
<div class="visually-hidden">Toggle Light / Dark / Auto color theme</div>
|
||
<svg class="theme-icon-when-auto"><use href="#svg-sun-half"></use></svg>
|
||
<svg class="theme-icon-when-dark"><use href="#svg-moon"></use></svg>
|
||
<svg class="theme-icon-when-light"><use href="#svg-sun"></use></svg>
|
||
</button>
|
||
</div>
|
||
<label class="toc-overlay-icon toc-header-icon" for="__toc">
|
||
<div class="visually-hidden">Toggle table of contents sidebar</div>
|
||
<i class="icon"><svg><use href="#svg-toc"></use></svg></i>
|
||
</label>
|
||
</div>
|
||
</header>-->
|
||
<aside class="sidebar-drawer">
|
||
<div class="sidebar-container">
|
||
|
||
<div class="sidebar-sticky"><a class="farama-sidebar__title" href="../../../">
|
||
<img class="farama-header__logo only-light" src="../../../_static/img/gymnasium_black.svg" alt="Light Logo"/>
|
||
<img class="farama-header__logo only-dark" src="../../../_static/img/gymnasium_white.svg" alt="Dark Logo"/>
|
||
<span class="farama-header__title">Gymnasium Documentation</span>
|
||
</a><form class="sidebar-search-container" method="get" action="../../../search/" role="search">
|
||
<input class="sidebar-search" placeholder="Search" name="q" aria-label="Search">
|
||
<input type="hidden" name="check_keywords" value="yes">
|
||
<input type="hidden" name="area" value="default">
|
||
</form>
|
||
<div id="searchbox"></div><div class="sidebar-scroll"><div class="sidebar-tree">
|
||
<p class="caption" role="heading"><span class="caption-text">Introduction</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference internal" href="../../../introduction/basic_usage/">Basic Usage</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../../../introduction/train_agent/">Training an Agent</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../../../introduction/create_custom_env/">Create a Custom Environment</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../../../introduction/record_agent/">Recording Agents</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../../../introduction/speed_up_env/">Speeding Up Training</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../../../introduction/gym_compatibility/">Compatibility with Gym</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../../../introduction/migration_guide/">Migration Guide - v0.21 to v1.0.0</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">API</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference internal" href="../../../api/env/">Env</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../../../api/registry/">Make and register</a></li>
|
||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../api/spaces/">Spaces</a><input class="toctree-checkbox" id="toctree-checkbox-1" name="toctree-checkbox-1" role="switch" type="checkbox"/><label for="toctree-checkbox-1"><div class="visually-hidden">Toggle navigation of Spaces</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../api/spaces/fundamental/">Fundamental Spaces</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../api/spaces/composite/">Composite Spaces</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../api/spaces/utils/">Spaces Utils</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../api/wrappers/">Wrappers</a><input class="toctree-checkbox" id="toctree-checkbox-2" name="toctree-checkbox-2" role="switch" type="checkbox"/><label for="toctree-checkbox-2"><div class="visually-hidden">Toggle navigation of Wrappers</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../api/wrappers/table/">List of Wrappers</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../api/wrappers/misc_wrappers/">Misc Wrappers</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../api/wrappers/action_wrappers/">Action Wrappers</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../api/wrappers/observation_wrappers/">Observation Wrappers</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../api/wrappers/reward_wrappers/">Reward Wrappers</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../api/vector/">Vectorize</a><input class="toctree-checkbox" id="toctree-checkbox-3" name="toctree-checkbox-3" role="switch" type="checkbox"/><label for="toctree-checkbox-3"><div class="visually-hidden">Toggle navigation of Vectorize</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../api/vector/wrappers/">Wrappers</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../api/vector/async_vector_env/">AsyncVectorEnv</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../api/vector/sync_vector_env/">SyncVectorEnv</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../api/vector/utils/">Utility functions</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../../../api/utils/">Utility functions</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../../../api/functional/">Functional Env</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">Environments</span></p>
|
||
<ul>
|
||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../environments/classic_control/">Classic Control</a><input class="toctree-checkbox" id="toctree-checkbox-4" name="toctree-checkbox-4" role="switch" type="checkbox"/><label for="toctree-checkbox-4"><div class="visually-hidden">Toggle navigation of Classic Control</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../environments/classic_control/acrobot/">Acrobot</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../environments/classic_control/cart_pole/">Cart Pole</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../environments/classic_control/mountain_car_continuous/">Mountain Car Continuous</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../environments/classic_control/mountain_car/">Mountain Car</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../environments/classic_control/pendulum/">Pendulum</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../environments/box2d/">Box2D</a><input class="toctree-checkbox" id="toctree-checkbox-5" name="toctree-checkbox-5" role="switch" type="checkbox"/><label for="toctree-checkbox-5"><div class="visually-hidden">Toggle navigation of Box2D</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../environments/box2d/bipedal_walker/">Bipedal Walker</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../environments/box2d/car_racing/">Car Racing</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../environments/box2d/lunar_lander/">Lunar Lander</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../environments/toy_text/">Toy Text</a><input class="toctree-checkbox" id="toctree-checkbox-6" name="toctree-checkbox-6" role="switch" type="checkbox"/><label for="toctree-checkbox-6"><div class="visually-hidden">Toggle navigation of Toy Text</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../environments/toy_text/blackjack/">Blackjack</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../environments/toy_text/taxi/">Taxi</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../environments/toy_text/cliff_walking/">Cliff Walking</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../environments/toy_text/frozen_lake/">Frozen Lake</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../environments/mujoco/">MuJoCo</a><input class="toctree-checkbox" id="toctree-checkbox-7" name="toctree-checkbox-7" role="switch" type="checkbox"/><label for="toctree-checkbox-7"><div class="visually-hidden">Toggle navigation of MuJoCo</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../environments/mujoco/ant/">Ant</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../environments/mujoco/half_cheetah/">Half Cheetah</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../environments/mujoco/hopper/">Hopper</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../environments/mujoco/humanoid/">Humanoid</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../environments/mujoco/humanoid_standup/">Humanoid Standup</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../environments/mujoco/inverted_double_pendulum/">Inverted Double Pendulum</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../environments/mujoco/inverted_pendulum/">Inverted Pendulum</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../environments/mujoco/pusher/">Pusher</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../environments/mujoco/reacher/">Reacher</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../environments/mujoco/swimmer/">Swimmer</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../environments/mujoco/walker2d/">Walker2D</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../../../environments/atari/">Atari</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../../../environments/third_party_environments/">External Environments</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">Tutorials</span></p>
|
||
<ul class="current">
|
||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../gymnasium_basics/">Gymnasium Basics Documentation Links</a><input class="toctree-checkbox" id="toctree-checkbox-8" name="toctree-checkbox-8" role="switch" type="checkbox"/><label for="toctree-checkbox-8"><div class="visually-hidden">Toggle navigation of Gymnasium Basics Documentation Links</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../gymnasium_basics/load_quadruped_model/">Load custom quadruped robot environments</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../gymnasium_basics/handling_time_limits/">Handling Time Limits</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../gymnasium_basics/implementing_custom_wrappers/">Implementing Custom Wrappers</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../gymnasium_basics/environment_creation/">Make your own custom environment</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../gymnasium_basics/vector_envs_tutorial/">Training A2C with Vector Envs and Domain Randomization</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toctree-l1 current has-children"><a class="reference internal" href="../">Training Agents links in the Gymnasium Documentation</a><input checked="" class="toctree-checkbox" id="toctree-checkbox-9" name="toctree-checkbox-9" role="switch" type="checkbox"/><label for="toctree-checkbox-9"><div class="visually-hidden">Toggle navigation of Training Agents links in the Gymnasium Documentation</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul class="current">
|
||
<li class="toctree-l2"><a class="reference internal" href="../reinforce_invpend_gym_v26/">Training using REINFORCE for Mujoco</a></li>
|
||
<li class="toctree-l2 current current-page"><a class="current reference internal" href="#">Solving Blackjack with Q-Learning</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../FrozenLake_tuto/">Frozenlake benchmark</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../../third-party-tutorials/">Third-Party Tutorials</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">Development</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference external" href="https://github.com/Farama-Foundation/Gymnasium">Github</a></li>
|
||
<li class="toctree-l1"><a class="reference external" href="https://arxiv.org/abs/2407.17032">Paper</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../../../gymnasium_release_notes/">Gymnasium Release Notes</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../../../gym_release_notes/">Gym Release Notes</a></li>
|
||
<li class="toctree-l1"><a class="reference external" href="https://github.com/Farama-Foundation/Gymnasium/blob/main/docs/README.md">Contribute to the Docs</a></li>
|
||
</ul>
|
||
|
||
</div>
|
||
</div>
|
||
|
||
</div>
|
||
|
||
</div>
|
||
</aside>
|
||
<div class="main-container">
|
||
|
||
|
||
|
||
|
||
|
||
<div class="main">
|
||
<div class="content">
|
||
<div class="article-container">
|
||
<a href="#" class="back-to-top muted-link">
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24">
|
||
<path d="M13 20h-2V8l-5.5 5.5-1.42-1.42L12 4.16l7.92 7.92-1.42 1.42L13 8v12z"></path>
|
||
</svg>
|
||
<span>Back to top</span>
|
||
</a>
|
||
<div class="content-icon-container">
|
||
<div class="edit-this-page">
|
||
<a class="muted-link" href="https://github.com/Farama-Foundation/Gymnasium/edit/main/docs/tutorials/training_agents/blackjack_tutorial.py" title="Edit this page">
|
||
<svg aria-hidden="true" viewBox="0 0 24 24" stroke-width="1.5" stroke="currentColor" fill="none" stroke-linecap="round" stroke-linejoin="round">
|
||
<path stroke="none" d="M0 0h24v24H0z" fill="none"/>
|
||
<path d="M4 20h4l10.5 -10.5a1.5 1.5 0 0 0 -4 -4l-10.5 10.5v4" />
|
||
<line x1="13.5" y1="6.5" x2="17.5" y2="10.5" />
|
||
</svg>
|
||
<span class="visually-hidden">Edit this page</span>
|
||
</a>
|
||
</div><div class="theme-toggle-container theme-toggle-content">
|
||
<button class="theme-toggle" title="Toggle color theme">
|
||
<div class="visually-hidden">Toggle Light / Dark / Auto color theme</div>
|
||
<svg class="theme-icon-when-auto">
|
||
<use href="#svg-sun-half"></use>
|
||
</svg>
|
||
<svg class="theme-icon-when-dark">
|
||
<use href="#svg-moon"></use>
|
||
</svg>
|
||
<svg class="theme-icon-when-light">
|
||
<use href="#svg-sun"></use>
|
||
</svg>
|
||
</button>
|
||
</div>
|
||
<label class="toc-overlay-icon toc-content-icon" for="__toc">
|
||
<div class="visually-hidden">Toggle table of contents sidebar</div>
|
||
<i class="icon"><svg>
|
||
<use href="#svg-toc"></use>
|
||
</svg></i>
|
||
</label>
|
||
</div>
|
||
<article role="main">
|
||
|
||
<section class="sphx-glr-example-title" id="solving-blackjack-with-q-learning">
|
||
<span id="sphx-glr-tutorials-training-agents-blackjack-tutorial-py"></span><h1>Solving Blackjack with Q-Learning<a class="headerlink" href="#solving-blackjack-with-q-learning" title="Link to this heading">¶</a></h1>
|
||
<a class="only-light reference internal image-reference" href="../../../_images/blackjack_AE_loop.jpg"><img alt="agent-environment-diagram" class="only-light" src="../../../_images/blackjack_AE_loop.jpg" style="width: 650px;" />
|
||
</a>
|
||
<a class="only-dark reference internal image-reference" href="../../../_images/blackjack_AE_loop_dark.png"><img alt="agent-environment-diagram" class="only-dark" src="../../../_images/blackjack_AE_loop_dark.png" style="width: 650px;" />
|
||
</a>
|
||
<p>In this tutorial, we’ll explore and solve the <em>Blackjack-v1</em>
|
||
environment.</p>
|
||
<p><strong>Blackjack</strong> is one of the most popular casino card games that is also
|
||
infamous for being beatable under certain conditions. This version of
|
||
the game uses an infinite deck (we draw the cards with replacement), so
|
||
counting cards won’t be a viable strategy in our simulated game.
|
||
Full documentation can be found at <a class="reference external" href="https://gymnasium.farama.org/environments/toy_text/blackjack">https://gymnasium.farama.org/environments/toy_text/blackjack</a></p>
|
||
<p><strong>Objective</strong>: To win, your card sum should be greater than the
|
||
dealer’s without exceeding 21.</p>
|
||
<dl class="simple">
|
||
<dt><strong>Actions</strong>: Agents can pick between two actions:</dt><dd><ul class="simple">
|
||
<li><p>stand (0): the player takes no more cards</p></li>
|
||
<li><p>hit (1): the player will be given another card; however, the player could go over 21 and bust</p></li>
|
||
</ul>
|
||
</dd>
|
||
</dl>
|
||
<p><strong>Approach</strong>: To solve this environment by yourself, you can pick your
|
||
favorite discrete RL algorithm. The presented solution uses <em>Q-learning</em>
|
||
(a model-free RL algorithm).</p>
|
||
<section id="imports-and-environment-setup">
|
||
<h2>Imports and Environment Setup<a class="headerlink" href="#imports-and-environment-setup" title="Link to this heading">¶</a></h2>
|
||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="c1"># Author: Till Zemann</span>
|
||
<span class="c1"># License: MIT License</span>
|
||
|
||
<span class="kn">from</span><span class="w"> </span><span class="nn">__future__</span><span class="w"> </span><span class="kn">import</span> <span class="n">annotations</span>
|
||
|
||
<span class="kn">from</span><span class="w"> </span><span class="nn">collections</span><span class="w"> </span><span class="kn">import</span> <span class="n">defaultdict</span>
|
||
|
||
<span class="kn">import</span><span class="w"> </span><span class="nn">matplotlib.pyplot</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">plt</span>
|
||
<span class="kn">import</span><span class="w"> </span><span class="nn">numpy</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">np</span>
|
||
<span class="kn">import</span><span class="w"> </span><span class="nn">seaborn</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">sns</span>
|
||
<span class="kn">from</span><span class="w"> </span><span class="nn">matplotlib.patches</span><span class="w"> </span><span class="kn">import</span> <span class="n">Patch</span>
|
||
<span class="kn">from</span><span class="w"> </span><span class="nn">tqdm</span><span class="w"> </span><span class="kn">import</span> <span class="n">tqdm</span>
|
||
|
||
<span class="kn">import</span><span class="w"> </span><span class="nn">gymnasium</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">gym</span>
|
||
|
||
|
||
<span class="c1"># Let's start by creating the blackjack environment.</span>
|
||
<span class="c1"># Note: We are going to follow the rules from Sutton &amp; Barto.</span>
|
||
<span class="c1"># Other versions of the game can be found below for you to experiment.</span>
|
||
|
||
<span class="n">env</span> <span class="o">=</span> <span class="n">gym</span><span class="o">.</span><span class="n">make</span><span class="p">(</span><span class="s2">"Blackjack-v1"</span><span class="p">,</span> <span class="n">sab</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
|
||
</pre></div>
|
||
</div>
|
||
<div class="highlight-py notranslate"><div class="highlight"><pre><span></span><span class="c1"># Other possible environment configurations are:</span>
|
||
|
||
<span class="n">env</span> <span class="o">=</span> <span class="n">gym</span><span class="o">.</span><span class="n">make</span><span class="p">(</span><span class="s1">'Blackjack-v1'</span><span class="p">,</span> <span class="n">natural</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">sab</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
|
||
<span class="c1"># Whether to give an additional reward for starting with a natural blackjack, i.e. starting with an ace and ten (sum is 21).</span>
|
||
|
||
<span class="n">env</span> <span class="o">=</span> <span class="n">gym</span><span class="o">.</span><span class="n">make</span><span class="p">(</span><span class="s1">'Blackjack-v1'</span><span class="p">,</span> <span class="n">natural</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">sab</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
|
||
<span class="c1"># Whether to follow the exact rules outlined in the book by Sutton and Barto. If `sab` is `True`, the keyword argument `natural` will be ignored.</span>
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
<section id="observing-the-environment">
|
||
<h2>Observing the environment<a class="headerlink" href="#observing-the-environment" title="Link to this heading">¶</a></h2>
|
||
<p>First of all, we call <code class="docutils literal notranslate"><span class="pre">env.reset()</span></code> to start an episode. This function
|
||
resets the environment to a starting position and returns an initial
|
||
<code class="docutils literal notranslate"><span class="pre">observation</span></code>. We usually also set <code class="docutils literal notranslate"><span class="pre">done</span> <span class="pre">=</span> <span class="pre">False</span></code>. This variable
|
||
will be useful later to check if a game is terminated (i.e., the player wins or loses).</p>
|
||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="c1"># reset the environment to get the first observation</span>
|
||
<span class="n">done</span> <span class="o">=</span> <span class="kc">False</span>
|
||
<span class="n">observation</span><span class="p">,</span> <span class="n">info</span> <span class="o">=</span> <span class="n">env</span><span class="o">.</span><span class="n">reset</span><span class="p">()</span>
|
||
|
||
<span class="c1"># observation = (16, 9, False)</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>Note that our observation is a 3-tuple consisting of 3 values:</p>
|
||
<ul class="simple">
|
||
<li><p>The player’s current sum</p></li>
|
||
<li><p>Value of the dealer’s face-up card</p></li>
|
||
<li><p>Boolean indicating whether the player holds a usable ace (an ace is usable if it
|
||
counts as 11 without busting)</p></li>
|
||
</ul>
|
||
</section>
|
||
<section id="executing-an-action">
|
||
<h2>Executing an action<a class="headerlink" href="#executing-an-action" title="Link to this heading">¶</a></h2>
|
||
<p>After receiving our first observation, we are only going to use the
|
||
<code class="docutils literal notranslate"><span class="pre">env.step(action)</span></code> function to interact with the environment. This
|
||
function takes an action as input and executes it in the environment.
|
||
Because that action changes the state of the environment, it returns
|
||
five useful variables to us. These are:</p>
|
||
<ul class="simple">
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">next_state</span></code>: This is the observation that the agent will receive
|
||
after taking the action.</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">reward</span></code>: This is the reward that the agent will receive after
|
||
taking the action.</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">terminated</span></code>: This is a boolean variable that indicates whether or
|
||
not the environment has terminated.</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">truncated</span></code>: This is a boolean variable that also indicates whether
|
||
the episode ended by early truncation, i.e., a time limit is reached.</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">info</span></code>: This is a dictionary that might contain additional
|
||
information about the environment.</p></li>
|
||
</ul>
|
||
<p>The <code class="docutils literal notranslate"><span class="pre">next_state</span></code>, <code class="docutils literal notranslate"><span class="pre">reward</span></code>, <code class="docutils literal notranslate"><span class="pre">terminated</span></code> and <code class="docutils literal notranslate"><span class="pre">truncated</span></code> variables are
|
||
self-explanatory, but the <code class="docutils literal notranslate"><span class="pre">info</span></code> variable requires some additional
|
||
explanation. This variable contains a dictionary that might have some
|
||
extra information about the environment, but in the Blackjack-v1
|
||
environment you can ignore it. For example, in Atari environments the
|
||
info dictionary has an <code class="docutils literal notranslate"><span class="pre">ale.lives</span></code> key that tells us how many lives the
|
||
agent has left. If the agent has 0 lives, then the episode is over.</p>
|
||
<p>Note that it is not a good idea to call <code class="docutils literal notranslate"><span class="pre">env.render()</span></code> in your training
|
||
loop because rendering slows down training by a lot. Rather try to build
|
||
an extra loop to evaluate and showcase the agent after training.</p>
|
||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="c1"># sample a random action from all valid actions</span>
|
||
<span class="n">action</span> <span class="o">=</span> <span class="n">env</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">sample</span><span class="p">()</span>
|
||
<span class="c1"># action=1</span>
|
||
|
||
<span class="c1"># execute the action in our environment and receive infos from the environment</span>
|
||
<span class="n">observation</span><span class="p">,</span> <span class="n">reward</span><span class="p">,</span> <span class="n">terminated</span><span class="p">,</span> <span class="n">truncated</span><span class="p">,</span> <span class="n">info</span> <span class="o">=</span> <span class="n">env</span><span class="o">.</span><span class="n">step</span><span class="p">(</span><span class="n">action</span><span class="p">)</span>
|
||
|
||
<span class="c1"># observation=(24, 10, False)</span>
|
||
<span class="c1"># reward=-1.0</span>
|
||
<span class="c1"># terminated=True</span>
|
||
<span class="c1"># truncated=False</span>
|
||
<span class="c1"># info={}</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>Once <code class="docutils literal notranslate"><span class="pre">terminated</span> <span class="pre">=</span> <span class="pre">True</span></code> or <code class="docutils literal notranslate"><span class="pre">truncated=True</span></code>, we should stop the
|
||
current episode and begin a new one with <code class="docutils literal notranslate"><span class="pre">env.reset()</span></code>. If you
|
||
continue executing actions without resetting the environment, it still
|
||
responds but the output won’t be useful for training (it might even be
|
||
harmful if the agent learns on invalid data).</p>
|
||
</section>
|
||
<section id="building-an-agent">
|
||
<h2>Building an agent<a class="headerlink" href="#building-an-agent" title="Link to this heading">¶</a></h2>
|
||
<p>Let’s build a <code class="docutils literal notranslate"><span class="pre">Q-learning</span> <span class="pre">agent</span></code> to solve <em>Blackjack-v1</em>! We’ll need
|
||
some functions for picking an action and updating the agent’s action
|
||
values. To ensure that the agent explores the environment, one possible
|
||
solution is the <code class="docutils literal notranslate"><span class="pre">epsilon-greedy</span></code> strategy, where we pick a random
|
||
action with probability <code class="docutils literal notranslate"><span class="pre">epsilon</span></code> and the greedy action (currently
|
||
valued as the best) with probability <code class="docutils literal notranslate"><span class="pre">1</span> <span class="pre">-</span> <span class="pre">epsilon</span></code>.</p>
|
||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="nc">BlackjackAgent</span><span class="p">:</span>
|
||
<span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span>
|
||
<span class="bp">self</span><span class="p">,</span>
|
||
<span class="n">env</span><span class="p">,</span>
|
||
<span class="n">learning_rate</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span>
|
||
<span class="n">initial_epsilon</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span>
|
||
<span class="n">epsilon_decay</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span>
|
||
<span class="n">final_epsilon</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span>
|
||
<span class="n">discount_factor</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.95</span><span class="p">,</span>
|
||
<span class="p">):</span>
|
||
<span class="w"> </span><span class="sd">"""Initialize a Reinforcement Learning agent with an empty dictionary</span>
|
||
<span class="sd"> of state-action values (q_values), a learning rate and an epsilon.</span>
|
||
|
||
<span class="sd"> Args:</span>
|
||
<span class="sd"> learning_rate: The learning rate</span>
|
||
<span class="sd"> initial_epsilon: The initial epsilon value</span>
|
||
<span class="sd"> epsilon_decay: The decay for epsilon</span>
|
||
<span class="sd"> final_epsilon: The final epsilon value</span>
|
||
<span class="sd"> discount_factor: The discount factor for computing the Q-value</span>
|
||
<span class="sd"> """</span>
|
||
<span class="bp">self</span><span class="o">.</span><span class="n">q_values</span> <span class="o">=</span> <span class="n">defaultdict</span><span class="p">(</span><span class="k">lambda</span><span class="p">:</span> <span class="n">np</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="n">env</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">n</span><span class="p">))</span>
|
||
|
||
<span class="bp">self</span><span class="o">.</span><span class="n">lr</span> <span class="o">=</span> <span class="n">learning_rate</span>
|
||
<span class="bp">self</span><span class="o">.</span><span class="n">discount_factor</span> <span class="o">=</span> <span class="n">discount_factor</span>
|
||
|
||
<span class="bp">self</span><span class="o">.</span><span class="n">epsilon</span> <span class="o">=</span> <span class="n">initial_epsilon</span>
|
||
<span class="bp">self</span><span class="o">.</span><span class="n">epsilon_decay</span> <span class="o">=</span> <span class="n">epsilon_decay</span>
|
||
<span class="bp">self</span><span class="o">.</span><span class="n">final_epsilon</span> <span class="o">=</span> <span class="n">final_epsilon</span>
|
||
|
||
<span class="bp">self</span><span class="o">.</span><span class="n">training_error</span> <span class="o">=</span> <span class="p">[]</span>
|
||
|
||
<span class="k">def</span><span class="w"> </span><span class="nf">get_action</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">env</span><span class="p">,</span> <span class="n">obs</span><span class="p">:</span> <span class="nb">tuple</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">int</span><span class="p">,</span> <span class="nb">bool</span><span class="p">])</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span>
|
||
<span class="w"> </span><span class="sd">"""</span>
|
||
<span class="sd"> Returns the best action with probability (1 - epsilon)</span>
|
||
<span class="sd"> otherwise a random action with probability epsilon to ensure exploration.</span>
|
||
<span class="sd"> """</span>
|
||
<span class="c1"># with probability epsilon return a random action to explore the environment</span>
|
||
<span class="k">if</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">random</span><span class="p">()</span> <span class="o"><</span> <span class="bp">self</span><span class="o">.</span><span class="n">epsilon</span><span class="p">:</span>
|
||
<span class="k">return</span> <span class="n">env</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">sample</span><span class="p">()</span>
|
||
|
||
<span class="c1"># with probability (1 - epsilon) act greedily (exploit)</span>
|
||
<span class="k">else</span><span class="p">:</span>
|
||
<span class="k">return</span> <span class="nb">int</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">argmax</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">q_values</span><span class="p">[</span><span class="n">obs</span><span class="p">]))</span>
|
||
|
||
<span class="k">def</span><span class="w"> </span><span class="nf">update</span><span class="p">(</span>
|
||
<span class="bp">self</span><span class="p">,</span>
|
||
<span class="n">obs</span><span class="p">:</span> <span class="nb">tuple</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">int</span><span class="p">,</span> <span class="nb">bool</span><span class="p">],</span>
|
||
<span class="n">action</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span>
|
||
<span class="n">reward</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span>
|
||
<span class="n">terminated</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span>
|
||
<span class="n">next_obs</span><span class="p">:</span> <span class="nb">tuple</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">int</span><span class="p">,</span> <span class="nb">bool</span><span class="p">],</span>
|
||
<span class="p">):</span>
|
||
<span class="w"> </span><span class="sd">"""Updates the Q-value of an action."""</span>
|
||
<span class="n">future_q_value</span> <span class="o">=</span> <span class="p">(</span><span class="ow">not</span> <span class="n">terminated</span><span class="p">)</span> <span class="o">*</span> <span class="n">np</span><span class="o">.</span><span class="n">max</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">q_values</span><span class="p">[</span><span class="n">next_obs</span><span class="p">])</span>
|
||
<span class="n">temporal_difference</span> <span class="o">=</span> <span class="p">(</span>
|
||
<span class="n">reward</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">discount_factor</span> <span class="o">*</span> <span class="n">future_q_value</span> <span class="o">-</span> <span class="bp">self</span><span class="o">.</span><span class="n">q_values</span><span class="p">[</span><span class="n">obs</span><span class="p">][</span><span class="n">action</span><span class="p">]</span>
|
||
<span class="p">)</span>
|
||
|
||
<span class="bp">self</span><span class="o">.</span><span class="n">q_values</span><span class="p">[</span><span class="n">obs</span><span class="p">][</span><span class="n">action</span><span class="p">]</span> <span class="o">=</span> <span class="p">(</span>
|
||
<span class="bp">self</span><span class="o">.</span><span class="n">q_values</span><span class="p">[</span><span class="n">obs</span><span class="p">][</span><span class="n">action</span><span class="p">]</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">lr</span> <span class="o">*</span> <span class="n">temporal_difference</span>
|
||
<span class="p">)</span>
|
||
<span class="bp">self</span><span class="o">.</span><span class="n">training_error</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">temporal_difference</span><span class="p">)</span>
|
||
|
||
<span class="k">def</span><span class="w"> </span><span class="nf">decay_epsilon</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
|
||
<span class="bp">self</span><span class="o">.</span><span class="n">epsilon</span> <span class="o">=</span> <span class="nb">max</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">final_epsilon</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">epsilon</span> <span class="o">-</span> <span class="bp">self</span><span class="o">.</span><span class="n">epsilon_decay</span><span class="p">)</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>To train the agent, we will let the agent play one episode (one complete
|
||
game is called an episode) at a time and then update its Q-values after
|
||
each step (one single action in a game is called a step).</p>
|
||
<p>The agent will have to experience a lot of episodes to explore the
|
||
environment sufficiently.</p>
|
||
<p>Now we should be ready to build the training loop.</p>
|
||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="c1"># hyperparameters</span>
|
||
<span class="n">learning_rate</span> <span class="o">=</span> <span class="mf">0.01</span>
|
||
<span class="n">n_episodes</span> <span class="o">=</span> <span class="mi">100_000</span>
|
||
<span class="n">start_epsilon</span> <span class="o">=</span> <span class="mf">1.0</span>
|
||
<span class="n">epsilon_decay</span> <span class="o">=</span> <span class="n">start_epsilon</span> <span class="o">/</span> <span class="p">(</span><span class="n">n_episodes</span> <span class="o">/</span> <span class="mi">2</span><span class="p">)</span> <span class="c1"># reduce the exploration over time</span>
|
||
<span class="n">final_epsilon</span> <span class="o">=</span> <span class="mf">0.1</span>
|
||
|
||
<span class="n">agent</span> <span class="o">=</span> <span class="n">BlackjackAgent</span><span class="p">(</span>
|
||
<span class="n">env</span><span class="o">=</span><span class="n">env</span><span class="p">,</span>
|
||
<span class="n">learning_rate</span><span class="o">=</span><span class="n">learning_rate</span><span class="p">,</span>
|
||
<span class="n">initial_epsilon</span><span class="o">=</span><span class="n">start_epsilon</span><span class="p">,</span>
|
||
<span class="n">epsilon_decay</span><span class="o">=</span><span class="n">epsilon_decay</span><span class="p">,</span>
|
||
<span class="n">final_epsilon</span><span class="o">=</span><span class="n">final_epsilon</span><span class="p">,</span>
|
||
<span class="p">)</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>Great, let’s train!</p>
|
||
<p>Info: The current hyperparameters are set to quickly train a decent agent.
|
||
If you want to converge to the optimal policy, try increasing
|
||
the n_episodes by 10x and lowering the learning_rate (e.g. to 0.001).</p>
|
||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">env</span> <span class="o">=</span> <span class="n">gym</span><span class="o">.</span><span class="n">wrappers</span><span class="o">.</span><span class="n">RecordEpisodeStatistics</span><span class="p">(</span><span class="n">env</span><span class="p">,</span> <span class="n">deque_size</span><span class="o">=</span><span class="n">n_episodes</span><span class="p">)</span>
|
||
<span class="k">for</span> <span class="n">episode</span> <span class="ow">in</span> <span class="n">tqdm</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="n">n_episodes</span><span class="p">)):</span>
|
||
<span class="n">obs</span><span class="p">,</span> <span class="n">info</span> <span class="o">=</span> <span class="n">env</span><span class="o">.</span><span class="n">reset</span><span class="p">()</span>
|
||
<span class="n">done</span> <span class="o">=</span> <span class="kc">False</span>
|
||
|
||
<span class="c1"># play one episode</span>
|
||
<span class="k">while</span> <span class="ow">not</span> <span class="n">done</span><span class="p">:</span>
|
||
<span class="n">action</span> <span class="o">=</span> <span class="n">agent</span><span class="o">.</span><span class="n">get_action</span><span class="p">(</span><span class="n">env</span><span class="p">,</span> <span class="n">obs</span><span class="p">)</span>
|
||
<span class="n">next_obs</span><span class="p">,</span> <span class="n">reward</span><span class="p">,</span> <span class="n">terminated</span><span class="p">,</span> <span class="n">truncated</span><span class="p">,</span> <span class="n">info</span> <span class="o">=</span> <span class="n">env</span><span class="o">.</span><span class="n">step</span><span class="p">(</span><span class="n">action</span><span class="p">)</span>
|
||
|
||
<span class="c1"># update the agent</span>
|
||
<span class="n">agent</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="n">obs</span><span class="p">,</span> <span class="n">action</span><span class="p">,</span> <span class="n">reward</span><span class="p">,</span> <span class="n">terminated</span><span class="p">,</span> <span class="n">next_obs</span><span class="p">)</span>
|
||
|
||
<span class="c1"># update if the environment is done and the current obs</span>
|
||
<span class="n">done</span> <span class="o">=</span> <span class="n">terminated</span> <span class="ow">or</span> <span class="n">truncated</span>
|
||
<span class="n">obs</span> <span class="o">=</span> <span class="n">next_obs</span>
|
||
|
||
<span class="n">agent</span><span class="o">.</span><span class="n">decay_epsilon</span><span class="p">()</span>
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
<section id="visualizing-the-training">
|
||
<h2>Visualizing the training<a class="headerlink" href="#visualizing-the-training" title="Link to this heading">¶</a></h2>
|
||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">rolling_length</span> <span class="o">=</span> <span class="mi">500</span>
|
||
<span class="n">fig</span><span class="p">,</span> <span class="n">axs</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">subplots</span><span class="p">(</span><span class="n">ncols</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">figsize</span><span class="o">=</span><span class="p">(</span><span class="mi">12</span><span class="p">,</span> <span class="mi">5</span><span class="p">))</span>
|
||
<span class="n">axs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">set_title</span><span class="p">(</span><span class="s2">"Episode rewards"</span><span class="p">)</span>
|
||
<span class="c1"># compute and assign a rolling average of the data to provide a smoother graph</span>
|
||
<span class="n">reward_moving_average</span> <span class="o">=</span> <span class="p">(</span>
|
||
<span class="n">np</span><span class="o">.</span><span class="n">convolve</span><span class="p">(</span>
|
||
<span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">env</span><span class="o">.</span><span class="n">return_queue</span><span class="p">)</span><span class="o">.</span><span class="n">flatten</span><span class="p">(),</span> <span class="n">np</span><span class="o">.</span><span class="n">ones</span><span class="p">(</span><span class="n">rolling_length</span><span class="p">),</span> <span class="n">mode</span><span class="o">=</span><span class="s2">"valid"</span>
|
||
<span class="p">)</span>
|
||
<span class="o">/</span> <span class="n">rolling_length</span>
|
||
<span class="p">)</span>
|
||
<span class="n">axs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">reward_moving_average</span><span class="p">)),</span> <span class="n">reward_moving_average</span><span class="p">)</span>
|
||
<span class="n">axs</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">set_title</span><span class="p">(</span><span class="s2">"Episode lengths"</span><span class="p">)</span>
|
||
<span class="n">length_moving_average</span> <span class="o">=</span> <span class="p">(</span>
|
||
<span class="n">np</span><span class="o">.</span><span class="n">convolve</span><span class="p">(</span>
|
||
<span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">env</span><span class="o">.</span><span class="n">length_queue</span><span class="p">)</span><span class="o">.</span><span class="n">flatten</span><span class="p">(),</span> <span class="n">np</span><span class="o">.</span><span class="n">ones</span><span class="p">(</span><span class="n">rolling_length</span><span class="p">),</span> <span class="n">mode</span><span class="o">=</span><span class="s2">"same"</span>
|
||
<span class="p">)</span>
|
||
<span class="o">/</span> <span class="n">rolling_length</span>
|
||
<span class="p">)</span>
|
||
<span class="n">axs</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">length_moving_average</span><span class="p">)),</span> <span class="n">length_moving_average</span><span class="p">)</span>
|
||
<span class="n">axs</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span><span class="o">.</span><span class="n">set_title</span><span class="p">(</span><span class="s2">"Training Error"</span><span class="p">)</span>
|
||
<span class="n">training_error_moving_average</span> <span class="o">=</span> <span class="p">(</span>
|
||
<span class="n">np</span><span class="o">.</span><span class="n">convolve</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">agent</span><span class="o">.</span><span class="n">training_error</span><span class="p">),</span> <span class="n">np</span><span class="o">.</span><span class="n">ones</span><span class="p">(</span><span class="n">rolling_length</span><span class="p">),</span> <span class="n">mode</span><span class="o">=</span><span class="s2">"same"</span><span class="p">)</span>
|
||
<span class="o">/</span> <span class="n">rolling_length</span>
|
||
<span class="p">)</span>
|
||
<span class="n">axs</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">training_error_moving_average</span><span class="p">)),</span> <span class="n">training_error_moving_average</span><span class="p">)</span>
|
||
<span class="n">plt</span><span class="o">.</span><span class="n">tight_layout</span><span class="p">()</span>
|
||
<span class="n">plt</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
|
||
</pre></div>
|
||
</div>
|
||
<img alt="Training plots showing rolling averages of episode rewards, episode lengths and training error" src="../../../_images/blackjack_training_plots.png" />
|
||
</section>
|
||
<section id="visualising-the-policy">
|
||
<h2>Visualising the policy<a class="headerlink" href="#visualising-the-policy" title="Link to this heading">¶</a></h2>
|
||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="k">def</span><span class="w"> </span><span class="nf">create_grids</span><span class="p">(</span><span class="n">agent</span><span class="p">,</span> <span class="n">usable_ace</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span>
|
||
<span class="w"> </span><span class="sd">"""Create value and policy grid given an agent."""</span>
|
||
<span class="c1"># convert our state-action values to state values</span>
|
||
<span class="c1"># and build a policy dictionary that maps observations to actions</span>
|
||
<span class="n">state_value</span> <span class="o">=</span> <span class="n">defaultdict</span><span class="p">(</span><span class="nb">float</span><span class="p">)</span>
|
||
<span class="n">policy</span> <span class="o">=</span> <span class="n">defaultdict</span><span class="p">(</span><span class="nb">int</span><span class="p">)</span>
|
||
<span class="k">for</span> <span class="n">obs</span><span class="p">,</span> <span class="n">action_values</span> <span class="ow">in</span> <span class="n">agent</span><span class="o">.</span><span class="n">q_values</span><span class="o">.</span><span class="n">items</span><span class="p">():</span>
|
||
<span class="n">state_value</span><span class="p">[</span><span class="n">obs</span><span class="p">]</span> <span class="o">=</span> <span class="nb">float</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">max</span><span class="p">(</span><span class="n">action_values</span><span class="p">))</span>
|
||
<span class="n">policy</span><span class="p">[</span><span class="n">obs</span><span class="p">]</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">argmax</span><span class="p">(</span><span class="n">action_values</span><span class="p">))</span>
|
||
|
||
<span class="n">player_count</span><span class="p">,</span> <span class="n">dealer_count</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">meshgrid</span><span class="p">(</span>
|
||
<span class="c1"># players count, dealers face-up card</span>
|
||
<span class="n">np</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="mi">12</span><span class="p">,</span> <span class="mi">22</span><span class="p">),</span>
|
||
<span class="n">np</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">11</span><span class="p">),</span>
|
||
<span class="p">)</span>
|
||
|
||
<span class="c1"># create the value grid for plotting</span>
|
||
<span class="n">value</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">apply_along_axis</span><span class="p">(</span>
|
||
<span class="k">lambda</span> <span class="n">obs</span><span class="p">:</span> <span class="n">state_value</span><span class="p">[(</span><span class="n">obs</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">obs</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="n">usable_ace</span><span class="p">)],</span>
|
||
<span class="n">axis</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span>
|
||
<span class="n">arr</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">dstack</span><span class="p">([</span><span class="n">player_count</span><span class="p">,</span> <span class="n">dealer_count</span><span class="p">]),</span>
|
||
<span class="p">)</span>
|
||
<span class="n">value_grid</span> <span class="o">=</span> <span class="n">player_count</span><span class="p">,</span> <span class="n">dealer_count</span><span class="p">,</span> <span class="n">value</span>
|
||
|
||
<span class="c1"># create the policy grid for plotting</span>
|
||
<span class="n">policy_grid</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">apply_along_axis</span><span class="p">(</span>
|
||
<span class="k">lambda</span> <span class="n">obs</span><span class="p">:</span> <span class="n">policy</span><span class="p">[(</span><span class="n">obs</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">obs</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="n">usable_ace</span><span class="p">)],</span>
|
||
<span class="n">axis</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span>
|
||
<span class="n">arr</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">dstack</span><span class="p">([</span><span class="n">player_count</span><span class="p">,</span> <span class="n">dealer_count</span><span class="p">]),</span>
|
||
<span class="p">)</span>
|
||
<span class="k">return</span> <span class="n">value_grid</span><span class="p">,</span> <span class="n">policy_grid</span>
|
||
|
||
|
||
<span class="k">def</span><span class="w"> </span><span class="nf">create_plots</span><span class="p">(</span><span class="n">value_grid</span><span class="p">,</span> <span class="n">policy_grid</span><span class="p">,</span> <span class="n">title</span><span class="p">:</span> <span class="nb">str</span><span class="p">):</span>
|
||
<span class="w"> </span><span class="sd">"""Creates a plot using a value and policy grid."""</span>
|
||
<span class="c1"># create a new figure with 2 subplots (left: state values, right: policy)</span>
|
||
<span class="n">player_count</span><span class="p">,</span> <span class="n">dealer_count</span><span class="p">,</span> <span class="n">value</span> <span class="o">=</span> <span class="n">value_grid</span>
|
||
<span class="n">fig</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">figure</span><span class="p">(</span><span class="n">figsize</span><span class="o">=</span><span class="n">plt</span><span class="o">.</span><span class="n">figaspect</span><span class="p">(</span><span class="mf">0.4</span><span class="p">))</span>
|
||
<span class="n">fig</span><span class="o">.</span><span class="n">suptitle</span><span class="p">(</span><span class="n">title</span><span class="p">,</span> <span class="n">fontsize</span><span class="o">=</span><span class="mi">16</span><span class="p">)</span>
|
||
|
||
<span class="c1"># plot the state values</span>
|
||
<span class="n">ax1</span> <span class="o">=</span> <span class="n">fig</span><span class="o">.</span><span class="n">add_subplot</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="n">projection</span><span class="o">=</span><span class="s2">"3d"</span><span class="p">)</span>
|
||
<span class="n">ax1</span><span class="o">.</span><span class="n">plot_surface</span><span class="p">(</span>
|
||
<span class="n">player_count</span><span class="p">,</span>
|
||
<span class="n">dealer_count</span><span class="p">,</span>
|
||
<span class="n">value</span><span class="p">,</span>
|
||
<span class="n">rstride</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
|
||
<span class="n">cstride</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
|
||
<span class="n">cmap</span><span class="o">=</span><span class="s2">"viridis"</span><span class="p">,</span>
|
||
<span class="n">edgecolor</span><span class="o">=</span><span class="s2">"none"</span><span class="p">,</span>
|
||
<span class="p">)</span>
|
||
<span class="n">plt</span><span class="o">.</span><span class="n">xticks</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">12</span><span class="p">,</span> <span class="mi">22</span><span class="p">),</span> <span class="nb">range</span><span class="p">(</span><span class="mi">12</span><span class="p">,</span> <span class="mi">22</span><span class="p">))</span>
|
||
<span class="n">plt</span><span class="o">.</span><span class="n">yticks</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">11</span><span class="p">),</span> <span class="p">[</span><span class="s2">"A"</span><span class="p">]</span> <span class="o">+</span> <span class="nb">list</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">11</span><span class="p">)))</span>
|
||
<span class="n">ax1</span><span class="o">.</span><span class="n">set_title</span><span class="p">(</span><span class="sa">f</span><span class="s2">"State values: </span><span class="si">{</span><span class="n">title</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span>
|
||
<span class="n">ax1</span><span class="o">.</span><span class="n">set_xlabel</span><span class="p">(</span><span class="s2">"Player sum"</span><span class="p">)</span>
|
||
<span class="n">ax1</span><span class="o">.</span><span class="n">set_ylabel</span><span class="p">(</span><span class="s2">"Dealer showing"</span><span class="p">)</span>
|
||
<span class="n">ax1</span><span class="o">.</span><span class="n">zaxis</span><span class="o">.</span><span class="n">set_rotate_label</span><span class="p">(</span><span class="kc">False</span><span class="p">)</span>
|
||
<span class="n">ax1</span><span class="o">.</span><span class="n">set_zlabel</span><span class="p">(</span><span class="s2">"Value"</span><span class="p">,</span> <span class="n">fontsize</span><span class="o">=</span><span class="mi">14</span><span class="p">,</span> <span class="n">rotation</span><span class="o">=</span><span class="mi">90</span><span class="p">)</span>
|
||
<span class="n">ax1</span><span class="o">.</span><span class="n">view_init</span><span class="p">(</span><span class="mi">20</span><span class="p">,</span> <span class="mi">220</span><span class="p">)</span>
|
||
|
||
<span class="c1"># plot the policy</span>
|
||
<span class="n">fig</span><span class="o">.</span><span class="n">add_subplot</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
|
||
<span class="n">ax2</span> <span class="o">=</span> <span class="n">sns</span><span class="o">.</span><span class="n">heatmap</span><span class="p">(</span><span class="n">policy_grid</span><span class="p">,</span> <span class="n">linewidth</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> <span class="n">annot</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">cmap</span><span class="o">=</span><span class="s2">"Accent_r"</span><span class="p">,</span> <span class="n">cbar</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
|
||
<span class="n">ax2</span><span class="o">.</span><span class="n">set_title</span><span class="p">(</span><span class="sa">f</span><span class="s2">"Policy: </span><span class="si">{</span><span class="n">title</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span>
|
||
<span class="n">ax2</span><span class="o">.</span><span class="n">set_xlabel</span><span class="p">(</span><span class="s2">"Player sum"</span><span class="p">)</span>
|
||
<span class="n">ax2</span><span class="o">.</span><span class="n">set_ylabel</span><span class="p">(</span><span class="s2">"Dealer showing"</span><span class="p">)</span>
|
||
<span class="n">ax2</span><span class="o">.</span><span class="n">set_xticklabels</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">12</span><span class="p">,</span> <span class="mi">22</span><span class="p">))</span>
|
||
<span class="n">ax2</span><span class="o">.</span><span class="n">set_yticklabels</span><span class="p">([</span><span class="s2">"A"</span><span class="p">]</span> <span class="o">+</span> <span class="nb">list</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">11</span><span class="p">)),</span> <span class="n">fontsize</span><span class="o">=</span><span class="mi">12</span><span class="p">)</span>
|
||
|
||
<span class="c1"># add a legend</span>
|
||
<span class="n">legend_elements</span> <span class="o">=</span> <span class="p">[</span>
|
||
<span class="n">Patch</span><span class="p">(</span><span class="n">facecolor</span><span class="o">=</span><span class="s2">"lightgreen"</span><span class="p">,</span> <span class="n">edgecolor</span><span class="o">=</span><span class="s2">"black"</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s2">"Hit"</span><span class="p">),</span>
|
||
<span class="n">Patch</span><span class="p">(</span><span class="n">facecolor</span><span class="o">=</span><span class="s2">"grey"</span><span class="p">,</span> <span class="n">edgecolor</span><span class="o">=</span><span class="s2">"black"</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s2">"Stick"</span><span class="p">),</span>
|
||
<span class="p">]</span>
|
||
<span class="n">ax2</span><span class="o">.</span><span class="n">legend</span><span class="p">(</span><span class="n">handles</span><span class="o">=</span><span class="n">legend_elements</span><span class="p">,</span> <span class="n">bbox_to_anchor</span><span class="o">=</span><span class="p">(</span><span class="mf">1.3</span><span class="p">,</span> <span class="mi">1</span><span class="p">))</span>
|
||
<span class="k">return</span> <span class="n">fig</span>
|
||
|
||
|
||
<span class="c1"># state values & policy with usable ace (ace counts as 11)</span>
|
||
<span class="n">value_grid</span><span class="p">,</span> <span class="n">policy_grid</span> <span class="o">=</span> <span class="n">create_grids</span><span class="p">(</span><span class="n">agent</span><span class="p">,</span> <span class="n">usable_ace</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
|
||
<span class="n">fig1</span> <span class="o">=</span> <span class="n">create_plots</span><span class="p">(</span><span class="n">value_grid</span><span class="p">,</span> <span class="n">policy_grid</span><span class="p">,</span> <span class="n">title</span><span class="o">=</span><span class="s2">"With usable ace"</span><span class="p">)</span>
|
||
<span class="n">plt</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
|
||
</pre></div>
|
||
</div>
|
||
<img alt="Blackjack state values and learned policy with a usable ace" src="../../../_images/blackjack_with_usable_ace.png" />
|
||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="c1"># state values & policy without usable ace (ace counts as 1)</span>
|
||
<span class="n">value_grid</span><span class="p">,</span> <span class="n">policy_grid</span> <span class="o">=</span> <span class="n">create_grids</span><span class="p">(</span><span class="n">agent</span><span class="p">,</span> <span class="n">usable_ace</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
|
||
<span class="n">fig2</span> <span class="o">=</span> <span class="n">create_plots</span><span class="p">(</span><span class="n">value_grid</span><span class="p">,</span> <span class="n">policy_grid</span><span class="p">,</span> <span class="n">title</span><span class="o">=</span><span class="s2">"Without usable ace"</span><span class="p">)</span>
|
||
<span class="n">plt</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
|
||
</pre></div>
|
||
</div>
|
||
<img alt="Blackjack state values and learned policy without a usable ace" src="../../../_images/blackjack_without_usable_ace.png" />
|
||
<p>It’s good practice to call env.close() at the end of your script,
|
||
so that any resources used by the environment are released.</p>
|
||
</section>
|
||
<section id="think-you-can-do-better">
|
||
<h2>Think you can do better?<a class="headerlink" href="#think-you-can-do-better" title="Link to this heading">¶</a></h2>
|
||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="c1"># You can visualize the environment using the play function</span>
|
||
<span class="c1"># and try to win a few games.</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>Hopefully this tutorial helped you get a grip on how to interact with
|
||
Gymnasium environments and set you on a journey to solve many more RL
|
||
challenges.</p>
|
||
<p>It is recommended that you solve this environment by yourself (project
|
||
based learning is really effective!). You can apply your favorite
|
||
discrete RL algorithm or give Monte Carlo ES a try (covered in <a class="reference external" href="http://incompleteideas.net/book/the-book-2nd.html">Sutton &
|
||
Barto</a>, section
|
||
5.3) - this way you can compare your results directly to the book.</p>
|
||
<p>Best of fun!</p>
|
||
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-tutorials-training-agents-blackjack-tutorial-py">
|
||
<div class="sphx-glr-download sphx-glr-download-python docutils container">
|
||
<p><a class="reference download internal" download="" href="../../../_downloads/e1249c888e952c938d27855c3210a4bb/blackjack_tutorial.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">blackjack_tutorial.py</span></code></a></p>
|
||
</div>
|
||
<div class="sphx-glr-download sphx-glr-download-jupyter docutils container">
|
||
<p><a class="reference download internal" download="" href="../../../_downloads/d1980709c80836b9d7e8f9131878afbb/blackjack_tutorial.ipynb"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Jupyter</span> <span class="pre">notebook:</span> <span class="pre">blackjack_tutorial.ipynb</span></code></a></p>
|
||
</div>
|
||
</div>
|
||
</section>
|
||
</section>
|
||
|
||
</article>
|
||
</div>
|
||
<footer>
|
||
|
||
<div class="related-pages">
|
||
<a class="next-page" href="../FrozenLake_tuto/">
|
||
<div class="page-info">
|
||
<div class="context">
|
||
<span>Next</span>
|
||
</div>
|
||
<div class="title">Frozenlake benchmark</div>
|
||
</div>
|
||
<svg class="furo-related-icon">
|
||
<use href="#svg-arrow-right"></use>
|
||
</svg>
|
||
</a>
|
||
<a class="prev-page" href="../reinforce_invpend_gym_v26/">
|
||
<svg class="furo-related-icon">
|
||
<use href="#svg-arrow-right"></use>
|
||
</svg>
|
||
<div class="page-info">
|
||
<div class="context">
|
||
<span>Previous</span>
|
||
</div>
|
||
|
||
<div class="title">Training using REINFORCE for Mujoco</div>
|
||
|
||
</div>
|
||
</a>
|
||
</div>
|
||
<div class="bottom-of-page">
|
||
<div class="left-details">
|
||
<div class="copyright">
|
||
Copyright © 2025 Farama Foundation
|
||
</div>
|
||
<!--
|
||
Made with <a href="https://www.sphinx-doc.org/">Sphinx</a> and <a class="muted-link" href="https://pradyunsg.me">@pradyunsg</a>'s
|
||
|
||
<a href="https://github.com/pradyunsg/furo">Furo</a>
|
||
-->
|
||
</div>
|
||
<div class="right-details">
|
||
<div class="icons">
|
||
<a class="muted-link" href="https://github.com/Farama-Foundation/Gymnasium/"
|
||
aria-label="On GitHub">
|
||
<svg stroke="currentColor" fill="currentColor" stroke-width="0" viewBox="0 0 16 16">
|
||
<path fill-rule="evenodd"
|
||
d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0 0 16 8c0-4.42-3.58-8-8-8z">
|
||
</path>
|
||
</svg>
|
||
</a>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
</footer>
|
||
</div>
|
||
<aside class="toc-drawer">
|
||
|
||
|
||
<div class="toc-sticky toc-scroll">
|
||
<div class="toc-title-container">
|
||
<span class="toc-title">
|
||
On this page
|
||
</span>
|
||
</div>
|
||
<div class="toc-tree-container">
|
||
<div class="toc-tree">
|
||
<ul>
|
||
<li><a class="reference internal" href="#">Solving Blackjack with Q-Learning</a><ul>
|
||
<li><a class="reference internal" href="#imports-and-environment-setup">Imports and Environment Setup</a></li>
|
||
<li><a class="reference internal" href="#observing-the-environment">Observing the environment</a></li>
|
||
<li><a class="reference internal" href="#executing-an-action">Executing an action</a></li>
|
||
<li><a class="reference internal" href="#building-an-agent">Building an agent</a></li>
|
||
<li><a class="reference internal" href="#visualizing-the-training">Visualizing the training</a></li>
|
||
<li><a class="reference internal" href="#visualising-the-policy">Visualising the policy</a></li>
|
||
<li><a class="reference internal" href="#think-you-can-do-better">Think you can do better?</a></li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
|
||
</aside>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
<script>
|
||
// Opens or closes the Farama header menu and mirrors the new state in the
// ARIA attributes of the menu button and the menu container.
const toggleMenu = () => {
  const menu = document.querySelector(".farama-header-menu");
  const menuBtn = document.querySelector(".farama-header-menu__btn");
  const menuContainer = document.querySelector(".farama-header-menu-container");

  // "active" on the menu means it is currently open, i.e. this click closes it.
  const wasOpen = menu.classList.contains("active");
  menuBtn.setAttribute("aria-expanded", wasOpen ? "false" : "true");
  menuContainer.setAttribute("aria-hidden", wasOpen ? "true" : "false");
  menu.classList.toggle("active");
};

// Both the header button and the close button inside the menu toggle it.
document.querySelector(".farama-header-menu__btn").addEventListener("click", toggleMenu);
document.getElementById("farama-close-menu").addEventListener("click", toggleMenu);
|
||
</script>
|
||
|
||
|
||
<script async src="https://www.googletagmanager.com/gtag/js?id=G-6H9C8TWXZ8"></script>
|
||
<script>
|
||
// Sets up the gtag data layer and registers the G-6H9C8TWXZ8 property.
const enableGtag = () => {
  window.dataLayer = window.dataLayer || [];
  // Kept as a classic function: it pushes the `arguments` object itself,
  // exactly as the original snippet does.
  const gtag = function () {
    dataLayer.push(arguments);
  };
  gtag('js', new Date());
  gtag('config', 'G-6H9C8TWXZ8');
};
|
||
// Cookie-consent banner: shown until the visitor chooses, and analytics is
// enabled only when the stored choice is "true".
(() => {
  const consent = localStorage.getItem("acceptedCookieAlert");

  if (consent) {
    // A choice was stored earlier; localStorage keeps it as a string.
    if (consent === "true") {
      enableGtag();
    }
    return;
  }

  const boxElem = document.createElement("div");
  boxElem.classList.add("cookie-alert");
  const containerElem = document.createElement("div");
  containerElem.classList.add("cookie-alert__container");

  const textElem = document.createElement("p");
  textElem.innerHTML = `This page uses <a href="https://analytics.google.com/">
    Google Analytics</a> to collect statistics.`;
  containerElem.appendChild(textElem);

  // Builds one banner button that stores the visitor's choice, removes the
  // banner, and starts analytics when the choice is "accepted".
  const makeChoiceButton = (label, id, accepted) => {
    const btn = Object.assign(document.createElement("button"), {
      innerText: label,
      className: "farama-btn cookie-alert__button",
      id: id,
    });
    btn.addEventListener("click", () => {
      localStorage.setItem("acceptedCookieAlert", accepted);
      boxElem.remove();
      if (accepted) {
        enableGtag();
      }
    });
    return btn;
  };

  const declineBtn = makeChoiceButton("Deny", "cookie-alert__decline", false);
  const acceptBtn = makeChoiceButton("Allow", "cookie-alert__accept", true);

  containerElem.appendChild(declineBtn);
  containerElem.appendChild(acceptBtn);
  boxElem.appendChild(containerElem);
  document.body.appendChild(boxElem);
})()
|
||
</script>
|
||
|
||
<script src="../../../_static/documentation_options.js?v=25d39d6f"></script>
|
||
<script src="../../../_static/doctools.js?v=9a2dae69"></script>
|
||
<script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
|
||
<script src="../../../_static/scripts/furo.js?v=7660844c"></script>
|
||
|
||
<script>
|
||
|
||
// Builds a <ul class="farama-header-menu-list"> with one linked entry per
// project. Each project supplies `link` and `name`, and optionally `image`.
// When `displayImages` is true a logo is placed before the project name,
// falling back to the Farama logo if the project has no image.
const createProjectsList = (projects, displayImages) => {
  const listElem = document.createElement('ul');
  listElem.className = 'farama-header-menu-list';

  for (const project of projects) {
    const itemElem = document.createElement("li");
    const linkElem = document.createElement("a");
    linkElem.href = project.link;
    itemElem.appendChild(linkElem);

    if (displayImages) {
      const logoPath = project.image ? project.image : "/farama_black.svg";
      const logoElem = document.createElement("img");
      logoElem.src = imagesBasepath + logoPath;
      logoElem.alt = `${project.name} logo`;
      logoElem.className = "farama-black-logo-invert";
      linkElem.appendChild(logoElem);
    }

    linkElem.appendChild(document.createTextNode(project.name));
    listElem.appendChild(itemElem);
  }

  return listElem;
}
|
||
|
||
// Create menu with Farama projects by using the API at farama.org/api/projects.json
|
||
// Returns an opened cross-origin request object for `method` and `url`,
// or null when the browser offers neither a CORS-capable XMLHttpRequest
// nor XDomainRequest.
const createCORSRequest = (method, url) => {
  const request = new XMLHttpRequest();
  request.responseType = 'json';

  if ("withCredentials" in request) {
    request.open(method, url, true);
    return request;
  }

  if (typeof XDomainRequest != "undefined") {
    // IE8 & IE9
    const legacyRequest = new XDomainRequest();
    legacyRequest.open(method, url);
    return legacyRequest;
  }

  // CORS not supported.
  return null;
};
|
||
|
||
const url = 'https://farama.org/api/projects.json';
const imagesBasepath = "https://farama.org/assets/images";
const method = 'GET';
let xhr = createCORSRequest(method, url);

if (xhr === null) {
  // createCORSRequest returns null when no CORS-capable request object exists;
  // report it instead of failing on a null dereference.
  console.error("Unable to load projects");
} else {
  // Fills the header menu once the projects JSON has been received.
  xhr.onload = () => {
    const jsonResponse = xhr.response;
    // The response is not an object when the body could not be parsed as JSON.
    if (jsonResponse === null || typeof jsonResponse !== "object") {
      console.error("Unable to load projects");
      return;
    }

    // Menu layout: an array is a flat project list, an object holds
    // titled sub-sections that each contain a project list.
    const sections = {
      "Core Projects": [],
      "Mature Projects": {
        "Documentation": [],
        "Repositories": [],
      },
      "Incubating Projects": {
        "Documentation": [],
        "Repositories": [],
      },
      "Foundation": [
        {
          name: "About",
          link: "https://farama.org/about"
        },
        {
          name: "Standards",
          link: "https://farama.org/project_standards",
        },
        {
          name: "Donate",
          link: "https://farama.org/donations"
        }
      ]
    };

    // Categorize projects
    Object.keys(jsonResponse).forEach(key => {
      // Declared locally; the previous undeclared assignment leaked a global.
      const projectJson = jsonResponse[key];
      const hasWebsite = projectJson.website !== null;
      // Link to the documentation site when there is one, else to the repository.
      projectJson.link = hasWebsite ? projectJson.website : projectJson.github;

      if (projectJson.type === "core") {
        sections["Core Projects"].push(projectJson);
      } else {
        // Every type other than "core" and "mature" is treated as incubating.
        const group = projectJson.type === "mature" ? "Mature Projects" : "Incubating Projects";
        const subGroup = hasWebsite ? "Documentation" : "Repositories";
        sections[group][subGroup].push(projectJson);
      }
    });

    const menuContainer = document.querySelector(".farama-header-menu__body");

    Object.keys(sections).forEach(key => {
      const sectionElem = Object.assign(
        document.createElement('div'), {
          className: 'farama-header-menu__section',
        }
      );
      sectionElem.appendChild(Object.assign(document.createElement('span'),
        {
          className: 'farama-header-menu__section-title',
          innerText: key
        }
      ));

      if (!Array.isArray(sections[key])) {
        // Section with titled sub-sections, laid out side by side.
        const subSections = sections[key];
        const subSectionContainerElem = Object.assign(
          document.createElement('div'), {
            className: 'farama-header-menu__subsections-container',
            style: 'display: flex'
          }
        );
        Object.keys(subSections).forEach(subKey => {
          const subSectionElem = Object.assign(
            document.createElement('div'), {
              className: 'farama-header-menu__subsection',
            }
          );
          subSectionElem.appendChild(Object.assign(document.createElement('span'),
            {
              className: 'farama-header-menu__subsection-title',
              innerText: subKey
            }
          ));
          const ulElem = createProjectsList(subSections[subKey], key !== 'Foundation');
          subSectionElem.appendChild(ulElem);
          subSectionContainerElem.appendChild(subSectionElem);
        });
        sectionElem.appendChild(subSectionContainerElem);
      } else {
        // Flat section: a single project list.
        const projects = sections[key];
        const ulElem = createProjectsList(projects, true);
        sectionElem.appendChild(ulElem);
      }
      menuContainer.appendChild(sectionElem);
    });
  };

  xhr.onerror = function() {
    console.error("Unable to load projects");
  };

  xhr.send();
}
|
||
</script>
|
||
|
||
|
||
<script>
|
||
// NOTE(review): not read in this script; presumably consumed by the injected
// versioning menu's own scripts — confirm against versioning_menu.html.
const versioningConfig = {
  githubUser: 'Farama-Foundation',
  githubRepo: 'Gymnasium',
};

// Fetches the versioning menu fragment and appends it to the page.
fetch('/main/_static/versioning/versioning_menu.html').then(response => {
  if (response.status !== 200) {
    console.warn("Unable to load versioning menu", response);
    return;
  }
  // Returned so that a failure while reading the body reaches the catch below.
  return response.text().then(text => {
    const container = document.createElement("div");
    container.innerHTML = text;
    document.querySelector("body").appendChild(container);
    // innerHTML doesn't evaluate scripts, so we need to re-create them dynamically
    Array.from(container.querySelectorAll("script")).forEach(oldScript => {
      const newScript = document.createElement("script");
      Array.from(oldScript.attributes).forEach(attr => newScript.setAttribute(attr.name, attr.value));
      newScript.appendChild(document.createTextNode(oldScript.innerHTML));
      oldScript.parentNode.replaceChild(newScript, oldScript);
    });
  });
}).catch(error => {
  // fetch rejects on network failure; without this handler the rejection went unhandled.
  console.warn("Unable to load versioning menu", error);
});
|
||
</script>
|
||
|
||
</body>
|
||
</html> |