mirror of
https://github.com/Farama-Foundation/Gymnasium.git
synced 2025-08-20 14:02:03 +00:00
920 lines
65 KiB
HTML
920 lines
65 KiB
HTML
<!doctype html>
|
||
<html class="no-js" lang="en" data-content_root="../../">
|
||
<head><meta charset="utf-8"/>
|
||
<meta name="viewport" content="width=device-width,initial-scale=1"/>
|
||
<meta name="color-scheme" content="light dark">
|
||
<meta name="description" content="A standard API for reinforcement learning and a diverse set of reference environments (formerly Gym)">
|
||
<meta property="og:title" content="Gymnasium Documentation" />
|
||
<meta property="og:type" content="website" />
|
||
<meta property="og:description" content="A standard API for reinforcement learning and a diverse set of reference environments (formerly Gym)" />
|
||
<meta property="og:url" content="https://gymnasium.farama.org/introduction/train_agent.html" /><meta property="og:image" content="https://gymnasium.farama.org/_static/img/gymnasium-github.png" /><meta name="twitter:card" content="summary_large_image"><meta name="viewport" content="width=device-width, initial-scale=1" />
|
||
<link rel="index" title="Index" href="../../genindex/" /><link rel="search" title="Search" href="../../search/" /><link rel="next" title="Create a Custom Environment" href="../create_custom_env/" /><link rel="prev" title="Basic Usage" href="../basic_usage/" />
|
||
<link rel="canonical" href="https://gymnasium.farama.org/introduction/train_agent.html" />
|
||
|
||
<link rel="shortcut icon" href="../../_static/favicon.png"/><!-- Generated with Sphinx 7.4.7 and Furo 2023.08.19.dev1 -->
|
||
<title>Training an Agent - Gymnasium Documentation</title>
|
||
<link rel="stylesheet" type="text/css" href="../../_static/pygments.css?v=8f2a1f02" />
|
||
<link rel="stylesheet" type="text/css" href="../../_static/styles/furo.css?v=3e7f4c72" />
|
||
<link rel="stylesheet" type="text/css" href="../../_static/sg_gallery.css?v=61a4c737" />
|
||
<link rel="stylesheet" type="text/css" href="../../_static/sg_gallery-binder.css?v=f4aeca0c" />
|
||
<link rel="stylesheet" type="text/css" href="../../_static/sg_gallery-dataframe.css?v=2082cf3c" />
|
||
<link rel="stylesheet" type="text/css" href="../../_static/sg_gallery-rendered-html.css?v=1277b6f3" />
|
||
<link rel="stylesheet" type="text/css" href="../../_static/styles/furo-extensions.css?v=82c8b628" />
|
||
|
||
|
||
|
||
|
||
<style>
|
||
body {
|
||
--color-code-background: #f8f8f8;
|
||
--color-code-foreground: black;
|
||
|
||
}
|
||
@media not print {
|
||
body[data-theme="dark"] {
|
||
--color-code-background: #202020;
|
||
--color-code-foreground: #d0d0d0;
|
||
|
||
}
|
||
@media (prefers-color-scheme: dark) {
|
||
body:not([data-theme="light"]) {
|
||
--color-code-background: #202020;
|
||
--color-code-foreground: #d0d0d0;
|
||
|
||
}
|
||
}
|
||
}
|
||
</style></head>
|
||
<body>
|
||
<header class="farama-header" aria-label="Farama header">
|
||
<div class="farama-header__container">
|
||
<div class="farama-header__left--mobile">
|
||
<label class="nav-overlay-icon" for="__navigation">
|
||
<div class="visually-hidden">Toggle site navigation sidebar</div>
|
||
<svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg">
|
||
<defs></defs>
|
||
<line x1="0.5" y1="4" x2="23.5" y2="4"></line>
|
||
<line x1="0.232" y1="12" x2="23.5" y2="12"></line>
|
||
<line x1="0.232" y1="20" x2="23.5" y2="20"></line>
|
||
</svg>
|
||
</label>
|
||
</div>
|
||
<div class="farama-header__left farama-header__center--mobile">
|
||
<a href="../../">
|
||
<img class="farama-header__logo only-light" src="../../_static/img/gymnasium_black.svg" alt="Light Logo"/>
|
||
<img class="farama-header__logo only-dark" src="../../_static/img/gymnasium_white.svg" alt="Dark Logo"/>
|
||
<span class="farama-header__title">Gymnasium Documentation</span>
|
||
</a>
|
||
</div>
|
||
<div class="farama-header__right">
|
||
<div class="farama-header-menu">
|
||
<button class="farama-header-menu__btn" aria-label="Open Farama Menu" aria-expanded="false" aria-haspopup="true" aria-controls="farama-menu">
|
||
<img class="farama-black-logo-invert" src="../../_static/img/farama-logo-header.svg" alt="">
|
||
<svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg">
|
||
<polyline style="stroke-linecap: round; stroke-linejoin: round; fill: none; stroke-width: 2px;" points="1 7 12 18 23 7"></polyline>
|
||
</svg>
|
||
</button>
|
||
<div class="farama-header-menu-container farama-hidden" aria-hidden="true" id="farama-menu">
|
||
<div class="farama-header-menu__header">
|
||
<a href="https://farama.org">
|
||
<img class="farama-header-menu__logo farama-white-logo-invert" src="../../_static/img/farama_solid_white.svg" alt="Farama Foundation logo">
|
||
<span>Farama Foundation</span>
|
||
</a>
|
||
<div class="farama-header-menu-header__right">
|
||
<button id="farama-close-menu" aria-label="Close Farama Menu">
|
||
<svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg" fill="none" stroke="currentColor"
|
||
stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="icon-close">
|
||
<line x1="3" y1="21" x2="21" y2="3"></line>
|
||
<line x1="3" y1="3" x2="21" y2="21"></line>
|
||
</svg>
|
||
</button>
|
||
</div>
|
||
</div>
|
||
<div class="farama-header-menu__body">
|
||
<!-- Response from farama.org/api/projects.json -->
|
||
</div>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
</header>
|
||
|
||
|
||
<script>
|
||
document.body.dataset.theme = localStorage.getItem("theme") || "auto";
|
||
</script>
|
||
|
||
|
||
<svg xmlns="http://www.w3.org/2000/svg" style="display: none;">
|
||
<symbol id="svg-toc" viewBox="0 0 24 24">
|
||
<title>Contents</title>
|
||
<svg stroke="currentColor" fill="currentColor" stroke-width="0" viewBox="0 0 1024 1024">
|
||
<path d="M408 442h480c4.4 0 8-3.6 8-8v-56c0-4.4-3.6-8-8-8H408c-4.4 0-8 3.6-8 8v56c0 4.4 3.6 8 8 8zm-8 204c0 4.4 3.6 8 8 8h480c4.4 0 8-3.6 8-8v-56c0-4.4-3.6-8-8-8H408c-4.4 0-8 3.6-8 8v56zm504-486H120c-4.4 0-8 3.6-8 8v56c0 4.4 3.6 8 8 8h784c4.4 0 8-3.6 8-8v-56c0-4.4-3.6-8-8-8zm0 632H120c-4.4 0-8 3.6-8 8v56c0 4.4 3.6 8 8 8h784c4.4 0 8-3.6 8-8v-56c0-4.4-3.6-8-8-8zM115.4 518.9L271.7 642c5.8 4.6 14.4.5 14.4-6.9V388.9c0-7.4-8.5-11.5-14.4-6.9L115.4 505.1a8.74 8.74 0 0 0 0 13.8z"/>
|
||
</svg>
|
||
</symbol>
|
||
<symbol id="svg-menu" viewBox="0 0 24 24">
|
||
<title>Menu</title>
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
|
||
stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather-menu">
|
||
<line x1="3" y1="12" x2="21" y2="12"></line>
|
||
<line x1="3" y1="6" x2="21" y2="6"></line>
|
||
<line x1="3" y1="18" x2="21" y2="18"></line>
|
||
</svg>
|
||
</symbol>
|
||
<symbol id="svg-arrow-right" viewBox="0 0 24 24">
|
||
<title>Expand</title>
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
|
||
stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather-chevron-right">
|
||
<polyline points="9 18 15 12 9 6"></polyline>
|
||
</svg>
|
||
</symbol>
|
||
<symbol id="svg-sun" viewBox="0 0 24 24">
|
||
<title>Light mode</title>
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
|
||
stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round" class="feather-sun">
|
||
<circle cx="12" cy="12" r="5"></circle>
|
||
<line x1="12" y1="1" x2="12" y2="3"></line>
|
||
<line x1="12" y1="21" x2="12" y2="23"></line>
|
||
<line x1="4.22" y1="4.22" x2="5.64" y2="5.64"></line>
|
||
<line x1="18.36" y1="18.36" x2="19.78" y2="19.78"></line>
|
||
<line x1="1" y1="12" x2="3" y2="12"></line>
|
||
<line x1="21" y1="12" x2="23" y2="12"></line>
|
||
<line x1="4.22" y1="19.78" x2="5.64" y2="18.36"></line>
|
||
<line x1="18.36" y1="5.64" x2="19.78" y2="4.22"></line>
|
||
</svg>
|
||
</symbol>
|
||
<symbol id="svg-moon" viewBox="0 0 24 24">
|
||
<title>Dark mode</title>
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
|
||
stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round" class="icon-tabler-moon">
|
||
<path stroke="none" d="M0 0h24v24H0z" fill="none" />
|
||
<path d="M12 3c.132 0 .263 0 .393 0a7.5 7.5 0 0 0 7.92 12.446a9 9 0 1 1 -8.313 -12.454z" />
|
||
</svg>
|
||
</symbol>
|
||
<symbol id="svg-sun-half" viewBox="0 0 24 24">
|
||
<title>Auto light/dark mode</title>
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
|
||
stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round" class="icon-tabler-shadow">
|
||
<path stroke="none" d="M0 0h24v24H0z" fill="none"/>
|
||
<circle cx="12" cy="12" r="9" />
|
||
<path d="M13 12h5" />
|
||
<path d="M13 15h4" />
|
||
<path d="M13 18h1" />
|
||
<path d="M13 9h4" />
|
||
<path d="M13 6h1" />
|
||
</svg>
|
||
</symbol>
|
||
</svg>
|
||
|
||
<input type="checkbox" class="sidebar-toggle" name="__navigation" id="__navigation">
|
||
<input type="checkbox" class="sidebar-toggle" name="__toc" id="__toc">
|
||
<label class="overlay sidebar-overlay" for="__navigation">
|
||
<div class="visually-hidden">Hide navigation sidebar</div>
|
||
</label>
|
||
<label class="overlay toc-overlay" for="__toc">
|
||
<div class="visually-hidden">Hide table of contents sidebar</div>
|
||
</label>
|
||
|
||
<div class="page">
|
||
<!--<header class="mobile-header">
|
||
<div class="header-left">
|
||
<label class="nav-overlay-icon" for="__navigation">
|
||
<div class="visually-hidden">Toggle site navigation sidebar</div>
|
||
<i class="icon"><svg><use href="#svg-menu"></use></svg></i>
|
||
</label>
|
||
</div>
|
||
<div class="header-center">
|
||
<a href="../../"><div class="brand">Gymnasium Documentation</div></a>
|
||
</div>
|
||
<div class="header-right">
|
||
<div class="theme-toggle-container theme-toggle-header">
|
||
<button class="theme-toggle">
|
||
<div class="visually-hidden">Toggle Light / Dark / Auto color theme</div>
|
||
<svg class="theme-icon-when-auto"><use href="#svg-sun-half"></use></svg>
|
||
<svg class="theme-icon-when-dark"><use href="#svg-moon"></use></svg>
|
||
<svg class="theme-icon-when-light"><use href="#svg-sun"></use></svg>
|
||
</button>
|
||
</div>
|
||
<label class="toc-overlay-icon toc-header-icon" for="__toc">
|
||
<div class="visually-hidden">Toggle table of contents sidebar</div>
|
||
<i class="icon"><svg><use href="#svg-toc"></use></svg></i>
|
||
</label>
|
||
</div>
|
||
</header>-->
|
||
<aside class="sidebar-drawer">
|
||
<div class="sidebar-container">
|
||
|
||
<div class="sidebar-sticky"><a class="farama-sidebar__title" href="../../">
|
||
<img class="farama-header__logo only-light" src="../../_static/img/gymnasium_black.svg" alt="Light Logo"/>
|
||
<img class="farama-header__logo only-dark" src="../../_static/img/gymnasium_white.svg" alt="Dark Logo"/>
|
||
<span class="farama-header__title">Gymnasium Documentation</span>
|
||
</a><form class="sidebar-search-container" method="get" action="../../search/" role="search">
|
||
<input class="sidebar-search" placeholder="Search" name="q" aria-label="Search">
|
||
<input type="hidden" name="check_keywords" value="yes">
|
||
<input type="hidden" name="area" value="default">
|
||
</form>
|
||
<div id="searchbox"></div><div class="sidebar-scroll"><div class="sidebar-tree">
|
||
<p class="caption" role="heading"><span class="caption-text">Introduction</span></p>
|
||
<ul class="current">
|
||
<li class="toctree-l1"><a class="reference internal" href="../basic_usage/">Basic Usage</a></li>
|
||
<li class="toctree-l1 current current-page"><a class="current reference internal" href="#">Training an Agent</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../create_custom_env/">Create a Custom Environment</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../record_agent/">Recording Agents</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../speed_up_env/">Speeding Up Training</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../gym_compatibility/">Compatibility with Gym</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../migration_guide/">Migration Guide - v0.21 to v1.0.0</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">API</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference internal" href="../../api/env/">Env</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../../api/registry/">Make and register</a></li>
|
||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../api/spaces/">Spaces</a><input class="toctree-checkbox" id="toctree-checkbox-1" name="toctree-checkbox-1" role="switch" type="checkbox"/><label for="toctree-checkbox-1"><div class="visually-hidden">Toggle navigation of Spaces</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../api/spaces/fundamental/">Fundamental Spaces</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../api/spaces/composite/">Composite Spaces</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../api/spaces/utils/">Spaces Utils</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../api/wrappers/">Wrappers</a><input class="toctree-checkbox" id="toctree-checkbox-2" name="toctree-checkbox-2" role="switch" type="checkbox"/><label for="toctree-checkbox-2"><div class="visually-hidden">Toggle navigation of Wrappers</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../api/wrappers/table/">List of Wrappers</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../api/wrappers/misc_wrappers/">Misc Wrappers</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../api/wrappers/action_wrappers/">Action Wrappers</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../api/wrappers/observation_wrappers/">Observation Wrappers</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../api/wrappers/reward_wrappers/">Reward Wrappers</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../api/vector/">Vectorize</a><input class="toctree-checkbox" id="toctree-checkbox-3" name="toctree-checkbox-3" role="switch" type="checkbox"/><label for="toctree-checkbox-3"><div class="visually-hidden">Toggle navigation of Vectorize</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../api/vector/wrappers/">Wrappers</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../api/vector/async_vector_env/">AsyncVectorEnv</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../api/vector/sync_vector_env/">SyncVectorEnv</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../api/vector/utils/">Utility functions</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../../api/utils/">Utility functions</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../../api/functional/">Functional Env</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">Environments</span></p>
|
||
<ul>
|
||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../environments/classic_control/">Classic Control</a><input class="toctree-checkbox" id="toctree-checkbox-4" name="toctree-checkbox-4" role="switch" type="checkbox"/><label for="toctree-checkbox-4"><div class="visually-hidden">Toggle navigation of Classic Control</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../environments/classic_control/acrobot/">Acrobot</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../environments/classic_control/cart_pole/">Cart Pole</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../environments/classic_control/mountain_car_continuous/">Mountain Car Continuous</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../environments/classic_control/mountain_car/">Mountain Car</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../environments/classic_control/pendulum/">Pendulum</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../environments/box2d/">Box2D</a><input class="toctree-checkbox" id="toctree-checkbox-5" name="toctree-checkbox-5" role="switch" type="checkbox"/><label for="toctree-checkbox-5"><div class="visually-hidden">Toggle navigation of Box2D</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../environments/box2d/bipedal_walker/">Bipedal Walker</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../environments/box2d/car_racing/">Car Racing</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../environments/box2d/lunar_lander/">Lunar Lander</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../environments/toy_text/">Toy Text</a><input class="toctree-checkbox" id="toctree-checkbox-6" name="toctree-checkbox-6" role="switch" type="checkbox"/><label for="toctree-checkbox-6"><div class="visually-hidden">Toggle navigation of Toy Text</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../environments/toy_text/blackjack/">Blackjack</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../environments/toy_text/taxi/">Taxi</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../environments/toy_text/cliff_walking/">Cliff Walking</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../environments/toy_text/frozen_lake/">Frozen Lake</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../environments/mujoco/">MuJoCo</a><input class="toctree-checkbox" id="toctree-checkbox-7" name="toctree-checkbox-7" role="switch" type="checkbox"/><label for="toctree-checkbox-7"><div class="visually-hidden">Toggle navigation of MuJoCo</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../environments/mujoco/ant/">Ant</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../environments/mujoco/half_cheetah/">Half Cheetah</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../environments/mujoco/hopper/">Hopper</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../environments/mujoco/humanoid/">Humanoid</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../environments/mujoco/humanoid_standup/">Humanoid Standup</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../environments/mujoco/inverted_double_pendulum/">Inverted Double Pendulum</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../environments/mujoco/inverted_pendulum/">Inverted Pendulum</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../environments/mujoco/pusher/">Pusher</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../environments/mujoco/reacher/">Reacher</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../environments/mujoco/swimmer/">Swimmer</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../environments/mujoco/walker2d/">Walker2D</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../../environments/atari/">Atari</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../../environments/third_party_environments/">External Environments</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">Tutorials</span></p>
|
||
<ul>
|
||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../tutorials/gymnasium_basics/">Gymnasium Basics Documentation Links</a><input class="toctree-checkbox" id="toctree-checkbox-8" name="toctree-checkbox-8" role="switch" type="checkbox"/><label for="toctree-checkbox-8"><div class="visually-hidden">Toggle navigation of Gymnasium Basics Documentation Links</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../tutorials/gymnasium_basics/load_quadruped_model/">Load custom quadruped robot environments</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../tutorials/gymnasium_basics/handling_time_limits/">Handling Time Limits</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../tutorials/gymnasium_basics/implementing_custom_wrappers/">Implementing Custom Wrappers</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../tutorials/gymnasium_basics/environment_creation/">Make your own custom environment</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../tutorials/gymnasium_basics/vector_envs_tutorial/">Training A2C with Vector Envs and Domain Randomization</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../tutorials/training_agents/">Training Agents links in the Gymnasium Documentation</a><input class="toctree-checkbox" id="toctree-checkbox-9" name="toctree-checkbox-9" role="switch" type="checkbox"/><label for="toctree-checkbox-9"><div class="visually-hidden">Toggle navigation of Training Agents links in the Gymnasium Documentation</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../tutorials/training_agents/reinforce_invpend_gym_v26/">Training using REINFORCE for Mujoco</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../tutorials/training_agents/blackjack_tutorial/">Solving Blackjack with Q-Learning</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../tutorials/training_agents/FrozenLake_tuto/">Frozenlake benchmark</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../../tutorials/third-party-tutorials/">Third-Party Tutorials</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">Development</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference external" href="https://github.com/Farama-Foundation/Gymnasium">Github</a></li>
|
||
<li class="toctree-l1"><a class="reference external" href="https://arxiv.org/abs/2407.17032">Paper</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../../gymnasium_release_notes/">Gymnasium Release Notes</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../../gym_release_notes/">Gym Release Notes</a></li>
|
||
<li class="toctree-l1"><a class="reference external" href="https://github.com/Farama-Foundation/Gymnasium/blob/main/docs/README.md">Contribute to the Docs</a></li>
|
||
</ul>
|
||
|
||
</div>
|
||
</div>
|
||
|
||
</div>
|
||
|
||
</div>
|
||
</aside>
|
||
<div class="main-container">
|
||
|
||
|
||
|
||
|
||
|
||
<div class="main">
|
||
<div class="content">
|
||
<div class="article-container">
|
||
<a href="#" class="back-to-top muted-link">
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24">
|
||
<path d="M13 20h-2V8l-5.5 5.5-1.42-1.42L12 4.16l7.92 7.92-1.42 1.42L13 8v12z"></path>
|
||
</svg>
|
||
<span>Back to top</span>
|
||
</a>
|
||
<div class="content-icon-container"><div class="edit-this-page">
|
||
<a class="muted-link" href="https://github.com/Farama-Foundation/Gymnasium/edit/main/docs/introduction/train_agent.md" title="Edit this page">
|
||
<svg aria-hidden="true" viewBox="0 0 24 24" stroke-width="1.5" stroke="currentColor" fill="none" stroke-linecap="round" stroke-linejoin="round">
|
||
<path stroke="none" d="M0 0h24v24H0z" fill="none"/>
|
||
<path d="M4 20h4l10.5 -10.5a1.5 1.5 0 0 0 -4 -4l-10.5 10.5v4" />
|
||
<line x1="13.5" y1="6.5" x2="17.5" y2="10.5" />
|
||
</svg>
|
||
<span class="visually-hidden">Edit this page</span>
|
||
</a>
|
||
</div><div class="theme-toggle-container theme-toggle-content">
|
||
<button class="theme-toggle" title="Toggle color theme">
|
||
<div class="visually-hidden">Toggle Light / Dark / Auto color theme</div>
|
||
<svg class="theme-icon-when-auto">
|
||
<use href="#svg-sun-half"></use>
|
||
</svg>
|
||
<svg class="theme-icon-when-dark">
|
||
<use href="#svg-moon"></use>
|
||
</svg>
|
||
<svg class="theme-icon-when-light">
|
||
<use href="#svg-sun"></use>
|
||
</svg>
|
||
</button>
|
||
</div>
|
||
<label class="toc-overlay-icon toc-content-icon" for="__toc">
|
||
<div class="visually-hidden">Toggle table of contents sidebar</div>
|
||
<i class="icon"><svg>
|
||
<use href="#svg-toc"></use>
|
||
</svg></i>
|
||
</label>
|
||
</div>
|
||
<article role="main">
|
||
|
||
<section class="tex2jax_ignore mathjax_ignore" id="training-an-agent">
|
||
<h1>Training an Agent<a class="headerlink" href="#training-an-agent" title="Link to this heading">¶</a></h1>
|
||
<p>This page provides a short outline of how to train an agent for a Gymnasium environment. In particular, we will use tabular Q-learning to solve the Blackjack v1 environment. For a complete version of this tutorial and more training tutorials for other environments and algorithms, see <a class="reference internal" href="../../tutorials/training_agents/"><span class="xref myst">the training agents tutorials</span></a>. Please read <a class="reference internal" href="../basic_usage/"><span class="doc std std-doc">basic usage</span></a> before reading this page. Before we implement any code, here is an overview of Blackjack and Q-learning.</p>
|
||
<p>Blackjack is one of the most popular casino card games that is also infamous for being beatable under certain conditions. This version of the game uses an infinite deck (we draw the cards with replacement), so counting cards won’t be a viable strategy in our simulated game. The observation is a tuple of the player’s current sum, the value of the dealer’s face-up card and a boolean value on whether the player holds a usable ace. The agent can pick between two actions: stand (0) such that the player takes no more cards and hit (1) such that the player will take another card. To win, your card sum should be greater than the dealer’s without exceeding 21. The game ends if the player selects stand or if the card sum is greater than 21. Full documentation can be found at <a class="reference external" href="https://gymnasium.farama.org/environments/toy_text/blackjack">https://gymnasium.farama.org/environments/toy_text/blackjack</a>.</p>
|
||
<p>Q-learning is a model-free, off-policy learning algorithm (Watkins, 1989) for environments with discrete action spaces. It is famous for being the first reinforcement learning algorithm proven to converge to an optimal policy under certain conditions.</p>
|
||
<section id="executing-an-action">
|
||
<h2>Executing an action<a class="headerlink" href="#executing-an-action" title="Link to this heading">¶</a></h2>
|
||
<p>After receiving our first observation, we are only going to use the <code class="docutils literal notranslate"><span class="pre">env.step(action)</span></code> function to interact with the environment. This function takes an action as input and executes it in the environment. Because that action changes the state of the environment, it returns five useful variables to us. These are:</p>
|
||
<ul class="simple">
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">next</span> <span class="pre">observation</span></code>: This is the observation that the agent will receive after taking the action.</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">reward</span></code>: This is the reward that the agent will receive after taking the action.</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">terminated</span></code>: This is a boolean variable that indicates whether or not the environment has terminated, i.e., ended due to an internal condition.</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">truncated</span></code>: This is a boolean variable that also indicates whether the episode ended by early truncation, i.e., a time limit is reached.</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">info</span></code>: This is a dictionary that might contain additional information about the environment.</p></li>
|
||
</ul>
|
||
<p>The <code class="docutils literal notranslate"><span class="pre">next</span> <span class="pre">observation</span></code>, <code class="docutils literal notranslate"><span class="pre">reward</span></code>, <code class="docutils literal notranslate"><span class="pre">terminated</span></code> and <code class="docutils literal notranslate"><span class="pre">truncated</span></code> variables are self-explanatory, but the <code class="docutils literal notranslate"><span class="pre">info</span></code> variable requires some additional explanation. This variable contains a dictionary that might have some extra information about the environment, but in the Blackjack-v1 environment you can ignore it. For example, in Atari environments the info dictionary has an <code class="docutils literal notranslate"><span class="pre">ale.lives</span></code> key that tells us how many lives the agent has left. If the agent has 0 lives, then the episode is over.</p>
|
||
<p>Note that it is not a good idea to call <code class="docutils literal notranslate"><span class="pre">env.render()</span></code> in your training loop because rendering slows down training by a lot. Rather try to build an extra loop to evaluate and showcase the agent after training.</p>
|
||
</section>
|
||
<section id="building-an-agent">
|
||
<h2>Building an agent<a class="headerlink" href="#building-an-agent" title="Link to this heading">¶</a></h2>
|
||
<p>Let’s build a Q-learning agent to solve Blackjack! We’ll need some functions for picking an action and updating the agent’s action values. To ensure that the agent explores the environment, one possible solution is the epsilon-greedy strategy, where we pick a random action with probability <code class="docutils literal notranslate"><span class="pre">epsilon</span></code> and the greedy action (currently valued as the best) with probability <code class="docutils literal notranslate"><span class="pre">1</span> <span class="pre">-</span> <span class="pre">epsilon</span></code>.</p>
|
||
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">collections</span><span class="w"> </span><span class="kn">import</span> <span class="n">defaultdict</span>
|
||
<span class="kn">import</span><span class="w"> </span><span class="nn">gymnasium</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">gym</span>
|
||
<span class="kn">import</span><span class="w"> </span><span class="nn">numpy</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">np</span>
|
||
|
||
|
||
<span class="k">class</span><span class="w"> </span><span class="nc">BlackjackAgent</span><span class="p">:</span>
|
||
<span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span>
|
||
<span class="bp">self</span><span class="p">,</span>
|
||
<span class="n">env</span><span class="p">:</span> <span class="n">gym</span><span class="o">.</span><span class="n">Env</span><span class="p">,</span>
|
||
<span class="n">learning_rate</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span>
|
||
<span class="n">initial_epsilon</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span>
|
||
<span class="n">epsilon_decay</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span>
|
||
<span class="n">final_epsilon</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span>
|
||
<span class="n">discount_factor</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.95</span><span class="p">,</span>
|
||
<span class="p">):</span>
|
||
<span class="w"> </span><span class="sd">"""Initialize a Reinforcement Learning agent with an empty dictionary</span>
|
||
<span class="sd"> of state-action values (q_values), a learning rate and an epsilon.</span>
|
||
|
||
<span class="sd"> Args:</span>
|
||
<span class="sd"> env: The training environment</span>
|
||
<span class="sd"> learning_rate: The learning rate</span>
|
||
<span class="sd"> initial_epsilon: The initial epsilon value</span>
|
||
<span class="sd"> epsilon_decay: The decay for epsilon</span>
|
||
<span class="sd"> final_epsilon: The final epsilon value</span>
|
||
<span class="sd"> discount_factor: The discount factor for computing the Q-value</span>
|
||
<span class="sd"> """</span>
|
||
<span class="bp">self</span><span class="o">.</span><span class="n">env</span> <span class="o">=</span> <span class="n">env</span>
|
||
<span class="bp">self</span><span class="o">.</span><span class="n">q_values</span> <span class="o">=</span> <span class="n">defaultdict</span><span class="p">(</span><span class="k">lambda</span><span class="p">:</span> <span class="n">np</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="n">env</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">n</span><span class="p">))</span>
|
||
|
||
<span class="bp">self</span><span class="o">.</span><span class="n">lr</span> <span class="o">=</span> <span class="n">learning_rate</span>
|
||
<span class="bp">self</span><span class="o">.</span><span class="n">discount_factor</span> <span class="o">=</span> <span class="n">discount_factor</span>
|
||
|
||
<span class="bp">self</span><span class="o">.</span><span class="n">epsilon</span> <span class="o">=</span> <span class="n">initial_epsilon</span>
|
||
<span class="bp">self</span><span class="o">.</span><span class="n">epsilon_decay</span> <span class="o">=</span> <span class="n">epsilon_decay</span>
|
||
<span class="bp">self</span><span class="o">.</span><span class="n">final_epsilon</span> <span class="o">=</span> <span class="n">final_epsilon</span>
|
||
|
||
<span class="bp">self</span><span class="o">.</span><span class="n">training_error</span> <span class="o">=</span> <span class="p">[]</span>
|
||
|
||
<span class="k">def</span><span class="w"> </span><span class="nf">get_action</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">obs</span><span class="p">:</span> <span class="nb">tuple</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">int</span><span class="p">,</span> <span class="nb">bool</span><span class="p">])</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span>
|
||
<span class="w"> </span><span class="sd">"""</span>
|
||
<span class="sd"> Returns the best action with probability (1 - epsilon)</span>
|
||
<span class="sd"> otherwise a random action with probability epsilon to ensure exploration.</span>
|
||
<span class="sd"> """</span>
|
||
<span class="c1"># with probability epsilon return a random action to explore the environment</span>
|
||
<span class="k">if</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">random</span><span class="p">()</span> <span class="o"><</span> <span class="bp">self</span><span class="o">.</span><span class="n">epsilon</span><span class="p">:</span>
|
||
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">env</span><span class="o">.</span><span class="n">action_space</span><span class="o">.</span><span class="n">sample</span><span class="p">()</span>
|
||
<span class="c1"># with probability (1 - epsilon) act greedily (exploit)</span>
|
||
<span class="k">else</span><span class="p">:</span>
|
||
<span class="k">return</span> <span class="nb">int</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">argmax</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">q_values</span><span class="p">[</span><span class="n">obs</span><span class="p">]))</span>
|
||
|
||
<span class="k">def</span><span class="w"> </span><span class="nf">update</span><span class="p">(</span>
|
||
<span class="bp">self</span><span class="p">,</span>
|
||
<span class="n">obs</span><span class="p">:</span> <span class="nb">tuple</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">int</span><span class="p">,</span> <span class="nb">bool</span><span class="p">],</span>
|
||
<span class="n">action</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span>
|
||
<span class="n">reward</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span>
|
||
<span class="n">terminated</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span>
|
||
<span class="n">next_obs</span><span class="p">:</span> <span class="nb">tuple</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">int</span><span class="p">,</span> <span class="nb">bool</span><span class="p">],</span>
|
||
<span class="p">):</span>
|
||
<span class="w"> </span><span class="sd">"""Updates the Q-value of an action."""</span>
|
||
<span class="n">future_q_value</span> <span class="o">=</span> <span class="p">(</span><span class="ow">not</span> <span class="n">terminated</span><span class="p">)</span> <span class="o">*</span> <span class="n">np</span><span class="o">.</span><span class="n">max</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">q_values</span><span class="p">[</span><span class="n">next_obs</span><span class="p">])</span>
|
||
<span class="n">temporal_difference</span> <span class="o">=</span> <span class="p">(</span>
|
||
<span class="n">reward</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">discount_factor</span> <span class="o">*</span> <span class="n">future_q_value</span> <span class="o">-</span> <span class="bp">self</span><span class="o">.</span><span class="n">q_values</span><span class="p">[</span><span class="n">obs</span><span class="p">][</span><span class="n">action</span><span class="p">]</span>
|
||
<span class="p">)</span>
|
||
|
||
<span class="bp">self</span><span class="o">.</span><span class="n">q_values</span><span class="p">[</span><span class="n">obs</span><span class="p">][</span><span class="n">action</span><span class="p">]</span> <span class="o">=</span> <span class="p">(</span>
|
||
<span class="bp">self</span><span class="o">.</span><span class="n">q_values</span><span class="p">[</span><span class="n">obs</span><span class="p">][</span><span class="n">action</span><span class="p">]</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">lr</span> <span class="o">*</span> <span class="n">temporal_difference</span>
|
||
<span class="p">)</span>
|
||
<span class="bp">self</span><span class="o">.</span><span class="n">training_error</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">temporal_difference</span><span class="p">)</span>
|
||
|
||
<span class="k">def</span><span class="w"> </span><span class="nf">decay_epsilon</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
|
||
<span class="bp">self</span><span class="o">.</span><span class="n">epsilon</span> <span class="o">=</span> <span class="nb">max</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">final_epsilon</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">epsilon</span> <span class="o">-</span> <span class="bp">self</span><span class="o">.</span><span class="n">epsilon_decay</span><span class="p">)</span>
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
<section id="training-the-agent">
|
||
<h2>Training the agent<a class="headerlink" href="#training-the-agent" title="Link to this heading">¶</a></h2>
|
||
<p>To train the agent, we will let the agent play one episode (one complete game is called an episode) at a time and update its Q-values after each action taken during the episode. The agent will have to experience a lot of episodes to explore the environment sufficiently.</p>
|
||
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># hyperparameters</span>
|
||
<span class="n">learning_rate</span> <span class="o">=</span> <span class="mf">0.01</span>
|
||
<span class="n">n_episodes</span> <span class="o">=</span> <span class="mi">100_000</span>
|
||
<span class="n">start_epsilon</span> <span class="o">=</span> <span class="mf">1.0</span>
|
||
<span class="n">epsilon_decay</span> <span class="o">=</span> <span class="n">start_epsilon</span> <span class="o">/</span> <span class="p">(</span><span class="n">n_episodes</span> <span class="o">/</span> <span class="mi">2</span><span class="p">)</span> <span class="c1"># reduce the exploration over time</span>
|
||
<span class="n">final_epsilon</span> <span class="o">=</span> <span class="mf">0.1</span>
|
||
|
||
<span class="n">env</span> <span class="o">=</span> <span class="n">gym</span><span class="o">.</span><span class="n">make</span><span class="p">(</span><span class="s2">"Blackjack-v1"</span><span class="p">,</span> <span class="n">sab</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
|
||
<span class="n">env</span> <span class="o">=</span> <span class="n">gym</span><span class="o">.</span><span class="n">wrappers</span><span class="o">.</span><span class="n">RecordEpisodeStatistics</span><span class="p">(</span><span class="n">env</span><span class="p">,</span> <span class="n">buffer_length</span><span class="o">=</span><span class="n">n_episodes</span><span class="p">)</span>
|
||
|
||
<span class="n">agent</span> <span class="o">=</span> <span class="n">BlackjackAgent</span><span class="p">(</span>
|
||
<span class="n">env</span><span class="o">=</span><span class="n">env</span><span class="p">,</span>
|
||
<span class="n">learning_rate</span><span class="o">=</span><span class="n">learning_rate</span><span class="p">,</span>
|
||
<span class="n">initial_epsilon</span><span class="o">=</span><span class="n">start_epsilon</span><span class="p">,</span>
|
||
<span class="n">epsilon_decay</span><span class="o">=</span><span class="n">epsilon_decay</span><span class="p">,</span>
|
||
<span class="n">final_epsilon</span><span class="o">=</span><span class="n">final_epsilon</span><span class="p">,</span>
|
||
<span class="p">)</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>Info: The current hyperparameters are set to quickly train a decent agent. If you want to converge to the optimal policy, try increasing <code class="docutils literal notranslate"><span class="pre">n_episodes</span></code> by 10x and lowering the <code class="docutils literal notranslate"><span class="pre">learning_rate</span></code> (e.g. to 0.001).</p>
|
||
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">tqdm</span><span class="w"> </span><span class="kn">import</span> <span class="n">tqdm</span>
|
||
|
||
<span class="k">for</span> <span class="n">episode</span> <span class="ow">in</span> <span class="n">tqdm</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="n">n_episodes</span><span class="p">)):</span>
|
||
<span class="n">obs</span><span class="p">,</span> <span class="n">info</span> <span class="o">=</span> <span class="n">env</span><span class="o">.</span><span class="n">reset</span><span class="p">()</span>
|
||
<span class="n">done</span> <span class="o">=</span> <span class="kc">False</span>
|
||
|
||
<span class="c1"># play one episode</span>
|
||
<span class="k">while</span> <span class="ow">not</span> <span class="n">done</span><span class="p">:</span>
|
||
<span class="n">action</span> <span class="o">=</span> <span class="n">agent</span><span class="o">.</span><span class="n">get_action</span><span class="p">(</span><span class="n">obs</span><span class="p">)</span>
|
||
<span class="n">next_obs</span><span class="p">,</span> <span class="n">reward</span><span class="p">,</span> <span class="n">terminated</span><span class="p">,</span> <span class="n">truncated</span><span class="p">,</span> <span class="n">info</span> <span class="o">=</span> <span class="n">env</span><span class="o">.</span><span class="n">step</span><span class="p">(</span><span class="n">action</span><span class="p">)</span>
|
||
|
||
<span class="c1"># update the agent</span>
|
||
<span class="n">agent</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="n">obs</span><span class="p">,</span> <span class="n">action</span><span class="p">,</span> <span class="n">reward</span><span class="p">,</span> <span class="n">terminated</span><span class="p">,</span> <span class="n">next_obs</span><span class="p">)</span>
|
||
|
||
<span class="c1"># update if the environment is done and the current obs</span>
|
||
<span class="n">done</span> <span class="o">=</span> <span class="n">terminated</span> <span class="ow">or</span> <span class="n">truncated</span>
|
||
<span class="n">obs</span> <span class="o">=</span> <span class="n">next_obs</span>
|
||
|
||
<span class="n">agent</span><span class="o">.</span><span class="n">decay_epsilon</span><span class="p">()</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>You can use <code class="docutils literal notranslate"><span class="pre">matplotlib</span></code> to visualize the training reward and length.</p>
|
||
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">matplotlib</span><span class="w"> </span><span class="kn">import</span> <span class="n">pyplot</span> <span class="k">as</span> <span class="n">plt</span>
|
||
|
||
<span class="k">def</span><span class="w"> </span><span class="nf">get_moving_avgs</span><span class="p">(</span><span class="n">arr</span><span class="p">,</span> <span class="n">window</span><span class="p">,</span> <span class="n">convolution_mode</span><span class="p">):</span>
|
||
<span class="k">return</span> <span class="n">np</span><span class="o">.</span><span class="n">convolve</span><span class="p">(</span>
|
||
<span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">arr</span><span class="p">)</span><span class="o">.</span><span class="n">flatten</span><span class="p">(),</span>
|
||
<span class="n">np</span><span class="o">.</span><span class="n">ones</span><span class="p">(</span><span class="n">window</span><span class="p">),</span>
|
||
<span class="n">mode</span><span class="o">=</span><span class="n">convolution_mode</span>
|
||
<span class="p">)</span> <span class="o">/</span> <span class="n">window</span>
|
||
|
||
<span class="c1"># Smooth over a 500 episode window</span>
|
||
<span class="n">rolling_length</span> <span class="o">=</span> <span class="mi">500</span>
|
||
<span class="n">fig</span><span class="p">,</span> <span class="n">axs</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">subplots</span><span class="p">(</span><span class="n">ncols</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">figsize</span><span class="o">=</span><span class="p">(</span><span class="mi">12</span><span class="p">,</span> <span class="mi">5</span><span class="p">))</span>
|
||
|
||
<span class="n">axs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">set_title</span><span class="p">(</span><span class="s2">"Episode rewards"</span><span class="p">)</span>
|
||
<span class="n">reward_moving_average</span> <span class="o">=</span> <span class="n">get_moving_avgs</span><span class="p">(</span>
|
||
<span class="n">env</span><span class="o">.</span><span class="n">return_queue</span><span class="p">,</span>
|
||
<span class="n">rolling_length</span><span class="p">,</span>
|
||
<span class="s2">"valid"</span>
|
||
<span class="p">)</span>
|
||
<span class="n">axs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">reward_moving_average</span><span class="p">)),</span> <span class="n">reward_moving_average</span><span class="p">)</span>
|
||
|
||
<span class="n">axs</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">set_title</span><span class="p">(</span><span class="s2">"Episode lengths"</span><span class="p">)</span>
|
||
<span class="n">length_moving_average</span> <span class="o">=</span> <span class="n">get_moving_avgs</span><span class="p">(</span>
|
||
<span class="n">env</span><span class="o">.</span><span class="n">length_queue</span><span class="p">,</span>
|
||
<span class="n">rolling_length</span><span class="p">,</span>
|
||
<span class="s2">"valid"</span>
|
||
<span class="p">)</span>
|
||
<span class="n">axs</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">length_moving_average</span><span class="p">)),</span> <span class="n">length_moving_average</span><span class="p">)</span>
|
||
|
||
<span class="n">axs</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span><span class="o">.</span><span class="n">set_title</span><span class="p">(</span><span class="s2">"Training Error"</span><span class="p">)</span>
|
||
<span class="n">training_error_moving_average</span> <span class="o">=</span> <span class="n">get_moving_avgs</span><span class="p">(</span>
|
||
<span class="n">agent</span><span class="o">.</span><span class="n">training_error</span><span class="p">,</span>
|
||
<span class="n">rolling_length</span><span class="p">,</span>
|
||
<span class="s2">"same"</span>
|
||
<span class="p">)</span>
|
||
<span class="n">axs</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">training_error_moving_average</span><span class="p">)),</span> <span class="n">training_error_moving_average</span><span class="p">)</span>
|
||
<span class="n">plt</span><span class="o">.</span><span class="n">tight_layout</span><span class="p">()</span>
|
||
<span class="n">plt</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
|
||
|
||
|
||
</pre></div>
|
||
</div>
|
||
<p><img alt="Moving averages of the episode rewards, episode lengths and training error recorded during training" src="../../_images/blackjack_training_plots.png" /></p>
|
||
<p>Hopefully this tutorial helped you get a grip on how to interact with Gymnasium environments and set you on a journey to solve many more RL challenges.</p>
|
||
<p>It is recommended that you solve this environment by yourself (project-based learning is really effective!). You can apply your favorite discrete RL algorithm or give Monte Carlo ES a try (covered in <a class="reference external" href="http://incompleteideas.net/book/the-book-2nd.html">Sutton &amp; Barto</a>, section 5.3) - this way you can compare your results directly to the book.</p>
|
||
<p>Best of luck!</p>
|
||
</section>
|
||
</section>
|
||
|
||
</article>
|
||
</div>
|
||
<footer>
|
||
|
||
<div class="related-pages">
|
||
<a class="next-page" href="../create_custom_env/">
|
||
<div class="page-info">
|
||
<div class="context">
|
||
<span>Next</span>
|
||
</div>
|
||
<div class="title">Create a Custom Environment</div>
|
||
</div>
|
||
<svg class="furo-related-icon">
|
||
<use href="#svg-arrow-right"></use>
|
||
</svg>
|
||
</a>
|
||
<a class="prev-page" href="../basic_usage/">
|
||
<svg class="furo-related-icon">
|
||
<use href="#svg-arrow-right"></use>
|
||
</svg>
|
||
<div class="page-info">
|
||
<div class="context">
|
||
<span>Previous</span>
|
||
</div>
|
||
|
||
<div class="title">Basic Usage</div>
|
||
|
||
</div>
|
||
</a>
|
||
</div>
|
||
<div class="bottom-of-page">
|
||
<div class="left-details">
|
||
<div class="copyright">
|
||
Copyright © 2025 Farama Foundation
|
||
</div>
|
||
<!--
|
||
Made with <a href="https://www.sphinx-doc.org/">Sphinx</a> and <a class="muted-link" href="https://pradyunsg.me">@pradyunsg</a>'s
|
||
|
||
<a href="https://github.com/pradyunsg/furo">Furo</a>
|
||
-->
|
||
</div>
|
||
<div class="right-details">
|
||
<div class="icons">
|
||
<a class="muted-link" href="https://github.com/Farama-Foundation/Gymnasium/"
|
||
aria-label="On GitHub">
|
||
<svg stroke="currentColor" fill="currentColor" stroke-width="0" viewBox="0 0 16 16">
|
||
<path fill-rule="evenodd"
|
||
d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0 0 16 8c0-4.42-3.58-8-8-8z">
|
||
</path>
|
||
</svg>
|
||
</a>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
</footer>
|
||
</div>
|
||
<aside class="toc-drawer">
|
||
|
||
|
||
<div class="toc-sticky toc-scroll">
|
||
<div class="toc-title-container">
|
||
<span class="toc-title">
|
||
On this page
|
||
</span>
|
||
</div>
|
||
<div class="toc-tree-container">
|
||
<div class="toc-tree">
|
||
<ul>
|
||
<li><a class="reference internal" href="#">Training an Agent</a><ul>
|
||
<li><a class="reference internal" href="#executing-an-action">Executing an action</a></li>
|
||
<li><a class="reference internal" href="#building-an-agent">Building an agent</a></li>
|
||
<li><a class="reference internal" href="#training-the-agent">Training the agent</a></li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
|
||
</aside>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
<script>
|
||
/**
 * Opens or closes the Farama header menu.
 *
 * Checks whether `.farama-header-menu` currently carries the `active` class,
 * writes the `aria-expanded` / `aria-hidden` values for the state the menu is
 * about to enter, and finally flips the class.
 */
const toggleMenu = () => {
  const menuBtn = document.querySelector(".farama-header-menu__btn");
  const menuContainer = document.querySelector(".farama-header-menu-container");
  const menu = document.querySelector(".farama-header-menu");

  // The class still describes the state we are leaving.
  const wasOpen = menu.classList.contains("active");
  menuBtn.setAttribute("aria-expanded", wasOpen ? "false" : "true");
  menuContainer.setAttribute("aria-hidden", wasOpen ? "true" : "false");

  menu.classList.toggle("active");
};

// The header button and the element with id "farama-close-menu" share the
// same handler.
document.querySelector(".farama-header-menu__btn").addEventListener("click", toggleMenu);
document.getElementById("farama-close-menu").addEventListener("click", toggleMenu);
|
||
</script>
|
||
|
||
|
||
<script async src="https://www.googletagmanager.com/gtag/js?id=G-6H9C8TWXZ8"></script>
|
||
<script>
|
||
/**
 * Queues the initial Google Analytics commands.
 *
 * Creates `window.dataLayer` if it does not exist yet, then pushes a `js`
 * command carrying the current date and a `config` command for the
 * measurement id.
 */
const enableGtag = () => {
  if (!window.dataLayer) {
    window.dataLayer = [];
  }

  // Each command is queued as the raw `arguments` object, exactly as before.
  function gtag() {
    dataLayer.push(arguments);
  }

  gtag('js', new Date());
  gtag('config', 'G-6H9C8TWXZ8');
};
|
||
/**
 * Cookie-consent banner.
 *
 * Reads the "acceptedCookieAlert" entry from localStorage:
 *   - no stored value: builds the banner with "Deny" / "Allow" buttons;
 *   - stored value "true": enables analytics right away;
 *   - any other stored value: does nothing.
 */
(() => {
  const storedChoice = localStorage.getItem("acceptedCookieAlert");

  if (storedChoice) {
    if (storedChoice === "true") {
      enableGtag();
    }
    return;
  }

  const boxElem = document.createElement("div");
  boxElem.classList.add("cookie-alert");

  const containerElem = document.createElement("div");
  containerElem.classList.add("cookie-alert__container");

  const textElem = document.createElement("p");
  textElem.innerHTML = `This page uses <a href="https://analytics.google.com/">
        Google Analytics</a> to collect statistics.`;
  containerElem.appendChild(textElem);

  // Builds one banner button and attaches its click handler.
  const makeButton = (label, id, onClick) => {
    const button = document.createElement("button");
    button.innerText = label;
    button.className = "farama-btn cookie-alert__button";
    button.id = id;
    button.addEventListener("click", onClick);
    return button;
  };

  // Denying stores the choice and dismisses the banner.
  const declineBtn = makeButton("Deny", "cookie-alert__decline", () => {
    localStorage.setItem("acceptedCookieAlert", false);
    boxElem.remove();
  });

  // Allowing stores the choice, dismisses the banner and enables analytics.
  const acceptBtn = makeButton("Allow", "cookie-alert__accept", () => {
    localStorage.setItem("acceptedCookieAlert", true);
    boxElem.remove();
    enableGtag();
  });

  containerElem.appendChild(declineBtn);
  containerElem.appendChild(acceptBtn);
  boxElem.appendChild(containerElem);
  document.body.appendChild(boxElem);
})();
|
||
</script>
|
||
|
||
<script src="../../_static/documentation_options.js?v=25d39d6f"></script>
|
||
<script src="../../_static/doctools.js?v=9a2dae69"></script>
|
||
<script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
|
||
<script src="../../_static/scripts/furo.js?v=7660844c"></script>
|
||
|
||
<script>
|
||
|
||
/**
 * Builds a `<ul class="farama-header-menu-list">` with one linked entry per
 * project.
 *
 * @param projects      Iterable of objects read for `link`, `name` and
 *                      (optionally) `image`.
 * @param displayImages When truthy, each link gets a logo `<img>` placed
 *                      before its text.
 * @returns The populated `<ul>` element.
 */
const createProjectsList = (projects, displayImages) => {
  const listElem = document.createElement('ul');
  listElem.className = 'farama-header-menu-list';

  for (const project of projects) {
    const itemElem = document.createElement("li");
    const linkElem = document.createElement("a");
    linkElem.href = project.link;
    itemElem.appendChild(linkElem);

    if (displayImages) {
      // Projects without their own image fall back to the Farama logo.
      const logoPath = project.image ? project.image : "/farama_black.svg";
      const logoElem = document.createElement("img");
      logoElem.src = imagesBasepath + logoPath;
      logoElem.alt = `${project.name} logo`;
      logoElem.className = "farama-black-logo-invert";
      linkElem.appendChild(logoElem);
    }

    linkElem.appendChild(document.createTextNode(project.name));
    listElem.appendChild(itemElem);
  }

  return listElem;
};
|
||
|
||
// Create menu with Farama projects by using the API at farama.org/api/projects.json
|
||
/**
 * Creates and opens a cross-origin request object.
 *
 * @param method HTTP method passed to `open`.
 * @param url    Target URL passed to `open`.
 * @returns An opened XMLHttpRequest (asynchronous, `responseType` "json") when
 *          the object exposes `withCredentials`; otherwise an opened
 *          XDomainRequest when that constructor exists; otherwise `null`.
 */
const createCORSRequest = (method, url) => {
  const request = new XMLHttpRequest();
  request.responseType = 'json';

  if ("withCredentials" in request) {
    request.open(method, url, true);
    return request;
  }

  if (typeof XDomainRequest === "undefined") {
    // CORS not supported.
    return null;
  }

  // IE8 & IE9
  const legacyRequest = new XDomainRequest();
  legacyRequest.open(method, url);
  return legacyRequest;
};
|
||
|
||
const url = 'https://farama.org/api/projects.json';
const imagesBasepath = "https://farama.org/assets/images";
const method = 'GET';
let xhr = createCORSRequest(method, url);

// Loads the Farama project list and renders it into the header menu.
// createCORSRequest returns null when no cross-origin request object is
// available, so the handlers are only attached when a request exists.
if (xhr === null) {
  console.error("Unable to load projects");
} else {
  xhr.onload = () => {
    const jsonResponse = xhr.response;

    // Without an object body there is nothing to iterate over.
    if (jsonResponse === null || typeof jsonResponse !== "object") {
      console.error("Unable to load projects");
      return;
    }

    const sections = {
      "Core Projects": [],
      "Mature Projects": {
        "Documentation": [],
        "Repositories": [],
      },
      "Incubating Projects": {
        "Documentation": [],
        "Repositories": [],
      },
      "Foundation": [
        {
          name: "About",
          link: "https://farama.org/about"
        },
        {
          name: "Standards",
          link: "https://farama.org/project_standards",
        },
        {
          name: "Donate",
          link: "https://farama.org/donations"
        }
      ]
    };

    // Categorize projects: "core" entries go into a flat list; every other
    // entry is filed under "Mature" or "Incubating" and then under
    // "Documentation" (has a website) or "Repositories" (GitHub link only).
    Object.keys(jsonResponse).forEach(key => {
      // Declared locally; the previous undeclared assignment created a global.
      const projectJson = jsonResponse[key];
      const hasWebsite = projectJson.website !== null;
      projectJson.link = hasWebsite ? projectJson.website : projectJson.github;

      if (projectJson.type === "core") {
        sections["Core Projects"].push(projectJson);
        return;
      }

      const group = projectJson.type === "mature" ? "Mature Projects" : "Incubating Projects";
      const subGroup = hasWebsite ? "Documentation" : "Repositories";
      sections[group][subGroup].push(projectJson);
    });

    const menuContainer = document.querySelector(".farama-header-menu__body");
    if (menuContainer === null) {
      // No menu body on this page, so there is nowhere to render.
      return;
    }

    Object.keys(sections).forEach(key => {
      const sectionElem = Object.assign(
        document.createElement('div'), {
          className: 'farama-header-menu__section',
        }
      );
      sectionElem.appendChild(Object.assign(document.createElement('span'),
        {
          className: 'farama-header-menu__section-title',
          innerText: key
        }
      ));

      if (!Array.isArray(sections[key])) {
        // Nested section: one titled column per sub-section.
        const subSections = sections[key];
        const subSectionContainerElem = Object.assign(
          document.createElement('div'), {
            className: 'farama-header-menu__subsections-container',
            style: 'display: flex'
          }
        );
        Object.keys(subSections).forEach(subKey => {
          const subSectionElem = Object.assign(
            document.createElement('div'), {
              className: 'farama-header-menu__subsection',
            }
          );
          subSectionElem.appendChild(Object.assign(document.createElement('span'),
            {
              className: 'farama-header-menu__subsection-title',
              innerText: subKey
            }
          ));
          const ulElem = createProjectsList(subSections[subKey], key !== 'Foundation');
          subSectionElem.appendChild(ulElem);
          subSectionContainerElem.appendChild(subSectionElem);
        });
        sectionElem.appendChild(subSectionContainerElem);
      } else {
        // Flat section: a single list of projects.
        const projects = sections[key];
        const ulElem = createProjectsList(projects, true);
        sectionElem.appendChild(ulElem);
      }

      menuContainer.appendChild(sectionElem);
    });
  };

  xhr.onerror = function() {
    console.error("Unable to load projects");
  };

  xhr.send();
}
|
||
</script>
|
||
|
||
|
||
<script>
|
||
const versioningConfig = {
  githubUser: 'Farama-Foundation',
  githubRepo: 'Gymnasium',
};

// Fetches the versioning menu fragment and appends it to <body>.
// Any non-200 response, network failure or body read error ends in a console
// warning instead of an unhandled promise rejection.
fetch('/main/_static/versioning/versioning_menu.html')
  .then(response => {
    if (response.status !== 200) {
      console.warn("Unable to load versioning menu", response);
      return null;
    }
    return response.text();
  })
  .then(text => {
    if (text === null) {
      return;
    }

    const container = document.createElement("div");
    container.innerHTML = text;
    document.querySelector("body").appendChild(container);

    // innerHTML doesn't evaluate scripts, so each one is replaced by a freshly
    // created <script> element carrying the same attributes and source text.
    Array.from(container.querySelectorAll("script")).forEach(oldScript => {
      const newScript = document.createElement("script");
      Array.from(oldScript.attributes).forEach(attr => newScript.setAttribute(attr.name, attr.value));
      newScript.appendChild(document.createTextNode(oldScript.innerHTML));
      oldScript.parentNode.replaceChild(newScript, oldScript);
    });
  })
  .catch(error => {
    console.warn("Unable to load versioning menu", error);
  });
|
||
</script>
|
||
|
||
</body>
|
||
</html> |